# Headline processing

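This notebook cleans the article headlines (lower-casing, tokenising, lemmatising, and filtering out stop words, punctuation and very short words) and produces a CSV containing each title, its source, and the simplified tokens.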

## Imports

In [87]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns

## Setup

In [88]:
# Don't truncate text in columns.
# None is the documented "no limit" value for this option; the fully-qualified
# option name avoids relying on pattern matching against other option names.

pd.set_option("display.max_colwidth", None)

## Data sourcing

In [89]:
# Load the articles CSV from the local data folder
articles = pd.read_csv(filepath_or_buffer="./data/articles.csv")

# Preview the first five rows
articles.head(n=5)

Unnamed: 0,title,description,link,source
0,Top GCSE marks tumble by 4.3% and passes return to pre-Covid levels - as students in England suffer sharpest drop while Wales and Northern Ireland keep inflated grades,Hundreds of thousands of teenagers across Britain received their GCSE results today in a year when efforts have been made in England to return grading to pre-pandemic levels.,https://www.dailymail.co.uk/news/article-12440091/Top-GCSE-marks-drop-203-000-fewer-7-9-grades.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
1,Fell short on GCSE results day? Here's what to do you don't get accepted by your preferred sixth form or college,"GCSE results have landed, making it a happy day for some but a day others may want to forget. So what do you do if you didn't get into the sixth form you wanted? Can you resit your GCSE exams?",https://www.dailymail.co.uk/news/gcse/article-12432147/URL-gcse-results-sixth-form-college-grades-not-accepted-alternatives.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
2,"Yevgeny Prigozhin was assassinated 'as a gift for Zelensky to celebrate Ukraine's victory day today', Putin's former spokesman claims","Sergei Markov, a stern supporter of the Russian president and formerly a close advisor, said it was 'absolutely clear that Prigozhin [was] killed by [the] Ukrainian intelligence service.'",https://www.dailymail.co.uk/news/article-12440163/Yevgeny-Prigozhin-assassinated-gift-Zelensky-celebrate-Ukraines-victory-day-today-Putins-former-spokesman-claims.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
3,Moscow court extends arrest of jailed WSJ reporter Evan...,"US citizen Evan Gershokovich, who was jailed in Russia on espionage charges which can carry up to 20 years in prison, has had his pre-trial detention extended to November 30.",https://www.dailymail.co.uk/news/article-12440087/Jailed-WSJ-reporter-Evan-Gershkovich-arrives-hearing-extending-detention.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail
4,"GCSE results day 2023 LIVE: Pass grades fall for second year running in England, Wales and Northern Ireland - with 68.2% marked at 4/C","Follow MailOnline's liveblog today as hundreds of thousands of pupils in England, Wales and Northern Ireland pick up their GCSE results.",https://www.dailymail.co.uk/news/live/article-12439999/gcse-results-day-live-2023-exams-students-college-sixth-form.html?ns_mchannel=rss&ito=1490&ns_campaign=1490,Daily Mail


## Processing

In [90]:
# Keep only the headline and source columns, as an independent copy

title_df = articles.loc[:, ["title", "source"]].copy()

In [91]:
# Seed the keywords column with a lower-cased version of each headline
title_df = title_df.assign(keywords=title_df["title"].str.lower())

In [92]:
# Break each lower-cased headline into a list of tokens

title_df["keywords"] = title_df["keywords"].map(word_tokenize)

In [93]:

# Single lemmatiser instance, reused for every token

lemma = WordNetLemmatizer()

# Look-up from the first letter of a pos_tag tag to the single-letter code passed to the
# lemmatiser. Any letter that is not listed falls back to "n" (noun).

tag_map = defaultdict(
    lambda: "n",
    {
        "J": "a",  # adjectives
        "V": "v",  # verbs
        "R": "r",  # adverbs
    },
)

# Create a function to get the pos tags for a set of tokens, and return the tokens in a way the
# lemmatizer can interpret
def get_wordnet_tags(tokens):
    """Pair each token with the tag_map code for its part-of-speech tag.

    The tokens are tagged with pos_tag; each tag is then reduced to its first
    character and looked up in tag_map. Returns a list of (token, code) tuples
    in the same order as the input.
    """
    wordnet_pairs = []

    for word, tag in pos_tag(tokens):
        # Only the leading letter of the tag is used for the look-up
        wordnet_pairs.append((word, tag_map[tag[0]]))

    return wordnet_pairs

In [94]:
# Tag every token with its part-of-speech code, then lemmatise each token using that code

title_df["keywords"] = (
    title_df["keywords"]
    .apply(get_wordnet_tags)
    .apply(lambda pairs: [lemma.lemmatize(word=word, pos=pos) for word, pos in pairs])
)

In [95]:
# Filter out punctuation, stop words, and very short words

stops = stopwords.words("english")

# Add specific stopwords

stops.extend(["n't"])

# Short tokens that are kept despite the length cut-off
important_short_words = ["pm", "us", "uk", "gb"]

def filter_tokens(tokens):
    """Return the tokens worth keeping as keywords, in their original order.

    A token is dropped when it:
      * is listed in ``stops``;
      * contains no letter or digit, i.e. it is pure punctuation such as "...",
        which the length check alone would let through;
      * is two characters or shorter, unless listed in ``important_short_words``.
    """
    # Build set views once per call: constant-time membership, while staying in
    # sync with any later edits to the module-level lists.
    stop_lookup = set(stops)
    short_keep = set(important_short_words)

    return [t for t in tokens
            if t not in stop_lookup
            and any(ch.isalnum() for ch in t)
            and (len(t) > 2 or t in short_keep)]

title_df["keywords"] = title_df["keywords"].apply(filter_tokens)

In [96]:
# Strip every apostrophe character out of each remaining token

title_df["keywords"] = title_df["keywords"].apply(
    lambda tokens: ["".join(token.split("'")) for token in tokens]
)

In [97]:
# Collapse each token list into a single space-separated string

title_df["keywords"] = title_df["keywords"].apply(" ".join)

In [98]:
# Spot-check three randomly chosen rows of the processed frame
title_df.sample(n=3)

Unnamed: 0,title,source,keywords
148,The Hundred 2023: Birmingham Phoenix all-rounder Benny Howell on how ADHD affects his game,BBC,hundred 2023 birmingham phoenix all-round benny howell adhd affect game
58,Kemi Badenoch hails the UK's 'thriving relationship' with India as talks progress for a major new trade deal,Daily Mail,kemi badenoch hail uk thriving relationship india talk progress major new trade deal
96,Inheritance tax penalties soar as more families fall foul of the rules,Daily Mail,inheritance tax penalty soar family fall foul rule


## Data export

In [100]:
# Write the processed headlines to disk, leaving out the row index
title_df.to_csv(path_or_buf="./data/processed_headlines.csv", index=False)