# Headline processing

This notebook cleans the headlines and produces a CSV containing simplified tokens.

## Imports

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from pandas.core.common import flatten
import matplotlib.pyplot as plt
import seaborn as sns

## Setup

In [None]:
# Adjust how wide text columns may be when DataFrames are displayed.
# NOTE(review): presumably 0 is meant to stop long headlines being truncated in the
# cell output — confirm this value against the pandas version in use.
pd.set_option("max_colwidth", 0)

## Data sourcing

In [None]:
# Load the articles table from the local data directory
articles = pd.read_csv("./data/articles.csv")

# Preview the first rows (rendered as the cell output)
articles.head()

## Most frequent words

### Processing

In [None]:
# Keep only the headline text and its source, as an independent copy so the
# processing below never touches `articles`

title_df = articles.loc[:, ["title", "source"]].copy()

In [None]:
# Lower-case the headlines into a new "keywords" column.
# Missing titles are replaced with an empty string first: otherwise they remain NaN
# after `.str.lower()` and the tokenising step that follows fails on a non-string value.
title_df["keywords"] = title_df["title"].fillna("").str.lower()

In [None]:
# Break every lower-cased headline into a list of word tokens

title_df["keywords"] = title_df["keywords"].map(word_tokenize)

In [None]:

# Lemmatiser instance shared by the cells below

lemma = WordNetLemmatizer()

# Lookup from the first letter of a pos_tag tag to the part-of-speech code handed to
# the lemmatiser: J -> "a" (adjectives), V -> "v" (verbs), R -> "r" (adverbs).
# Any other first letter falls back to "n" (nouns).

tag_map = defaultdict(lambda: "n", J="a", V="v", R="r")

# Helper that part-of-speech tags a list of tokens and rewrites each tag as the
# single-letter code held in tag_map, ready to be passed to the lemmatiser
def get_wordnet_tags(tokens):
    """Return (token, WordNet pos code) pairs for a list of tokens."""

    wordnet_pairs = []

    # pos_tag yields (token, tag) pairs; only the first letter of the tag is looked up
    for word, tag in pos_tag(tokens):
        wordnet_pairs.append((word, tag_map[tag[0]]))

    return wordnet_pairs

In [None]:
# Attach a WordNet part-of-speech code to every token, giving (token, pos) pairs

title_df["keywords"] = title_df["keywords"].map(get_wordnet_tags)

# Replace each (token, pos) pair with the token's lemma for that part of speech

title_df["keywords"] = title_df["keywords"].map(
    lambda pairs: [lemma.lemmatize(word=word, pos=pos) for word, pos in pairs]
)

In [None]:
# Stop words for the token filter defined below: NLTK's English list plus the
# project-specific "n't" fragment (presumably what the tokeniser leaves behind for
# contractions such as "don't" — confirm)

stops = [*stopwords.words("english"), "n't"]

def filter_tokens(tokens, stop_words=None, min_length=3):
    """Drop stop words, punctuation-only tokens and very short tokens.

    Tokens are returned unchanged, but the checks are made against the token with
    its apostrophes removed. Apostrophes are stripped from the surviving tokens in
    a later step, so testing only the raw form would let fragments such as "'re"
    or "'ll" through and leave "re" / "ll" in the final keywords.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single headline.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stops`` list.
    min_length : int, optional
        Minimum number of characters (ignoring apostrophes) a token needs in
        order to be kept. The default of 3 matches the original ``len(t) > 2``.

    Returns
    -------
    list of str
        The tokens that passed every check, in their original order.
    """

    if stop_words is None:
        stop_words = stops

    # Hash-based lookup instead of scanning the stop-word list once per token
    blocked = set(stop_words)

    kept = []
    for token in tokens:
        bare = token.replace("'", "")

        # Stop word, either as written ("n't") or once apostrophes are gone ("'re" -> "re")
        if token in blocked or bare in blocked:
            continue

        # Too short to be a meaningful keyword
        if len(bare) < min_length:
            continue

        # Pure punctuation such as "..." carries no letters or digits
        if not any(ch.isalnum() for ch in bare):
            continue

        kept.append(token)

    return kept

# Run the token filter over every headline's token list
title_df["keywords"] = title_df["keywords"].map(filter_tokens)

In [None]:
# Strip apostrophes from inside the remaining tokens

title_df["keywords"] = title_df["keywords"].map(
    lambda words: [word.replace("'", "") for word in words]
)

In [None]:
# Collapse each token list into one space-separated string

title_df["keywords"] = title_df["keywords"].apply(" ".join)

In [None]:
# Spot-check three randomly chosen rows of the processed frame (rendered as the cell output)
title_df.sample(3)

## Data export

In [None]:
# Write the processed frame (title, source and keywords columns) to CSV,
# leaving out the DataFrame index
title_df.to_csv("./data/processed_headlines.csv", index=False)