# Headline analysis

This notebook compares the most frequent significant words (keywords) in Daily Mail and BBC headlines.

## Imports

In [None]:
import pandas as pd
from pandas.core.common import flatten
import matplotlib.pyplot as plt
import seaborn as sns

## Setup

In [None]:
# Use the fully qualified option name: abbreviated names are resolved by
# pattern matching and can become ambiguous if pandas adds a similarly named
# option.
# NOTE(review): 0 looks intended to disable column truncation; pandas documents
# None as the "no limit" value -- confirm before changing.
pd.set_option("display.max_colwidth", 0)

## Data sourcing

In [None]:
# Load the processed headlines from disk, then preview the first rows.
headlines = pd.read_csv(filepath_or_buffer="./data/processed_headlines.csv")

headlines.head()

## Most frequent words

### Analysis

In [None]:
# Split the keywords strings into lists


def _split_keywords(value):
    """Return ``value`` as a list of keyword tokens.

    * ``str``  -> split on any run of whitespace, so repeated, leading or
      trailing spaces never produce empty-string "keywords".
    * ``list`` -> returned unchanged, which makes re-running this cell a no-op
      instead of turning every row into NaN.
    * anything else (e.g. NaN for a missing value) -> empty list.
    """
    if isinstance(value, str):
        return value.split()
    if isinstance(value, list):
        return value
    return []


headlines["keywords"] = headlines["keywords"].map(_split_keywords)

In [None]:
# Partition the headlines into one frame per news source

dm = headlines.loc[headlines["source"].eq("Daily Mail")]
bbc = headlines.loc[headlines["source"].eq("BBC")]

In [None]:
# Flatten keyword lists
#
# `Series.explode` is the public pandas API for turning a column of lists into
# one value per row. `.tolist()` materialises the result as a plain list, so it
# can be inspected or iterated any number of times (a generator would be
# silently exhausted after its first use) and carries no Series name into the
# later counting step.

all_dm_keywords = dm["keywords"].explode(ignore_index=True).tolist()
all_bbc_keywords = bbc["keywords"].explode(ignore_index=True).tolist()

In [None]:
# Wrap each flattened keyword collection in a Series so pandas can tally it

all_dm_keywords, all_bbc_keywords = map(
    pd.Series, (all_dm_keywords, all_bbc_keywords)
)

# Tally how often each keyword occurs, per source

dm_keyword_counts, bbc_keyword_counts = (
    keywords.value_counts() for keywords in (all_dm_keywords, all_bbc_keywords)
)

In [None]:
# Select the top ten of each

TOP_N = 10  # number of keywords shown per source


def _top_keywords(counts, n=TOP_N):
    """Return the first ``n`` rows of ``counts`` as a two-column DataFrame.

    ``counts`` is a keyword -> frequency Series as produced by
    ``value_counts()``. The column names ("index" and "count") are set
    explicitly so they do not depend on the pandas version or on whether the
    counted Series happened to carry a name.
    """
    return counts.head(n).rename_axis("index").reset_index(name="count")


dm_top_ten = _top_keywords(dm_keyword_counts)
bbc_top_ten = _top_keywords(bbc_keyword_counts)

# Then graph them

fig, ax = plt.subplots(1, 2, figsize=(20, 10))

# One panel per source. `data=` is passed by keyword because older seaborn
# releases interpret the first positional argument as `x`.
for axis, title, top_ten in zip(
    ax, ("Daily Mail", "BBC"), (dm_top_ten, bbc_top_ten)
):
    sns.barplot(data=top_ten, x="index", y="count", ax=axis)
    axis.set(title=title, xlabel="Keyword", ylabel="Occurrences")