In [None]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import spacy
import scattertext as st
import pickle
from wordcloud import WordCloud

In [None]:
# Load the headlines CSV: `publish_date` is parsed into datetimes while reading,
# then `headline_category` is converted to the pandas `category` dtype.
data = (
    pd.read_csv("ireland-news-headlines.csv", parse_dates=["publish_date"])
    .astype({"headline_category": "category"})
)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Display the raw `headline_category` column.
data.headline_category

In [None]:
# Split the dot-separated `headline_category` label into one column per level
# ("main_category", "1st sub-category", "2nd sub-category", ...).

# Rows without a category cannot be split, so drop them first.
data = data.dropna(subset=['headline_category'])

# Deepest nesting present in the data (number of dots in the longest label).
# `str.count` interprets its pattern as a regular expression, so the dot is
# escaped; the raw string avoids the invalid "\." escape-sequence warning.
max_dots = int(data['headline_category'].str.count(r'\.').fillna(0).max())


def _ordinal(n):
    """Return ``n`` with its English ordinal suffix (1 -> '1st', 2 -> '2nd', 11 -> '11th')."""
    suffix = 'th' if 10 <= n % 100 <= 20 else {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f'{n}{suffix}'


# One column name per level, derived from the depth actually found so the
# rename below cannot fail with a length mismatch. For a maximum of three dots
# this yields exactly: main_category, 1st/2nd/3rd sub-category.
new_cols = ['main_category'] + [
    f'{_ordinal(level)} sub-category' for level in range(1, max_dots + 1)
]

# Expand each label into max_dots + 1 columns; shallower labels are padded
# with missing values on the right.
split_data = data['headline_category'].str.split(r'\.', expand=True)
split_data.columns = new_cols

# Replace the original combined column with the per-level columns
# (non-mutating drop instead of `inplace=True`).
data = pd.concat([data.drop(columns='headline_category'), split_data], axis=1)




In [None]:
# Convert every split level column to the pandas `category` dtype.
data = data.astype({level: "category" for level in new_cols})

In [None]:
# Print the frame summary again to inspect the current columns and dtypes.
data.info()

In [None]:
# One interactive histogram per column, skipping the free-text headline and
# the publication date.
hist_columns = data.columns.difference(['headline_text', 'publish_date'])
for column_name in hist_columns:
    px.histogram(data[column_name], x=column_name).show()

In [None]:
# Count articles per main category in each calendar month of `publish_date`.
data_resampled = (
    data.resample('M', on='publish_date')['main_category']
    .value_counts()
    .reset_index(name='Count')
)

# One line per main category over time.
fig = px.line(
    data_resampled,
    x="publish_date",
    y="Count",
    color='main_category',
    title='Article Count by Main Category over Time',
)
fig.show()


In [None]:
# Count articles per 1st sub-category in each calendar month of `publish_date`.
data_resampled = (
    data.resample('M', on='publish_date')['1st sub-category']
    .value_counts()
    .reset_index(name='Count')
)

# One line per 1st sub-category over time.
fig = px.line(
    data_resampled,
    x="publish_date",
    y="Count",
    color='1st sub-category',
    title='Article Count by 1st sub-category over Time',
)
fig.show()

In [None]:
# Show ten random rows. The seed keeps the preview identical across
# "Restart & Run All"; it matches the seed used for the per-category sample.
data.sample(10, random_state=1)

In [None]:

# Take a 1% random sample of each main category. Every group is sampled with
# its own `random_state=1`, so the selection is reproducible.
data_sample = (
    data.groupby('main_category')
    .apply(lambda group: group.sample(frac=0.01, random_state=1))
    .reset_index(drop=True)
)

# The code below was used to create the scattertext corpus from the 1% sample.
# To make things easier the corpus has been saved to disk, so the code is kept
# for reference only. It is commented out with `#` instead of being wrapped in
# triple-quoted strings: a bare string is an evaluated expression, and the last
# one in a cell is echoed as the cell's output.
#
# # Load the English model for spaCy
# nlp = spacy.load('en_core_web_sm')
#
# # Create a scattertext Corpus using the sampled DataFrame
# corpus_sample = st.CorpusFromPandas(data_sample,
#                                     category_col='main_category',
#                                     text_col='headline_text',
#                                     nlp=nlp).build()

# Code for loading the saved corpus if needed.
# NOTE: `pickle.load` can execute arbitrary code; only load a pickle file that
# you created yourself.
#
# with open('sample_corpus.pkl', 'rb') as f:
#     corpus_sample = pickle.load(f)

In [None]:
# Code for creating the scattertext HTML visualizations. The results have been
# saved on disk to make the notebook run faster, so the code is kept for
# reference only. It is commented out with `#` instead of being wrapped in a
# triple-quoted string, which Jupyter would evaluate and echo as the cell output.
#
# # Iterate over each unique main_category value
# for category in data_sample['main_category'].unique():
#     # Generate the scattertext visualization for the current category
#     html = st.produce_scattertext_explorer(corpus_sample,
#                                            category=category,
#                                            category_name=category,
#                                            not_category_name='Other Categories',
#                                            minimum_term_frequency=5,
#                                            pmi_threshold_coefficient=5,
#                                            width_in_pixels=1000,
#                                            metadata=data_sample['publish_date'])
#
#     # Save the visualization to an HTML file
#     with open(f'Scattertext_Visualization_{category}.html', 'w') as f:
#         f.write(html)
#
# # Save the sample corpus to a file
# with open('sample_corpus.pkl', 'wb') as f:
#     pickle.dump(corpus_sample, f)

In [None]:
# Word cloud built from every headline in the dataset.
# Missing headlines are dropped first: `str()` of a missing value would
# otherwise inject the literal token "nan" into the text.
text = " ".join(str(headline) for headline in data.headline_text.dropna())
wordcloud = WordCloud(background_color="white").generate(text)

# Same presentation as the per-category clouds: titled, no axes, and an
# explicit show() so the cell does not end on the AxesImage repr.
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud for All Headlines')
plt.axis('off')
plt.show()


In [None]:
# One word cloud per main category.
categories = data['main_category'].unique()

for category in categories:
    # Headlines of this category only. Missing values are dropped so that
    # `str()` does not turn them into the literal token "nan".
    headlines = data.loc[data['main_category'] == category, 'headline_text'].dropna()
    if headlines.empty:
        # No text left for this category, so there is nothing to draw.
        continue

    text = " ".join(str(headline) for headline in headlines)
    wordcloud = WordCloud(background_color="white").generate(text)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {category} Category')
    plt.axis('off')
    plt.show()
