In [1]:
# library
import numpy as np
import pandas as pd
import plotly.express as px

import json

# Site names to process; each one is used as a sub-directory of data/ that
# holds that site's articles.json and urls_cleaned.csv.
site_list = ["bbc", "cnn", "foxnews", "nationalreview"]

In [2]:
def open_json(file_path):
    """Load an articles JSON file into a DataFrame of selected article fields.

    The file must contain a single JSON object that maps each URL to a dict of
    article details. Entries whose value is ``null`` are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file. It is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per non-null entry with the columns ``title``,
        ``date_publish``, ``description``, ``maintext``, ``authors`` and
        ``wayback_id``, in that order. A field that no entry provides is
        returned as an all-missing column instead of raising ``KeyError``,
        so an empty file (``{}``) yields an empty frame with these columns.
    """
    # Explicit encoding: do not depend on the platform's default codec.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = [
        {"url": url, **details}
        for url, details in data.items()
        if details is not None
    ]
    df = pd.DataFrame(records)

    wanted = ["title", "date_publish", "description", "maintext", "authors", "wayback_id"]
    # reindex returns a new frame (not a view of df) and tolerates absent columns.
    return df.reindex(columns=wanted)

In [14]:
def open_csv(file_path):
    """Load a header-less URL CSV and attach capture-date and week columns.

    The first three columns of the file are named ``id``, ``url`` and
    ``status``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It is read as UTF-8 with no header row.

    Returns
    -------
    pandas.DataFrame
        The CSV rows plus three derived columns:

        * ``wayback_time`` - the first 8 characters of ``id`` parsed as
          ``%Y%m%d``; values that do not parse become ``NaT``.
        * ``year_week`` - ``wayback_time`` formatted as ``%Y-%U``.
        * ``week_start_date`` - ``year_week`` with weekday ``0`` appended,
          parsed back with ``%Y-%U-%w`` (the Sunday that starts the week).
    """
    df = pd.read_csv(file_path, header=None, encoding='utf-8')

    # Add the capture date as a new column (label 3) next to the positional
    # labels 0..2; the rename below then gives all four columns their names.
    df[3] = pd.to_datetime((df[0]).astype(str).str[:8], format="%Y%m%d", errors='coerce')

    df.columns = ['id', 'url', 'status', 'wayback_time']

    df['year_week'] = df['wayback_time'].dt.strftime('%Y-%U')
    df['week_start_date'] = pd.to_datetime(df['year_week'] + '-0', format='%Y-%U-%w')

    # The weekly aggregations that used to be computed here were discarded
    # and never returned, so they were removed; graphDF does its own grouping.
    return df

In [3]:
def cleanDF(df_json):
    """Return a cleaned copy of the article frame, ordered by capture date.

    Steps, in order:

    1. Drop rows whose ``maintext`` is missing.
    2. Drop rows with a duplicate ``maintext`` (the first occurrence is kept).
    3. Add ``wayback_time``: the first 8 characters of ``wayback_id`` parsed
       as ``%Y%m%d``; values that do not parse become ``NaT``.
    4. Add ``text_len``: ``len()`` of each ``maintext``.
    5. Remove ``wayback_id`` and sort ascending by ``wayback_time``.

    Parameters
    ----------
    df_json : pandas.DataFrame
        Must contain the columns ``maintext`` and ``wayback_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_json`` itself is not modified.
    """
    deduped = (
        df_json
        .dropna(subset=["maintext"])
        .drop_duplicates(subset=["maintext"])
    )

    # assign() builds a new frame, so nothing is written into a filtered
    # slice (no chained-assignment warning) and no inplace mutation is needed.
    enriched = deduped.assign(
        wayback_time=pd.to_datetime(
            deduped["wayback_id"].astype(str).str[:8],
            format='%Y%m%d',
            errors='coerce',
        ),
        text_len=deduped["maintext"].map(len),
    )

    return (
        enriched
        .drop(columns=["wayback_id"])
        .sort_values(by="wayback_time", ascending=True)
    )

In [4]:
# filter the data based on title

In [18]:
def _weekly_counts(df, count_col, out_name):
    """Count non-null ``count_col`` values per week of ``df['wayback_time']``.

    Weeks follow ``%U`` numbering and are labelled by the date obtained from
    parsing ``'<year>-<week>-0'`` with ``%Y-%U-%w`` (the Sunday starting the
    week). Rows whose ``wayback_time`` is ``NaT`` fall out of the grouping.
    ``df`` is only read, never modified.

    Returns a frame with the columns ``week_start_date`` and ``out_name``.
    """
    year_week = df['wayback_time'].dt.strftime('%Y-%U')
    week_start = pd.to_datetime(year_week + '-0', format='%Y-%U-%w')
    return (
        df[count_col]
        .groupby(week_start.rename('week_start_date'))
        .count()
        .rename(out_name)
        .reset_index()
    )


def graphDF(df_json, df_csv, site):
    """Plot weekly article counts from the JSON and CSV sources for one site.

    Parameters
    ----------
    df_json : pandas.DataFrame
        Needs a datetime ``wayback_time`` column and a ``title`` column;
        non-null titles are counted per week as ``json_count``.
    df_csv : pandas.DataFrame
        Needs a datetime ``wayback_time`` column and a ``url`` column;
        non-null urls are counted per week as ``csv_count``.
    site : str
        Inserted into the figure title.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``. Neither input frame is
        modified (no helper columns are written back to the caller's data).
    """
    df_json_weekly = _weekly_counts(df_json, 'title', 'json_count')
    df_csv_weekly = _weekly_counts(df_csv, 'url', 'csv_count')

    # Outer merge keeps weeks present in only one source; the missing side
    # becomes 0. Sort explicitly so the line is drawn in chronological order
    # regardless of the row order the merge produces.
    df_merged = (
        pd.merge(df_json_weekly, df_csv_weekly, on='week_start_date', how='outer')
        .fillna(0)
        .sort_values('week_start_date')
        .reset_index(drop=True)
    )

    # Plotting
    fig = px.line(
        df_merged,
        x='week_start_date',
        y=['json_count', 'csv_count'],
        labels={'week_start_date': 'Week Starting', 'value': 'Number of Articles', 'variable': 'Source'},
        template='seaborn'
    )

    fig.update_layout(
        xaxis_title='Week Starting',
        yaxis_title='Number of Articles',
        title=f'Number of Articles per Week for {site}',
        title_x=0.5,
        legend_title='Source',
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
    )

    fig.show()

# Example usage
# graphDF(df_json, df_csv, 'Example Site')


In [19]:
# For every configured site: load and clean its articles, load its URL list,
# then draw the weekly comparison chart.
for site in site_list:
    file_path_json = f"data/{site}/articles.json"
    file_path_csv = f"data/{site}/urls_cleaned.csv"

    df_json = cleanDF(open_json(file_path_json))
    df_csv = open_csv(file_path_csv)

    graphDF(df_json, df_csv, site)
    print("done")

done


done


done


done
