In [2]:
# library
import numpy as np
import pandas as pd
import plotly.express as px
import json

# Maps a political-tendency label to the news site that represents it.
# Despite the name this is a dict (tendency -> site), not a list. The site
# value is used to build each data file name and as the key in `dataframes`.
site_list = {
    "right": "foxnews",
    "lean_right": "washingtontimes",
    "center": "newsweek",
    "lean_left": "cnn",
    "left": "dailybeast"
}

## Functions

In [3]:
def open_json(file_path, site_name):
    """Read a JSON Lines file into a DataFrame and tag every row with its site.

    Parameters
    ----------
    file_path : str or path-like
        Location of a file holding one JSON object per line.
    site_name : str
        Value written to the ``site_name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The parsed records with a ``site_name`` column added.
    """
    records = pd.read_json(file_path, lines=True)
    return records.assign(site_name=site_name)

In [14]:
def get_crime_science_other(df_json):
    """Split articles into crime, science, and other subsets by category.

    The ``category`` column is compared case-insensitively. Rows whose
    category is "both" are left out of all three subsets.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_crime, df_science, df_other)``.
    """
    # Lower-case once and reuse the result for every comparison.
    category = df_json["category"].str.lower()
    is_crime = category == "crime"
    is_science = category == "science"
    is_both = category == "both"

    return (
        df_json[is_crime],
        df_json[is_science],
        # "Other" is everything that is none of crime / science / both.
        df_json[~(is_crime | is_science | is_both)],
    )

### Graphs

In [15]:
def graph_timeline(dataframes, site_list):
    """Plot the number of articles per month, one line per news source.

    Parameters
    ----------
    dataframes : dict
        Maps site name -> DataFrame. Each frame must have a datetime
        ``wayback_time`` column and a ``title`` column.
    site_list : dict
        Maps political tendency -> site name.

    Notes
    -----
    Sites that are missing from ``dataframes`` (for example because their
    file failed to load) or that have no rows are skipped. If no site has
    data, a message is printed and nothing is plotted. The input frames are
    not modified.
    """
    monthly_frames = []

    for tendency, site in site_list.items():
        df = dataframes.get(site)
        if df is None or df.empty:
            continue

        # Derive the month key on a small copy so the caller's frame does not
        # gain a 'year_month' column as a side effect.
        df_monthly = (
            df[['title']]
            .assign(year_month=df['wayback_time'].dt.strftime('%Y-%m'))
            .groupby('year_month')['title']
            .count()
            .reset_index()
        )
        df_monthly['Source'] = site
        df_monthly['Tendency'] = tendency
        monthly_frames.append(df_monthly)

    if not monthly_frames:
        print('No article data available')
        return

    combined_df = pd.concat(monthly_frames, ignore_index=True)

    fig = px.line(
        combined_df,
        x='year_month',
        y='title',
        color='Source',
        line_group='Source',
        # Show each source's political tendency in the hover box.
        hover_data=['Tendency'],
        labels={'year_month': 'Month', 'title': 'Number of Articles', 'Tendency': 'Political Tendency'},
        title='Number of Articles per Month by News Source',
        template='seaborn'
    )

    fig.update_layout(
        xaxis_title='Year Time Line (unit: month)',
        yaxis_title='Number of Articles',
        title_x=0.5,
        legend_title='News Source',
        legend=dict(yanchor="top", y=0.99, xanchor="right", x=1.2),
        width=1000,
    )

    fig.show()

In [16]:
def graph_wing_ratio(dataframes, site_list):
    """Plot a donut chart of each news source's share of all articles.

    Parameters
    ----------
    dataframes : dict
        Maps site name -> DataFrame with one row per article.
    site_list : dict
        Maps political tendency -> site name.

    Notes
    -----
    Sites that are missing from ``dataframes`` or that have no rows are
    skipped. If no site has data, a message is printed and nothing is
    plotted.
    """
    # One record per source is enough; there is no need to copy and
    # concatenate every article row just to count them.
    records = []
    for tendency, site in site_list.items():
        df = dataframes.get(site)
        if df is None or df.empty:
            continue
        records.append({
            'Label': site + " (" + tendency + ")",  # source and tendency combined
            'Tendency': tendency,
            'Counts': len(df),
        })

    if not records:
        print('No article data available')
        return

    # Sort by label so row order (and thus colour assignment) is stable.
    article_counts = (
        pd.DataFrame(records)
        .sort_values(['Label', 'Tendency'])
        .reset_index(drop=True)
    )

    fig = px.pie(
        article_counts,
        values='Counts',
        names='Label',  # slice labels
        color='Tendency',  # colour is keyed on political tendency
        title='Number of Articles by Source and Political Tendency',
        template='seaborn',
        hole=0.3  # donut-style pie
    )

    fig.update_traces(textposition='inside', textinfo='percent+label+value')
    fig.update_layout(
        # Legend entries are the "source (tendency)" labels.
        legend_title='Source (Political Tendency)',
        legend=dict(yanchor="bottom", y=0.01, xanchor="right", x=1.15),
        width=800
    )

    fig.show()


In [17]:
def graph_category(df_all, kind = 'pie'):
    """Plot article counts per category, as a donut chart or a monthly line chart.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Articles with a ``category`` column. ``kind='line'`` additionally
        needs a datetime ``wayback_time`` column.
    kind : {'pie', 'line'}, default 'pie'
        ``'pie'`` shows the overall share of each category; ``'line'`` shows
        the number of articles per category per month.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'pie'`` nor ``'line'``.

    Notes
    -----
    If ``df_all`` has no ``category`` column a message is printed and nothing
    is plotted. ``df_all`` is not modified.
    """
    if 'category' not in df_all.columns:
        print('No category information available')
        return

    # Reject unknown kinds up front; otherwise `fig` would never be bound.
    if kind not in ('pie', 'line'):
        raise ValueError(f"Unsupported kind {kind!r}; expected 'pie' or 'line'")

    if kind == 'pie':
        category_count = df_all['category'].value_counts().reset_index()
        category_count.columns = ['category', 'count']

        fig = px.pie(
            category_count,
            values='count',
            names='category',
            title='Number of Articles by Category',
            template='seaborn',
            hole=0.3
        )

        fig.update_traces(textposition='inside', textinfo='percent+label+value')

        fig.update_layout(
            legend_title='Category',
            legend=dict(yanchor="bottom", y=0.01, xanchor="right", x=1.15),
            width=800
        )

    else:  # kind == 'line'
        # Build the month key on a small copy so the caller's frame does not
        # gain a 'year_month' column as a side effect.
        category_count = (
            df_all[['category']]
            .assign(year_month=df_all['wayback_time'].dt.strftime('%Y-%m'))
            .groupby(['category', 'year_month'])
            .size()
            .reset_index(name='count')
        )

        fig = px.line(
            category_count,
            x='year_month',
            y='count',
            color='category',
            title='Number of Articles by Category Over Time',
            template='seaborn'
        )

        fig.update_layout(
            xaxis_title='Month',
            yaxis_title='Number of Articles',
            title_x=0.5,
            width=800
        )

    fig.show()
    

## Main Code


In [18]:
# Load each site's categorized articles, keyed by site name. A site whose
# file cannot be read is reported and left out of `dataframes`.
dataframes = {}
for site_name in site_list.values():
    file_path = f"./data/articles_all/json/categorized/articles{site_name}_category.json"
    try:
        dataframes[site_name] = open_json(file_path, site_name)
    except (OSError, ValueError) as e:
        # OSError covers a missing/unreadable file; ValueError covers
        # malformed JSON. Anything else is unexpected and should surface.
        print(f"Skipping {site_name}: could not load {file_path} ({e})")

In [19]:
# Line chart: monthly article counts for each news source.
graph_timeline(dataframes, site_list)

In [20]:
# Donut chart: each source's share of the total article count.
graph_wing_ratio(dataframes, site_list)

In [21]:
# Stack every loaded site frame into one DataFrame with a fresh 0..n-1 index.
data_list = list(dataframes.values())
df_all = pd.concat(data_list, ignore_index=True)

# Optional split by category:
# df_crime, df_science, df_other = get_crime_science_other(df_all)

In [22]:
# Donut chart: overall share of articles per category.
graph_category(df_all, "pie")

In [23]:
# Line chart: article counts per category for each month.
graph_category(df_all, "line")