In [100]:
# library
import numpy as np
import pandas as pd
import plotly.express as px
import json

In [101]:
# political leanings

political_list = {
    "foxnews": "right",
    "washingtontimes": "lean_right",
    "newsweek": "center",
    "cnn": "lean_left",
    "dailybeast": "left",
    "foxnews_science": "right",
    "washingtontimes_science": "lean_right",
    "newsweek_science": "center",
    "theatlantic_science": "lean_left",
    "slate_science": "left"
}

## Functions

In [102]:
def open_json(file_path, site_name):
    df_json = pd.read_json(file_path, lines=True)
    df_json['site_name'] = site_name
        
    return df_json

In [103]:
def get_crime_science_other(df_json):
    df_crime = df_json[df_json["category"].str.lower() == "crime"]
    df_science = df_json[df_json["category"].str.lower() == "science"]
    df_other = df_json[(df_json["category"].str.lower() != "crime") & (df_json["category"].str.lower() != "science") &  (df_json["category"].str.lower() != "both")]
    return df_crime, df_science, df_other

### Graphs

In [104]:
class graphs:
    def __init__(self, dataframes, site_list):
        self.dataframes = dataframes
        self.site_list = site_list
        # self.title = title
        # self.kind = kind

    def timeline(self, title):
        
        # Prepare a list to collect data from all sites
        all_data = []

        # Process each site_name's DataFrame for monthly data
        for site_name, tendency in self.site_list.items():
            site_name = site_name[:-5] # remove .json from site_name

            df = self.dataframes[site_name]
            # Assuming 'wayback_time' is already a datetime column
            # df['year_week'] = df['wayback_time'].dt.strftime('%Y-%U')
            df['year_month'] = df['wayback_time'].dt.strftime('%Y-%m') # add another column for month
            # df['week_start_date'] = pd.to_datetime(df['year_week'] + '-0', format='%Y-%U-%w')
            df_monthly = df.groupby('year_month')['title'].count().reset_index()
            df_monthly['Source'] = site_name
            df_monthly['Tendency'] = tendency
            all_data.append(df_monthly)

        # Combine all data into a single DataFrame
        combined_df = pd.concat(all_data)

        # Plotting
        fig = px.line(
            combined_df,
            # x='week_start_date',
            x = 'year_month',
            y='title',
            color='Source',
            line_group='Source',
            labels={'week_start_date': 'Week Starting', 'title': 'Number of Articles', 'Tendency': 'Political Tendency'},
            title=title,
            template='seaborn'
        )

        fig.update_layout(
            xaxis_title='Year Time Line (unit: month)',
            yaxis_title='Number of Articles',
            title_x=0.5,
            legend_title='News Source',
            legend=dict(yanchor="top", y=0.99, xanchor="right", x=1.5),
            width=1000,
            # yaxis=dict(range=[0, 2000])
        )

        fig.show()
    
    def wing_ratio(self, title):

        all_data = []
        for site_name, _ in self.site_list.items():
            site_name = site_name[:-5] # remove .json from site_name

            df = self.dataframes[site_name]
            df_simple = df[['title']].copy()
            df_simple['Tendency'] = political_list[site_name]
            df_simple['Source'] = site_name
            df_simple['Label'] = site_name + " (" + political_list[site_name] + ")"  # Combine source and tendency into a label
            all_data.append(df_simple)

        # Combine all data into a single DataFrame
        combined_df = pd.concat(all_data)

        # Aggregate the data
        article_counts = combined_df.groupby(['Label', 'Tendency']).size().reset_index(name='Counts')

        # Plotting a pie chart
        fig = px.pie(
            article_counts,
            values='Counts',
            names='Label',  # Use the combined label for pie slice labels
            color='Tendency',  # Use Tendency for color distinction
            title=title,
            template='seaborn',
            # color_discrete_sequence=px.colors.qualitative.Bold,

            hole=0.3  # Optional: to create a donut-like pie chart
        )
        
        # Update layout for clarity
        fig.update_traces(textposition='inside', textinfo='percent+label+value')
        fig.update_layout(
            legend_title='Political Tendency',
            legend=dict(yanchor="bottom", y=0.01, xanchor="right", x=1.15),
            width=800
        )
        
        fig.show()

    def category(self, title, kind):

        if 'category' not in self.dataframes.columns:
            print('No category information available')
            return
        
        if kind == 'pie':
            category_count = self.dataframes['category'].value_counts().reset_index()
            category_count.columns = ['category', 'count']

            fig = px.pie(
                category_count,
                values='count',
                names='category',
                title=title,
                template='seaborn',
                hole=0.3
            )

            fig.update_traces(textposition='inside', textinfo='percent+label+value')

            fig.update_layout(
                legend_title='Political Tendency',
                legend=dict(yanchor="bottom", y=0.01, xanchor="right", x=1.13),
                width=800
            )

        if kind == 'line':
            self.dataframes['year_month'] = self.dataframes['wayback_time'].dt.strftime('%Y-%m') # add another column for month
            category_count = self.dataframes.groupby(['category', 'year_month']).size().reset_index(name='count')
            # category_count.columns = ['category', 'count']

            fig = px.line(
                category_count,
                x='year_month',
                y='count',
                color='category',
                title='Number of Articles by Category Overtime',
                template='seaborn'
            )

            fig.update_layout(
                xaxis_title='Category',
                yaxis_title='Number of Articles',
                title_x=0.5,
                width=800
            )


        fig.show()
    


## Main Codes


In [105]:
from getFileDir import findFileBFS
site_list = findFileBFS('data/categorized/', 'json')

for file_name, path in site_list.items():
    print(f"Path: {path}, File name: {file_name}")

Searching for json files in data/categorized/
data/categorized/theatlantic_science.json
data/categorized/cnn.json
data/categorized/washingtontimes_science.json
data/categorized/slate_science.json
data/categorized/foxnews.json
data/categorized/newsweek_science.json
data/categorized/newsweek.json
data/categorized/foxnews_science.json
data/categorized/washingtontimes.json
data/categorized/dailybeast.json
Path: data/categorized/theatlantic_science.json, File name: theatlantic_science.json
Path: data/categorized/cnn.json, File name: cnn.json
Path: data/categorized/washingtontimes_science.json, File name: washingtontimes_science.json
Path: data/categorized/slate_science.json, File name: slate_science.json
Path: data/categorized/foxnews.json, File name: foxnews.json
Path: data/categorized/newsweek_science.json, File name: newsweek_science.json
Path: data/categorized/newsweek.json, File name: newsweek.json
Path: data/categorized/foxnews_science.json, File name: foxnews_science.json
Path: data/

In [106]:
dataframes_us = {}
dataframes_science ={}

for site_name, file_path in site_list.items():
    site_name = site_name[:-5]
    if 'science' in site_name:
        print(f"Processing {site_name}")
        df = open_json(file_path, site_name)
        dataframes_science[site_name] = df
    else:
        df = open_json(file_path, site_name)
        dataframes_us[site_name] = df

Processing theatlantic_science
Processing washingtontimes_science
Processing slate_science
Processing newsweek_science
Processing foxnews_science


In [107]:
list_us = {}
list_science = {}
# list_crime = {} # equivalent to list_us

for site_name, path in site_list.items():
    if 'science' in site_name:
        list_science[site_name] = path
    else:
        list_us[site_name] = path

dataframes_crime = {}
for site_name, df in dataframes_us.items():
    df_crime, df_science, df_other = get_crime_science_other(df)
    dataframes_crime[site_name] = df_crime

In [108]:
graph_us = graphs(dataframes_us, list_us)
graph_science = graphs(dataframes_science, list_science)
graph_crime = graphs(dataframes_crime, list_us)

In [109]:
# graph timeline
graph_us.timeline("Number of Articles per Week for US News Sites")

In [110]:
graph_science.timeline("Number of Articles per Week for Science")

In [111]:
graph_crime.timeline("Number of Articles per Week for Crimes")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [112]:
graph_us.wing_ratio("Number of Articles by Political Tendency All US")

In [113]:
graph_science.wing_ratio("Number of Articles by Political Tendency for Science")

In [114]:
graph_crime.wing_ratio("Number of Articles by Political Tendency for Crimes")

In [115]:
# concatenate all dataframes
# df_all_us = pd.concat(dataframes_us.values())
df_all_science = pd.concat(dataframes_science.values())
df_all_crime = pd.concat(dataframes_crime.values())

# create graphs for all data
# graph_us_all = graphs(df_all_us, list_us)
graph_science_all = graphs(df_all_science, list_science)
graph_crime_all = graphs(df_all_crime, list_us)

print(f"For crime: {df_all_crime[["wayback_time", "text_len"]].describe()}")
print()
print(f"For science: {df_all_science.describe()}")
# print total count of articles
print(f"Total number of articles for crime and science: {len(df_all_crime) + len(df_all_science)}")

For crime:                         wayback_time      text_len
count                          33128  33128.000000
mean   2018-12-24 17:07:13.808258816   2895.828151
min              2013-01-01 00:00:00    136.000000
25%              2016-07-07 00:00:00   1158.000000
50%              2019-07-05 00:00:00   2350.000000
75%              2021-03-12 00:00:00   3822.000000
max              2024-01-01 00:00:00  61532.000000
std                              NaN   2415.586916

For science:                         wayback_time       text_len
count                         132537  132537.000000
mean   2018-04-30 22:08:48.522600960    4028.279499
min              2013-01-01 00:00:00      80.000000
25%              2016-01-16 00:00:00    2001.000000
50%              2018-03-19 00:00:00    3414.000000
75%              2020-06-06 00:00:00    5115.000000
max              2024-01-01 00:00:00   95836.000000
std                              NaN    3222.018938
Total number of articles for crime and science: 

In [118]:
graph_crime_all.category("Number of Articles for Crime", "line")

In [120]:
graph_science_all.category("Number of Articles for Science", "line")