In [20]:
# library
import numpy as np
import pandas as pd
import plotly.express as px

import json

site_list = ["bbc", "cnn", "foxnews", "nationalreview", "washingtontimes", "newsweek"]

In [21]:
def open_json(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
    # return data 

    # file_path = "data/" + site_list[0] + "/articles.json"

    # json_data = open_json(file_path)
    data = [{"url": url, **details} for url, details in data.items() if details is not None]
    df = pd.DataFrame(data)

    df_json = df[["title", "authors", "url", "date_publish", "description", "maintext", "wayback_id"]]
    # df_json.describe()
    return df_json

In [22]:
def open_csv(file_path):
    df = pd.read_csv(file_path, header=None, encoding='utf-8')

    # convert the first column to datetime
    df[3] = pd.to_datetime((df[0]).astype(str).str[:8], format="%Y%m%d", errors='coerce')

    df.columns = ['id','url','status','wayback_time']
    
    df['year_week'] = df['wayback_time'].dt.strftime('%Y-%U')
    df['week_start_date'] = pd.to_datetime(df['year_week'] + '-0', format='%Y-%U-%w')

    # sort the dataframe by date
    df_weekly = df.groupby('week_start_date')['url'].count().reset_index()
    df_year_week = df.groupby('year_week').count().reset_index()
    df_year_week.sort_values('year_week', inplace=True)

    
    return df

In [23]:
def cleanDF(df_json):
    # clean the data
    # drop the main text that is empty
    df_json = df_json.dropna(subset=["maintext"])

    # drop the duplicates
    df_json = df_json.drop_duplicates(subset=["maintext"])

    # add a new column based on the wayback_id
    df_json["wayback_time"] = pd.to_datetime((df_json["wayback_id"]).astype(str).str[:8], format='%Y%m%d', errors='coerce')

    # add a new column based on the length of the main text
    df_json["text_len"] = df_json["maintext"].apply(lambda x: len(x))

    df_json.drop(columns=["wayback_id"], inplace=True)

    # sort
    df_json = df_json.sort_values(by="wayback_time", ascending=True)

    # print(df_json.describe())
    # print(df_json.head())
    # print(df_json.head())
    return df_json

In [24]:
# filter the data back on title 

In [25]:
def graphDF(df_json, df_csv, site):
    # Preprocess df_json
    df_json['year_week'] = df_json['wayback_time'].dt.strftime('%Y-%U')
    df_json['week_start_date'] = pd.to_datetime(df_json['year_week'] + '-0', format='%Y-%U-%w')
    df_json_weekly = df_json.groupby('week_start_date')['title'].count().reset_index()
    df_json_weekly.rename(columns={'title': 'json_count'}, inplace=True)

    # Preprocess df_csv (assuming similar structure and column names as df_json)
    df_csv['year_week'] = df_csv['wayback_time'].dt.strftime('%Y-%U')
    df_csv['week_start_date'] = pd.to_datetime(df_csv['year_week'] + '-0', format='%Y-%U-%w')
    df_csv_weekly = df_csv.groupby('week_start_date')['url'].count().reset_index()
    df_csv_weekly.rename(columns={'url': 'csv_count'}, inplace=True)

    # Merge the two dataframes
    df_merged = pd.merge(df_json_weekly, df_csv_weekly, on='week_start_date', how='outer').fillna(0)

    # Plotting
    fig = px.line(
        df_merged,
        x='week_start_date',
        y=['json_count', 'csv_count'],
        labels={'week_start_date': 'Week Starting', 'value': 'Number of Articles', 'variable': 'Source'},
        template='seaborn'
    )

    fig.update_layout(
        xaxis_title='Week Starting',
        yaxis_title='Number of Articles',
        title=f'Number of Articles per Week for {site}',
        title_x=0.5,
        legend_title='Source',
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
    )

    fig.show()

# Example usage
# graphDF(df_json, df_csv, 'Example Site')


In [26]:
for site in site_list:
    file_path_json = "data/" + site + "/articles.json"
    df_json = open_json(file_path_json)
    df_json = cleanDF(df_json)

    file_path_csv = "data/" + site + "/urls_cleaned.csv"
    df_csv = open_csv(file_path_csv)
    
    graphDF(df_json, df_csv, site)
    print("done")

    # save the file
    # df_json.to_csv("data/articles_all/articles" + site +".csv", index=False)
    df_json.to_json("data/articles_all/articles" + site +".json", orient='records', lines=True)

done


done


done


done


done


done


In [27]:
site = "foxcrime"
file_path_json = "data/" + site + "/articles_new.json"
df_json = open_json(file_path_json)
df_json = cleanDF(df_json)

file_path_csv = "data/" + site + "/urls_cleaned.csv"
df_csv = open_csv(file_path_csv)

graphDF(df_json, df_csv, site)

# save the file
# df_json.to_csv("data/articles_all/articles" + site +".csv", index=False)
df_json.to_json("data/articles_all/articles" + site +".json", orient='records', lines=True)

In [28]:
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS

df_title = df_json[["title", "description"]].copy()

df_title.dropna(inplace=True)

# parse the title and description into lower case
df_title["title"] = df_title["title"].str.lower()
df_title["description"] = df_title["description"].str.lower()

# remove the punctuation
df_title["title"] = df_title["title"].str.replace(r'[^\w\s]','')
df_title["description"] = df_title["description"].str.replace(r'[^\w\s]','')

# remove the stop words
df_title["title"] = df_title["title"].apply(lambda x: " ".join([word for word in x.split() if word not in STOP_WORDS]))
df_title["description"] = df_title["description"].apply(lambda x: " ".join([word for word in x.split() if word not in STOP_WORDS]))

# remove the numbers
df_title["title"] = df_title["title"].str.replace(r'\d+', '')
df_title["description"] = df_title["description"].str.replace(r'\d+', '')





In [29]:
from collections import Counter

# Function to count word frequencies
def count_word_frequencies(text):
    # Split the text into words and count the occurrences of each word
    word_counts = Counter(text.split())
    return word_counts

# Apply the function to the title and description columns
df_title["title_word_counts"] = df_title["title"].apply(count_word_frequencies)
df_title["description_word_counts"] = df_title["description"].apply(count_word_frequencies)

# Display the DataFrame with word counts
# print(df_title[["title", "title_word_counts", "description", "description_word_counts"]].head())


In [30]:
from collections import Counter

# Combine all word counts from titles and descriptions
all_word_counts = Counter()
for index, row in df_title.iterrows():
    all_word_counts.update(row["title_word_counts"])
    # all_word_counts.update(row["description_word_counts"])

# Get the top 50 most common words
top_50_words = all_word_counts.most_common(50)
top_100_words = all_word_counts.most_common(100)

# Define a set of non-crime-related words
non_crime_words = {
    'new', 'florida', 'texas', 'nyc', 'california', 'says', '2', 'idaho', 'chicago',
    'georgia', 'carolina', 'york', 'city', 'video', 'car', 'home', 'years', 'family',
    'los', 'mom', 'angeles', 'north', 'men', 'state', 'judge', 'officials', '4', 'house',
    'near', 'people', 'sex', 'virginia', 'university', 'court', 'philadelphia', 'fire',
    'subway', 'nypd', '1', 'arizona', 'washington', 'women', 'school', 'teen', '1', '2', '3',
    'mother', 'woman', 'shows', 'girl', 'driver', 'student', 'child', 'search', 'leaves', 'found',
    'man', ':', ','
}

# Filter the top words list to exclude non-crime-related words
top_words_filtered = [(word, count) for word, count in top_100_words if word not in non_crime_words]

# Print the filtered list
print("Top words related to crime:")
for word, count in top_words_filtered:
    print(f"{word}: {count}")


# # Print the top 50 words with their counts
# print("Top 50 most frequent words:")
# for word, count in top_50_words:
#     print(f"{word}: {count}")

# # Print the top 100 words with their counts
# print("Top 100 most frequent words:")
# for word, count in top_100_words:
#     print(f"{word}: {count}")

Top words related to crime:
police: 580
suspect: 222
shooting: 200
murder: 180
allegedly: 150
arrested: 147
charged: 132
shot: 124
accused: 113
death: 107
killed: 103
officer: 88
missing: 88
dead: 80
crime: 78
arrest: 75
report: 71
officers: 65
killing: 63
case: 59
guilty: 55
murders:: 53
alleged: 51
trial: 50
charges: 50
prison: 48
injured: 47
fentanyl: 46
attack: 44
gun: 43
sentenced: 42
crash: 42
killer: 42
suspects: 41
dead,: 41
authorities: 40
convicted: 39
victim: 38
pleads: 35
dies: 35
fatally: 34
armed: 34
shooting:: 34
deputy: 33
cops: 32


In [31]:
# Load English tokenizer, tagger, 
# parser, NER and word vectors 
nlp = spacy.load("en_core_web_sm") 

# process top_words_filtered into a string
top_words = [word for word, count in top_words_filtered]
top_words_str = " ".join(top_words)

# # Process whole documents 
# text = ("""My name is Shaurya Uppal. I enjoy writing 
# 		articles on GeeksforGeeks checkout my other 
# 		article by going to my profile section.""") 

# doc = nlp(text) 

doc = nlp(top_words_str)

# for token in doc: 
#     # print(token, token.lemma_)
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
# 			token.shape_, token.is_alpha, token.is_stop)
    
crime_words = [token.lemma_ for token in doc]
print(crime_words, len(crime_words))

['police', 'suspect', 'shoot', 'murder', 'allegedly', 'arrest', 'charge', 'shot', 'accuse', 'death', 'kill', 'officer', 'miss', 'dead', 'crime', 'arrest', 'report', 'officer', 'kill', 'case', 'guilty', 'murder', ':', 'alleged', 'trial', 'charge', 'prison', 'injure', 'fentanyl', 'attack', 'gun', 'sentence', 'crash', 'killer', 'suspect', 'dead', ',', 'authority', 'convict', 'victim', 'plead', 'die', 'fatally', 'armed', 'shooting', ':', 'deputy', 'cop'] 48
