# Articles 2016- 2020

In [1]:
import pandas as pd
import json
import re
import numpy as np
from datetime import datetime
import glob

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')  # Download the VADER lexicon
import matplotlib.pyplot as plt

import psycopg2
from sqlalchemy import create_engine

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# end df I want
# year, published, source, article, abortion, race, immigration, socio_eco, sentimentscores

## Load all abortion articles

In [2]:
# Define a dictionary to map month names to their numeric values
month_dict = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Define the conversion function
def convert_article(article):
    source = article['source']
    
    published_date = pd.NaT  # Initialize with a "Not a Time" value
    if 'published_date' in article:
        published_date_data = article['published_date']
        year = published_date_data.get('year')
        month = month_dict.get(published_date_data.get('month', 'January'), 1)
        day = published_date_data.get('day')
        if year is not None and month is not None and day is not None:
            try:
                published_date = pd.Timestamp(year=int(year), month=int(month), day=int(day))
            except ValueError:
                pass
            
    content = article['article_content']
    bias = article['bias']

    return {
        'Source': source,
        'Published': published_date,
        'Content': content,
        'Abortion': bias
    }

# Define a list to store the converted articles
all_articles = []

# Get a list of file paths matching the pattern
file_paths = glob.glob('abortion_articles/Abortion_*.json')

# Loop through the file paths
for file_path in file_paths:
    # Load the JSON file
    with open(file_path) as file:
        json_data = json.load(file)
        articles = json_data['Articles']
        
        # Convert each article and append to the list
        for article in articles:
            converted_article = convert_article(article)
            all_articles.append(converted_article)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_articles)

# Convert the Published column to datetime
df['Published'] = pd.to_datetime(df['Published'])

# Fill missing month values in the Published column with 1
df['Published'].fillna(pd.to_datetime('2016-01-01'), inplace=True)

# Extract the year from the 'Published' column
df['Year'] = df['Published'].dt.year

# Print the resulting DataFrame
df
# gives 2029 rows × 5 columns on abortion

Unnamed: 0,Source,Published,Content,Abortion,Year
0,CBS,2016-11-16,"In the wake of the election, the Anti-Defamati...",planned parenthood,2016
1,CBS,2016-12-15,U.S. intelligence sources are not just pointin...,planned parenthood,2016
2,CBS,2017-06-20,Tuesday's special election in Georgia's 6th Co...,planned parenthood,2017
3,CBS,2017-06-22,"Senate Republicans unveiled a ""discussion draf...",planned parenthood,2017
4,CBS,2017-04-04,Chelsea Clinton on Tuesday expressed doubt tha...,planned parenthood,2017
...,...,...,...,...,...
2024,NPR,2020-11-01,As the nation careens into the 2020 presidenti...,reproductive rights,2020
2025,NPR,2020-10-27,If you find yourself fighting with a friend ov...,reproductive rights,2020
2026,NPR,2020-09-17,Postmaster General Louis DeJoy arrives to test...,reproductive rights,2020
2027,NPR,2020-07-17,"Rep, John Lewis, who spoke at the 1963 March o...",reproductive rights,2020


In [3]:
# Check for NaN values in the DataFrame
nan_rows = df[df.isna().any(axis=1)]

# Print the rows with NaN values
print(nan_rows)


Empty DataFrame
Columns: [Source, Published, Content, Abortion, Year]
Index: []


In [4]:
# Check for duplicate rows with all column content the same
duplicate_rows = df.duplicated(keep=False)

# Print the duplicate rows
print(df[duplicate_rows])

    Source  Published                                            Content   
404    CNN 2016-01-01                                                     \
411    CNN 2016-01-01                                                      
447    CNN 2016-01-01                                                      
503    CNN 2016-01-01                                                      
517    CNN 2016-01-01                                                      
648    CNN 2016-01-01                                                      
650    CNN 2016-01-01                                                      
654    CNN 2016-01-01                                                      
667    CNN 2016-01-01                                                      
668    CNN 2016-01-01                                                      
680    CNN 2016-01-01                                                      
697    CNN 2016-01-01                                                      
698    CNN 2

In [5]:
# Remove rows with empty "Content" column
df = df[df['Content'] != '']

# Print the updated DataFrame
print(df)
# gives [1993 rows x 5 columns]

     Source  Published                                            Content   
0       CBS 2016-11-16  In the wake of the election, the Anti-Defamati...  \
1       CBS 2016-12-15  U.S. intelligence sources are not just pointin...   
2       CBS 2017-06-20  Tuesday's special election in Georgia's 6th Co...   
3       CBS 2017-06-22  Senate Republicans unveiled a "discussion draf...   
4       CBS 2017-04-04  Chelsea Clinton on Tuesday expressed doubt tha...   
...     ...        ...                                                ...   
2024    NPR 2020-11-01  As the nation careens into the 2020 presidenti...   
2025    NPR 2020-10-27  If you find yourself fighting with a friend ov...   
2026    NPR 2020-09-17  Postmaster General Louis DeJoy arrives to test...   
2027    NPR 2020-07-17  Rep, John Lewis, who spoke at the 1963 March o...   
2028    NPR 2020-02-11  A voter leaves a polling booth at the Ward Fiv...   

                 Abortion  Year  
0      planned parenthood  2016  
1      

In [6]:
# Remove duplicate rows when present
df = df.drop_duplicates()

# Print the DataFrame without duplicates
df
# gives 1992 rows × 5 columns

Unnamed: 0,Source,Published,Content,Abortion,Year
0,CBS,2016-11-16,"In the wake of the election, the Anti-Defamati...",planned parenthood,2016
1,CBS,2016-12-15,U.S. intelligence sources are not just pointin...,planned parenthood,2016
2,CBS,2017-06-20,Tuesday's special election in Georgia's 6th Co...,planned parenthood,2017
3,CBS,2017-06-22,"Senate Republicans unveiled a ""discussion draf...",planned parenthood,2017
4,CBS,2017-04-04,Chelsea Clinton on Tuesday expressed doubt tha...,planned parenthood,2017
...,...,...,...,...,...
2024,NPR,2020-11-01,As the nation careens into the 2020 presidenti...,reproductive rights,2020
2025,NPR,2020-10-27,If you find yourself fighting with a friend ov...,reproductive rights,2020
2026,NPR,2020-09-17,Postmaster General Louis DeJoy arrives to test...,reproductive rights,2020
2027,NPR,2020-07-17,"Rep, John Lewis, who spoke at the 1963 March o...",reproductive rights,2020


In [7]:
# Arrange columns: Move the Year column to the front
new_cols = ['Year', 'Source', 'Published', 'Content', 'Abortion']
abortion_data = df.reindex(columns=new_cols)
abortion_data
# gives 1992 rows 5 columns

Unnamed: 0,Year,Source,Published,Content,Abortion
0,2016,CBS,2016-11-16,"In the wake of the election, the Anti-Defamati...",planned parenthood
1,2016,CBS,2016-12-15,U.S. intelligence sources are not just pointin...,planned parenthood
2,2017,CBS,2017-06-20,Tuesday's special election in Georgia's 6th Co...,planned parenthood
3,2017,CBS,2017-06-22,"Senate Republicans unveiled a ""discussion draf...",planned parenthood
4,2017,CBS,2017-04-04,Chelsea Clinton on Tuesday expressed doubt tha...,planned parenthood
...,...,...,...,...,...
2024,2020,NPR,2020-11-01,As the nation careens into the 2020 presidenti...,reproductive rights
2025,2020,NPR,2020-10-27,If you find yourself fighting with a friend ov...,reproductive rights
2026,2020,NPR,2020-09-17,Postmaster General Louis DeJoy arrives to test...,reproductive rights
2027,2020,NPR,2020-07-17,"Rep, John Lewis, who spoke at the 1963 March o...",reproductive rights


### Clean the data: Estimated time
19 minutes on 1992 abortion articles

In [8]:
def trim_length(text):
    '''
    This method removes the repeating characters that are repeated more than 2 times
    '''
    pat = re.compile(r"(.)\1{2,}")
    return pat.sub(r"\1\1", text)

def word_correct(myStr):
    '''
    This method accepts a string and removes the meta characters from it
    '''
    myStr = str(myStr)
    pat = re.compile(r'[^a-zA-Z1-9]+')
    corrected_str = ''
    splits = myStr.split()
    for word in splits:
        word = word.strip()
        word = re.sub(pat, '', word).lower()
        word = trim_length(word)
        corrected_str = corrected_str+word+' '
    corrected_str = corrected_str.strip()
    return corrected_str

def get_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_word(myStr):
    myStr = str(myStr)
    lemmatizer = WordNetLemmatizer()
    lst_word = nltk.word_tokenize(myStr)
    final_str=''
    for word in lst_word:
        word = lemmatizer.lemmatize(word, get_pos(word))
        final_str = final_str+word+' '

    final_str = final_str.strip()
    return final_str

abortion_data['Content'] = abortion_data['Content'].apply(lambda i:word_correct(i))
abortion_data['Content'] = abortion_data['Content'].apply(lambda i:lemmatize_word(i))
print('Done')
# took about 19 minutes on 1992 abortion articles

Done


In [11]:
# view data
abortion_data

Unnamed: 0,Year,Source,Published,Content,Abortion
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood
...,...,...,...,...,...
2024,2020,NPR,2020-11-01,a the nation careen into the 22 presidential e...,reproductive rights
2025,2020,NPR,2020-10-27,if you find yourself fight with a friend over ...,reproductive rights
2026,2020,NPR,2020-09-17,postmaster general louis dejoy arrives to test...,reproductive rights
2027,2020,NPR,2020-07-17,rep john lewis who spoke at the 1963 march on ...,reproductive rights


## Get sentiment scores

In [12]:
def get_sentiment_scores(df):
    # Initialize the VADER sentiment analyzer
    sia = SentimentIntensityAnalyzer()
        # Create a new column for sentiment scores
    df['SentimentScores'] = df['Content'].apply(lambda tweet: sia.polarity_scores(tweet))
    
    # Remove the 'neu' and 'compound' columns from the SentimentScores column
    df['SentimentScores'] = df['SentimentScores'].apply(lambda scores: {k: v for k, v in scores.items() if k not in ['neu', 'compound']})
    
    return df

In [13]:
# Call sentiment function
abortion_sentiment = get_sentiment_scores(abortion_data)
abortion_sentiment

Unnamed: 0,Year,Source,Published,Content,Abortion,SentimentScores
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,"{'neg': 0.146, 'pos': 0.073}"
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,"{'neg': 0.072, 'pos': 0.086}"
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,"{'neg': 0.087, 'pos': 0.059}"
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,"{'neg': 0.055, 'pos': 0.079}"
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,"{'neg': 0.061, 'pos': 0.141}"
...,...,...,...,...,...,...
2024,2020,NPR,2020-11-01,a the nation careen into the 22 presidential e...,reproductive rights,"{'neg': 0.121, 'pos': 0.113}"
2025,2020,NPR,2020-10-27,if you find yourself fight with a friend over ...,reproductive rights,"{'neg': 0.138, 'pos': 0.103}"
2026,2020,NPR,2020-09-17,postmaster general louis dejoy arrives to test...,reproductive rights,"{'neg': 0.07, 'pos': 0.063}"
2027,2020,NPR,2020-07-17,rep john lewis who spoke at the 1963 march on ...,reproductive rights,"{'neg': 0.086, 'pos': 0.129}"


In [14]:
# show values
abortion_sentiment['SentimentScores'].value_counts()

SentimentScores
{'neg': 0.049, 'pos': 0.103}    3
{'neg': 0.061, 'pos': 0.123}    3
{'neg': 0.096, 'pos': 0.116}    3
{'neg': 0.032, 'pos': 0.106}    3
{'neg': 0.067, 'pos': 0.095}    3
                               ..
{'neg': 0.146, 'pos': 0.074}    1
{'neg': 0.145, 'pos': 0.116}    1
{'neg': 0.04, 'pos': 0.079}     1
{'neg': 0.088, 'pos': 0.09}     1
{'neg': 0.07, 'pos': 0.063}     1
Name: count, Length: 1869, dtype: int64

In [15]:
# Extract the sentiment scores from the 'SentimentScores' column to normalize them
abortion_sentiment['Norm_Neg'] = abortion_sentiment['SentimentScores'].apply(lambda x: x['neg'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
abortion_sentiment['Norm_Pos'] = abortion_sentiment['SentimentScores'].apply(lambda x: x['pos'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
abortion_sentiment

Unnamed: 0,Year,Source,Published,Content,Abortion,SentimentScores,Norm_Neg,Norm_Pos
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,"{'neg': 0.146, 'pos': 0.073}",0.666667,0.333333
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,"{'neg': 0.072, 'pos': 0.086}",0.455696,0.544304
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,"{'neg': 0.087, 'pos': 0.059}",0.595890,0.404110
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,"{'neg': 0.055, 'pos': 0.079}",0.410448,0.589552
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,"{'neg': 0.061, 'pos': 0.141}",0.301980,0.698020
...,...,...,...,...,...,...,...,...
2024,2020,NPR,2020-11-01,a the nation careen into the 22 presidential e...,reproductive rights,"{'neg': 0.121, 'pos': 0.113}",0.517094,0.482906
2025,2020,NPR,2020-10-27,if you find yourself fight with a friend over ...,reproductive rights,"{'neg': 0.138, 'pos': 0.103}",0.572614,0.427386
2026,2020,NPR,2020-09-17,postmaster general louis dejoy arrives to test...,reproductive rights,"{'neg': 0.07, 'pos': 0.063}",0.526316,0.473684
2027,2020,NPR,2020-07-17,rep john lewis who spoke at the 1963 march on ...,reproductive rights,"{'neg': 0.086, 'pos': 0.129}",0.400000,0.600000


In [16]:
# This function only gets the compound sentiment
def get_compound_sentiment(df):
    # Initialize the VADER sentiment analyzer
    sia = SentimentIntensityAnalyzer()
        # Create a new column for sentiment scores
    df['SentimentScores'] = df['Content'].apply(lambda tweet: sia.polarity_scores(tweet))
    
    # Remove the 'neu' column from the SentimentScores column
    df['Compound'] = df['SentimentScores'].apply(lambda scores: {k: v for k, v in scores.items() if k not in ['neu','pos','neg']})
    
    return df

In [17]:
# run compound sentiment on All articles
abortion_all_sentiment = get_compound_sentiment(abortion_sentiment)
# Drop Sentiment Scores column
abortion_all_sentiment = abortion_sentiment.drop(['SentimentScores'], axis=1)
abortion_all_sentiment
# takes 28 seconds

Unnamed: 0,Year,Source,Published,Content,Abortion,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,0.666667,0.333333,{'compound': -0.9989}
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,0.455696,0.544304,{'compound': 0.8768}
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,0.595890,0.404110,{'compound': -0.9698}
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,0.410448,0.589552,{'compound': 0.983}
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,0.301980,0.698020,{'compound': 0.9893}
...,...,...,...,...,...,...,...,...
2024,2020,NPR,2020-11-01,a the nation careen into the 22 presidential e...,reproductive rights,0.517094,0.482906,{'compound': -0.9826}
2025,2020,NPR,2020-10-27,if you find yourself fight with a friend over ...,reproductive rights,0.572614,0.427386,{'compound': -0.9961}
2026,2020,NPR,2020-09-17,postmaster general louis dejoy arrives to test...,reproductive rights,0.526316,0.473684,{'compound': -0.8176}
2027,2020,NPR,2020-07-17,rep john lewis who spoke at the 1963 march on ...,reproductive rights,0.400000,0.600000,{'compound': 0.999}


In [18]:
# Extract the compound value from the dictionary
# Update the "Compound" column
abortion_all_sentiment['Compound'] = abortion_all_sentiment['Compound'].apply(lambda x: x['compound'])

# Convert the "Compound" column to a numeric data type
abortion_all_sentiment['Compound'] = pd.to_numeric(abortion_all_sentiment['Compound'])

# Print the updated DataFrame
abortion_all_sentiment
# gives 1992 rows × 8 columns

Unnamed: 0,Year,Source,Published,Content,Abortion,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,0.666667,0.333333,-0.9989
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,0.455696,0.544304,0.8768
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,0.595890,0.404110,-0.9698
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,0.410448,0.589552,0.9830
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,0.301980,0.698020,0.9893
...,...,...,...,...,...,...,...,...
2024,2020,NPR,2020-11-01,a the nation careen into the 22 presidential e...,reproductive rights,0.517094,0.482906,-0.9826
2025,2020,NPR,2020-10-27,if you find yourself fight with a friend over ...,reproductive rights,0.572614,0.427386,-0.9961
2026,2020,NPR,2020-09-17,postmaster general louis dejoy arrives to test...,reproductive rights,0.526316,0.473684,-0.8176
2027,2020,NPR,2020-07-17,rep john lewis who spoke at the 1963 march on ...,reproductive rights,0.400000,0.600000,0.9990


In [19]:
# Because we need to add a column to postgress before this data is uploaded and because the text is auto-lowercase,
# Postgress requires change the name of the 'Abortion' column to 'abortion'
abortion_all_sentiment.rename(columns={'Abortion': 'abortion'}, inplace=True)
abortion_all_sentiment

Unnamed: 0,Year,Source,Published,Content,abortion,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,0.666667,0.333333,-0.9989
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,0.455696,0.544304,0.8768
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,0.595890,0.404110,-0.9698
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,0.410448,0.589552,0.9830
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,0.301980,0.698020,0.9893
...,...,...,...,...,...,...,...,...
2024,2020,NPR,2020-11-01,a the nation careen into the 22 presidential e...,reproductive rights,0.517094,0.482906,-0.9826
2025,2020,NPR,2020-10-27,if you find yourself fight with a friend over ...,reproductive rights,0.572614,0.427386,-0.9961
2026,2020,NPR,2020-09-17,postmaster general louis dejoy arrives to test...,reproductive rights,0.526316,0.473684,-0.8176
2027,2020,NPR,2020-07-17,rep john lewis who spoke at the 1963 march on ...,reproductive rights,0.400000,0.600000,0.9990


# Load Race articles

In [19]:
# Define a dictionary to map month names to their numeric values
month_dict = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Define the conversion function
def convert_article(article):
    source = article['source']
    
    published_date = pd.NaT  # Initialize with a "Not a Time" value
    if 'published_date' in article:
        published_date_data = article['published_date']
        year = published_date_data.get('year')
        month = month_dict.get(published_date_data.get('month', 'January'), 1)
        day = published_date_data.get('day')
        if year is not None and month is not None and day is not None:
            try:
                published_date = pd.Timestamp(year=int(year), month=int(month), day=int(day))
            except ValueError:
                pass
            
    content = article['article_content']
    bias = article['bias']

    return {
        'Source': source,
        'Published': published_date,
        'Content': content,
        'Race': bias
    }

# Define a list to store the converted articles
all_articles = []

# Get a list of file paths matching the pattern
file_paths = glob.glob('race_articles/racial_*.json')

# Loop through the file paths
for file_path in file_paths:
    # Load the JSON file
    with open(file_path) as file:
        json_data = json.load(file)
        articles = json_data['Articles']
        
        # Convert each article and append to the list
        for article in articles:
            converted_article = convert_article(article)
            all_articles.append(converted_article)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_articles)

# Convert the Published column to datetime
df['Published'] = pd.to_datetime(df['Published'])

# Fill missing month values in the Published column with 1
df['Published'].fillna(pd.to_datetime('2016-01-01'), inplace=True)

# Extract the year from the 'Published' column
df['Year'] = df['Published'].dt.year

# Print the resulting DataFrame
df
# gives 3048 articles

Unnamed: 0,Source,Published,Content,Race,Year
0,CBS,2016-11-23,"For “Sex and the City” star Kristin Davis, the...",white privilege,2016
1,CBS,2016-12-08,"Over the next two months, leading up to his in...",white privilege,2016
2,CBS,2016-11-27,"JOHN DICKERSON, CBS HOST: Today on FACE THE NA...",white privilege,2016
3,CBS,2016-04-16,"Produced by Lourdes Aguiar, Peter Shaw, and Je...",white privilege,2016
4,CBS,2016-12-06,Will the tale of two Steves end in warfare wit...,white privilege,2016
...,...,...,...,...,...
3043,NPR,2020-09-24,Vehicles make their way along Interstate 205 S...,BLM,2020
3044,NPR,2020-07-15,Georgia Sen. Kelly Loeffler is a co-owner of A...,BLM,2020
3045,NPR,2020-05-28,"Because of the pandemic, the Bureau of Land Ma...",BLM,2020
3046,NPR,2020-08-10,Western GOP senators facing tough reelection b...,BLM,2020


In [20]:
# Check for NaN values in the DataFrame
nan_rows = df[df.isna().any(axis=1)]

# Print the rows with NaN values
print(nan_rows)

Empty DataFrame
Columns: [Source, Published, Content, Race, Year]
Index: []


In [21]:
# Remove rows with empty "Content" column
df = df[df['Content'] != '']

# Print the updated DataFrame
df
# gives 2954 race articles

Unnamed: 0,Source,Published,Content,Race,Year
0,CBS,2016-11-23,"For “Sex and the City” star Kristin Davis, the...",white privilege,2016
1,CBS,2016-12-08,"Over the next two months, leading up to his in...",white privilege,2016
2,CBS,2016-11-27,"JOHN DICKERSON, CBS HOST: Today on FACE THE NA...",white privilege,2016
3,CBS,2016-04-16,"Produced by Lourdes Aguiar, Peter Shaw, and Je...",white privilege,2016
4,CBS,2016-12-06,Will the tale of two Steves end in warfare wit...,white privilege,2016
...,...,...,...,...,...
3043,NPR,2020-09-24,Vehicles make their way along Interstate 205 S...,BLM,2020
3044,NPR,2020-07-15,Georgia Sen. Kelly Loeffler is a co-owner of A...,BLM,2020
3045,NPR,2020-05-28,"Because of the pandemic, the Bureau of Land Ma...",BLM,2020
3046,NPR,2020-08-10,Western GOP senators facing tough reelection b...,BLM,2020


In [22]:
# Check for duplicate rows with all column content the same
duplicate_rows = df.duplicated(keep=False)

# Print the duplicate rows
print(df[duplicate_rows])

     Source  Published                                            Content   
155     CBS 2020-11-16  On election night, 2016, then-President Barack...  \
159     CBS 2020-11-16  On election night, 2016, then-President Barack...   
2019    FOX 2020-06-03  Presumptive Democratic presidential nominee Jo...   
2071    FOX 2020-06-03  Presumptive Democratic presidential nominee Jo...   

                       Race  Year  
155   racial discrimination  2020  
159   racial discrimination  2020  
2019                 racial  2020  
2071                 racial  2020  


In [23]:
# Remove duplicate rows when present
df = df.drop_duplicates()

# Print the DataFrame without duplicates
df
# gives 2952 articles

Unnamed: 0,Source,Published,Content,Race,Year
0,CBS,2016-11-23,"For “Sex and the City” star Kristin Davis, the...",white privilege,2016
1,CBS,2016-12-08,"Over the next two months, leading up to his in...",white privilege,2016
2,CBS,2016-11-27,"JOHN DICKERSON, CBS HOST: Today on FACE THE NA...",white privilege,2016
3,CBS,2016-04-16,"Produced by Lourdes Aguiar, Peter Shaw, and Je...",white privilege,2016
4,CBS,2016-12-06,Will the tale of two Steves end in warfare wit...,white privilege,2016
...,...,...,...,...,...
3043,NPR,2020-09-24,Vehicles make their way along Interstate 205 S...,BLM,2020
3044,NPR,2020-07-15,Georgia Sen. Kelly Loeffler is a co-owner of A...,BLM,2020
3045,NPR,2020-05-28,"Because of the pandemic, the Bureau of Land Ma...",BLM,2020
3046,NPR,2020-08-10,Western GOP senators facing tough reelection b...,BLM,2020


In [25]:
# Arrange columns: Move the Year column to the front
new_cols = ['Year', 'Source', 'Published', 'Content', 'Race']
race_data = df.reindex(columns=new_cols)
race_data
# gives 1992 rows 5 columns

Unnamed: 0,Year,Source,Published,Content,Race
0,2016,CBS,2016-11-23,"For “Sex and the City” star Kristin Davis, the...",white privilege
1,2016,CBS,2016-12-08,"Over the next two months, leading up to his in...",white privilege
2,2016,CBS,2016-11-27,"JOHN DICKERSON, CBS HOST: Today on FACE THE NA...",white privilege
3,2016,CBS,2016-04-16,"Produced by Lourdes Aguiar, Peter Shaw, and Je...",white privilege
4,2016,CBS,2016-12-06,Will the tale of two Steves end in warfare wit...,white privilege
...,...,...,...,...,...
3043,2020,NPR,2020-09-24,Vehicles make their way along Interstate 205 S...,BLM
3044,2020,NPR,2020-07-15,Georgia Sen. Kelly Loeffler is a co-owner of A...,BLM
3045,2020,NPR,2020-05-28,"Because of the pandemic, the Bureau of Land Ma...",BLM
3046,2020,NPR,2020-08-10,Western GOP senators facing tough reelection b...,BLM


### Clean the data: Estimated time
33 minutes on 2952 race articles

In [26]:
# call cleaning functions
race_data['Content'] = race_data['Content'].apply(lambda i:word_correct(i))
race_data['Content'] = race_data['Content'].apply(lambda i:lemmatize_word(i))
print('Done')

Done


### Get sentiment scores

In [28]:
# Call sentiment function (this takes about a minute)
race_sentiment = get_sentiment_scores(race_data)
race_sentiment

Unnamed: 0,Year,Source,Published,Content,Race,SentimentScores
0,2016,CBS,2016-11-23,for sex and the city star kristin davis the el...,white privilege,"{'neg': 0.129, 'pos': 0.117}"
1,2016,CBS,2016-12-08,over the next two month lead up to his inaugur...,white privilege,"{'neg': 0.054, 'pos': 0.123}"
2,2016,CBS,2016-11-27,john dickerson cbs host today on face the nati...,white privilege,"{'neg': 0.05, 'pos': 0.159}"
3,2016,CBS,2016-04-16,produce by lourdes aguiar peter shaw and jenni...,white privilege,"{'neg': 0.108, 'pos': 0.084}"
4,2016,CBS,2016-12-06,will the tale of two steves end in warfare wit...,white privilege,"{'neg': 0.062, 'pos': 0.082}"
...,...,...,...,...,...,...
3043,2020,NPR,2020-09-24,vehicle make their way along interstate 25 sou...,BLM,"{'neg': 0.07, 'pos': 0.115}"
3044,2020,NPR,2020-07-15,georgia sen kelly loeffler be a coowner of atl...,BLM,"{'neg': 0.113, 'pos': 0.124}"
3045,2020,NPR,2020-05-28,because of the pandemic the bureau of land man...,BLM,"{'neg': 0.073, 'pos': 0.082}"
3046,2020,NPR,2020-08-10,western gop senator face tough reelection bid ...,BLM,"{'neg': 0.063, 'pos': 0.049}"


In [29]:
# Extract the sentiment scores from the 'SentimentScores' column to normalize them
race_sentiment['Norm_Neg'] = race_sentiment['SentimentScores'].apply(lambda x: x['neg'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
race_sentiment['Norm_Pos'] = race_sentiment['SentimentScores'].apply(lambda x: x['pos'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
race_sentiment

Unnamed: 0,Year,Source,Published,Content,Race,SentimentScores,Norm_Neg,Norm_Pos
0,2016,CBS,2016-11-23,for sex and the city star kristin davis the el...,white privilege,"{'neg': 0.129, 'pos': 0.117}",0.524390,0.475610
1,2016,CBS,2016-12-08,over the next two month lead up to his inaugur...,white privilege,"{'neg': 0.054, 'pos': 0.123}",0.305085,0.694915
2,2016,CBS,2016-11-27,john dickerson cbs host today on face the nati...,white privilege,"{'neg': 0.05, 'pos': 0.159}",0.239234,0.760766
3,2016,CBS,2016-04-16,produce by lourdes aguiar peter shaw and jenni...,white privilege,"{'neg': 0.108, 'pos': 0.084}",0.562500,0.437500
4,2016,CBS,2016-12-06,will the tale of two steves end in warfare wit...,white privilege,"{'neg': 0.062, 'pos': 0.082}",0.430556,0.569444
...,...,...,...,...,...,...,...,...
3043,2020,NPR,2020-09-24,vehicle make their way along interstate 25 sou...,BLM,"{'neg': 0.07, 'pos': 0.115}",0.378378,0.621622
3044,2020,NPR,2020-07-15,georgia sen kelly loeffler be a coowner of atl...,BLM,"{'neg': 0.113, 'pos': 0.124}",0.476793,0.523207
3045,2020,NPR,2020-05-28,because of the pandemic the bureau of land man...,BLM,"{'neg': 0.073, 'pos': 0.082}",0.470968,0.529032
3046,2020,NPR,2020-08-10,western gop senator face tough reelection bid ...,BLM,"{'neg': 0.063, 'pos': 0.049}",0.562500,0.437500


In [30]:
# run compound sentiment on All articles (this takes about a minute)
race_all_sentiment = get_compound_sentiment(race_sentiment)
# Drop Sentiment Scores column
race_all_sentiment = race_sentiment.drop(['SentimentScores'], axis=1)
race_all_sentiment
# takes 28 seconds

Unnamed: 0,Year,Source,Published,Content,Race,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-23,for sex and the city star kristin davis the el...,white privilege,0.524390,0.475610,{'compound': -0.8677}
1,2016,CBS,2016-12-08,over the next two month lead up to his inaugur...,white privilege,0.305085,0.694915,{'compound': 0.9999}
2,2016,CBS,2016-11-27,john dickerson cbs host today on face the nati...,white privilege,0.239234,0.760766,{'compound': 1.0}
3,2016,CBS,2016-04-16,produce by lourdes aguiar peter shaw and jenni...,white privilege,0.562500,0.437500,{'compound': -0.9996}
4,2016,CBS,2016-12-06,will the tale of two steves end in warfare wit...,white privilege,0.430556,0.569444,{'compound': 0.9631}
...,...,...,...,...,...,...,...,...
3043,2020,NPR,2020-09-24,vehicle make their way along interstate 25 sou...,BLM,0.378378,0.621622,{'compound': 0.9917}
3044,2020,NPR,2020-07-15,georgia sen kelly loeffler be a coowner of atl...,BLM,0.476793,0.523207,{'compound': 0.5089}
3045,2020,NPR,2020-05-28,because of the pandemic the bureau of land man...,BLM,0.470968,0.529032,{'compound': 0.9187}
3046,2020,NPR,2020-08-10,western gop senator face tough reelection bid ...,BLM,0.562500,0.437500,{'compound': -0.6643}


In [31]:
# Extract to compound value from the dictionary
# Update the "Compound" column
race_all_sentiment['Compound'] = race_all_sentiment['Compound'].apply(lambda x: x['compound'])

# Convert the "Compound" column to a numeric data type
race_all_sentiment['Compound'] = pd.to_numeric(race_all_sentiment['Compound'])

# Print the updated DataFrame
race_all_sentiment
# gives 1992 rows × 8 columns

Unnamed: 0,Year,Source,Published,Content,Race,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-23,for sex and the city star kristin davis the el...,white privilege,0.524390,0.475610,-0.8677
1,2016,CBS,2016-12-08,over the next two month lead up to his inaugur...,white privilege,0.305085,0.694915,0.9999
2,2016,CBS,2016-11-27,john dickerson cbs host today on face the nati...,white privilege,0.239234,0.760766,1.0000
3,2016,CBS,2016-04-16,produce by lourdes aguiar peter shaw and jenni...,white privilege,0.562500,0.437500,-0.9996
4,2016,CBS,2016-12-06,will the tale of two steves end in warfare wit...,white privilege,0.430556,0.569444,0.9631
...,...,...,...,...,...,...,...,...
3043,2020,NPR,2020-09-24,vehicle make their way along interstate 25 sou...,BLM,0.378378,0.621622,0.9917
3044,2020,NPR,2020-07-15,georgia sen kelly loeffler be a coowner of atl...,BLM,0.476793,0.523207,0.5089
3045,2020,NPR,2020-05-28,because of the pandemic the bureau of land man...,BLM,0.470968,0.529032,0.9187
3046,2020,NPR,2020-08-10,western gop senator face tough reelection bid ...,BLM,0.562500,0.437500,-0.6643


In [32]:
test_df = pd.concat([abortion_all_sentiment, race_all_sentiment], ignore_index=True, keys=['abortion', 'Race'])
test_df
# This is working can remove before final upload

Unnamed: 0,Year,Source,Published,Content,Abortion,Norm_Neg,Norm_Pos,Compound,Race
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,0.666667,0.333333,-0.9989,
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,0.455696,0.544304,0.8768,
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,0.595890,0.404110,-0.9698,
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,0.410448,0.589552,0.9830,
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,0.301980,0.698020,0.9893,
...,...,...,...,...,...,...,...,...,...
4939,2020,NPR,2020-09-24,vehicle make their way along interstate 25 sou...,,0.378378,0.621622,0.9917,BLM
4940,2020,NPR,2020-07-15,georgia sen kelly loeffler be a coowner of atl...,,0.476793,0.523207,0.5089,BLM
4941,2020,NPR,2020-05-28,because of the pandemic the bureau of land man...,,0.470968,0.529032,0.9187,BLM
4942,2020,NPR,2020-08-10,western gop senator face tough reelection bid ...,,0.562500,0.437500,-0.6643,BLM


# Load Immigration articles

In [33]:
# Define a dictionary to map month names to their numeric values
month_dict = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Define the conversion function
def convert_article(article):
    source = article['source']
    
    published_date = pd.NaT  # Initialize with a "Not a Time" value
    if 'published_date' in article:
        published_date_data = article['published_date']
        year = published_date_data.get('year')
        month = month_dict.get(published_date_data.get('month', 'January'), 1)
        day = published_date_data.get('day')
        if year is not None and month is not None and day is not None:
            try:
                published_date = pd.Timestamp(year=int(year), month=int(month), day=int(day))
            except ValueError:
                pass
            
    content = article['article_content']
    bias = article['bias']

    return {
        'Source': source,
        'Published': published_date,
        'Content': content,
        'Immigration': bias
    }

# Define a list to store the converted articles
all_articles = []

# Get a list of file paths matching the pattern
file_paths = glob.glob('immigration_articles/immigration_*.json')

# Loop through the file paths
for file_path in file_paths:
    # Load the JSON file
    with open(file_path) as file:
        json_data = json.load(file)
        articles = json_data['Articles']
        
        # Convert each article and append to the list
        for article in articles:
            converted_article = convert_article(article)
            all_articles.append(converted_article)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_articles)

# Convert the Published column to datetime
df['Published'] = pd.to_datetime(df['Published'])

# Fill missing month values in the Published column with 1
df['Published'].fillna(pd.to_datetime('2016-01-01'), inplace=True)

# Extract the year from the 'Published' column
df['Year'] = df['Published'].dt.year

# Print the resulting DataFrame
df

Unnamed: 0,Source,Published,Content,Immigration,Year
0,CBS,2016-11-09,The Canadian government website for immigratio...,immigration,2016
1,CBS,2016-11-18,CHICAGO — Immigration hotlines are buzzing. Le...,immigration,2016
2,CBS,2016-11-09,As Donald Trump pulled off a stunning upset to...,immigration,2016
3,CBS,2016-11-09,"After casting their votes, Americans headed ho...",immigration,2016
4,CBS,2016-12-08,BERLIN -- German Chancellor Angela Merkel’s pa...,immigration,2016
...,...,...,...,...,...
8212,NPR,2020-09-13,Maria Hinojosa has dedicated her career to tel...,xenophobia,2020
8213,NPR,2020-01-26,The phone messaging app WhatsApp is beloved by...,xenophobia,2020
8214,NPR,2020-01-25,The senator and Democratic presidential hopefu...,xenophobia,2020
8215,NPR,2020-12-14,All four critics agreed that Michaela Coel's s...,xenophobia,2020


In [34]:
# Check for NaN values in the DataFrame
nan_rows = df[df.isna().any(axis=1)]

# Print the rows with NaN values
print(nan_rows)

Empty DataFrame
Columns: [Source, Published, Content, Immigration, Year]
Index: []


In [35]:
# Remove rows with empty "Content" column
df = df[df['Content'] != '']

# Print the updated DataFrame
df
# gives 

Unnamed: 0,Source,Published,Content,Immigration,Year
0,CBS,2016-11-09,The Canadian government website for immigratio...,immigration,2016
1,CBS,2016-11-18,CHICAGO — Immigration hotlines are buzzing. Le...,immigration,2016
2,CBS,2016-11-09,As Donald Trump pulled off a stunning upset to...,immigration,2016
3,CBS,2016-11-09,"After casting their votes, Americans headed ho...",immigration,2016
4,CBS,2016-12-08,BERLIN -- German Chancellor Angela Merkel’s pa...,immigration,2016
...,...,...,...,...,...
8212,NPR,2020-09-13,Maria Hinojosa has dedicated her career to tel...,xenophobia,2020
8213,NPR,2020-01-26,The phone messaging app WhatsApp is beloved by...,xenophobia,2020
8214,NPR,2020-01-25,The senator and Democratic presidential hopefu...,xenophobia,2020
8215,NPR,2020-12-14,All four critics agreed that Michaela Coel's s...,xenophobia,2020


In [36]:
# Check for duplicate rows with all column content the same
duplicate_rows = df.duplicated(keep=False)

# Print the duplicate rows
print(df[duplicate_rows])

     Source  Published                                            Content   
2001    CBS 2020-05-18  It was precisely 100 years ago that, coming of...  \
2002    CBS 2020-05-18  It was precisely 100 years ago that, coming of...   
4537    CNN 2019-06-27  Julián Castro walked onto the debate stage in ...   
4538    CNN 2019-06-27  Julián Castro walked onto the debate stage in ...   
4774    CNN 2019-10-15  The United States is a democracy, which means ...   
4775    CNN 2019-10-15  The United States is a democracy, which means ...   
4870    CNN 2019-09-18  Editor’s Note: Anushay Hossain is a political ...   
4871    CNN 2019-09-18  Editor’s Note: Anushay Hossain is a political ...   
6343    FOX 2020-10-01  Closeup of groom placing a wedding ring on the...   
6344    FOX 2020-10-01  Closeup of groom placing a wedding ring on the...   
6870    NPR 2016-02-25  Alfredo Trejo, 18, came to the U.S. from El Sa...   
6871    NPR 2016-02-25  Alfredo Trejo, 18, came to the U.S. from El Sa...   

In [37]:
# Remove duplicate rows when present
df = df.drop_duplicates()

# Print the DataFrame without duplicates
df
# gives 

Unnamed: 0,Source,Published,Content,Immigration,Year
0,CBS,2016-11-09,The Canadian government website for immigratio...,immigration,2016
1,CBS,2016-11-18,CHICAGO — Immigration hotlines are buzzing. Le...,immigration,2016
2,CBS,2016-11-09,As Donald Trump pulled off a stunning upset to...,immigration,2016
3,CBS,2016-11-09,"After casting their votes, Americans headed ho...",immigration,2016
4,CBS,2016-12-08,BERLIN -- German Chancellor Angela Merkel’s pa...,immigration,2016
...,...,...,...,...,...
8212,NPR,2020-09-13,Maria Hinojosa has dedicated her career to tel...,xenophobia,2020
8213,NPR,2020-01-26,The phone messaging app WhatsApp is beloved by...,xenophobia,2020
8214,NPR,2020-01-25,The senator and Democratic presidential hopefu...,xenophobia,2020
8215,NPR,2020-12-14,All four critics agreed that Michaela Coel's s...,xenophobia,2020


In [38]:
# Arrange columns: Move the Year column to the front
new_cols = ['Year', 'Source', 'Published', 'Content', 'Immigration']
immigration_data = df.reindex(columns=new_cols)
immigration_data
# gives 8013 rows 5 columns

Unnamed: 0,Year,Source,Published,Content,Immigration
0,2016,CBS,2016-11-09,The Canadian government website for immigratio...,immigration
1,2016,CBS,2016-11-18,CHICAGO — Immigration hotlines are buzzing. Le...,immigration
2,2016,CBS,2016-11-09,As Donald Trump pulled off a stunning upset to...,immigration
3,2016,CBS,2016-11-09,"After casting their votes, Americans headed ho...",immigration
4,2016,CBS,2016-12-08,BERLIN -- German Chancellor Angela Merkel’s pa...,immigration
...,...,...,...,...,...
8212,2020,NPR,2020-09-13,Maria Hinojosa has dedicated her career to tel...,xenophobia
8213,2020,NPR,2020-01-26,The phone messaging app WhatsApp is beloved by...,xenophobia
8214,2020,NPR,2020-01-25,The senator and Democratic presidential hopefu...,xenophobia
8215,2020,NPR,2020-12-14,All four critics agreed that Michaela Coel's s...,xenophobia


### Clean data. Estimated time:
85 minutes on 8013 immigration articles

In [39]:
# call cleaning functions
immigration_data['Content'] = immigration_data['Content'].apply(lambda i:word_correct(i))
immigration_data['Content'] = immigration_data['Content'].apply(lambda i:lemmatize_word(i))
print('Done')

Done


### Get sentiment scores

In [40]:
# Call sentiment function (this takes a few minutes)
immigration_sentiment = get_sentiment_scores(immigration_data)
immigration_sentiment

Unnamed: 0,Year,Source,Published,Content,Immigration,SentimentScores
0,2016,CBS,2016-11-09,the canadian government website for immigratio...,immigration,"{'neg': 0.034, 'pos': 0.095}"
1,2016,CBS,2016-11-18,chicago immigration hotlines be buzzing legal ...,immigration,"{'neg': 0.044, 'pos': 0.062}"
2,2016,CBS,2016-11-09,a donald trump pull off a stun upset to win th...,immigration,"{'neg': 0.068, 'pos': 0.222}"
3,2016,CBS,2016-11-09,after cast their vote american head home on tu...,immigration,"{'neg': 0.093, 'pos': 0.048}"
4,2016,CBS,2016-12-08,berlin german chancellor angela merkels party ...,immigration,"{'neg': 0.055, 'pos': 0.144}"
...,...,...,...,...,...,...
8212,2020,NPR,2020-09-13,maria hinojosa have dedicate her career to tel...,xenophobia,"{'neg': 0.054, 'pos': 0.064}"
8213,2020,NPR,2020-01-26,the phone message app whatsapp be beloved by m...,xenophobia,"{'neg': 0.117, 'pos': 0.101}"
8214,2020,NPR,2020-01-25,the senator and democratic presidential hopefu...,xenophobia,"{'neg': 0.055, 'pos': 0.16}"
8215,2020,NPR,2020-12-14,all four critic agree that michaela coels seri...,xenophobia,"{'neg': 0.102, 'pos': 0.156}"


In [41]:
# Extract the sentiment scores from the 'SentimentScores' column to normalize them
immigration_sentiment['Norm_Neg'] = immigration_sentiment['SentimentScores'].apply(lambda x: x['neg'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
immigration_sentiment['Norm_Pos'] = immigration_sentiment['SentimentScores'].apply(lambda x: x['pos'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
immigration_sentiment

Unnamed: 0,Year,Source,Published,Content,Immigration,SentimentScores,Norm_Neg,Norm_Pos
0,2016,CBS,2016-11-09,the canadian government website for immigratio...,immigration,"{'neg': 0.034, 'pos': 0.095}",0.263566,0.736434
1,2016,CBS,2016-11-18,chicago immigration hotlines be buzzing legal ...,immigration,"{'neg': 0.044, 'pos': 0.062}",0.415094,0.584906
2,2016,CBS,2016-11-09,a donald trump pull off a stun upset to win th...,immigration,"{'neg': 0.068, 'pos': 0.222}",0.234483,0.765517
3,2016,CBS,2016-11-09,after cast their vote american head home on tu...,immigration,"{'neg': 0.093, 'pos': 0.048}",0.659574,0.340426
4,2016,CBS,2016-12-08,berlin german chancellor angela merkels party ...,immigration,"{'neg': 0.055, 'pos': 0.144}",0.276382,0.723618
...,...,...,...,...,...,...,...,...
8212,2020,NPR,2020-09-13,maria hinojosa have dedicate her career to tel...,xenophobia,"{'neg': 0.054, 'pos': 0.064}",0.457627,0.542373
8213,2020,NPR,2020-01-26,the phone message app whatsapp be beloved by m...,xenophobia,"{'neg': 0.117, 'pos': 0.101}",0.536697,0.463303
8214,2020,NPR,2020-01-25,the senator and democratic presidential hopefu...,xenophobia,"{'neg': 0.055, 'pos': 0.16}",0.255814,0.744186
8215,2020,NPR,2020-12-14,all four critic agree that michaela coels seri...,xenophobia,"{'neg': 0.102, 'pos': 0.156}",0.395349,0.604651


In [42]:
# run compound sentiment on All articles (this takes a few minutes)
immigration_all_sentiment = get_compound_sentiment(immigration_sentiment)
# Drop Sentiment Scores column
immigration_all_sentiment = immigration_sentiment.drop(['SentimentScores'], axis=1)
immigration_all_sentiment

Unnamed: 0,Year,Source,Published,Content,Immigration,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-09,the canadian government website for immigratio...,immigration,0.263566,0.736434,{'compound': 0.8934}
1,2016,CBS,2016-11-18,chicago immigration hotlines be buzzing legal ...,immigration,0.415094,0.584906,{'compound': 0.8447}
2,2016,CBS,2016-11-09,a donald trump pull off a stun upset to win th...,immigration,0.234483,0.765517,{'compound': 1.0}
3,2016,CBS,2016-11-09,after cast their vote american head home on tu...,immigration,0.659574,0.340426,{'compound': -0.982}
4,2016,CBS,2016-12-08,berlin german chancellor angela merkels party ...,immigration,0.276382,0.723618,{'compound': 0.9981}
...,...,...,...,...,...,...,...,...
8212,2020,NPR,2020-09-13,maria hinojosa have dedicate her career to tel...,xenophobia,0.457627,0.542373,{'compound': 0.9406}
8213,2020,NPR,2020-01-26,the phone message app whatsapp be beloved by m...,xenophobia,0.536697,0.463303,{'compound': -0.9744}
8214,2020,NPR,2020-01-25,the senator and democratic presidential hopefu...,xenophobia,0.255814,0.744186,{'compound': 0.9991}
8215,2020,NPR,2020-12-14,all four critic agree that michaela coels seri...,xenophobia,0.395349,0.604651,{'compound': 0.9998}


In [43]:
# Extract to compound value from the dictionary
# Update the "Compound" column
immigration_all_sentiment['Compound'] = immigration_all_sentiment['Compound'].apply(lambda x: x['compound'])

# Convert the "Compound" column to a numeric data type
immigration_all_sentiment['Compound'] = pd.to_numeric(immigration_all_sentiment['Compound'])

# Print the updated DataFrame
immigration_all_sentiment
# gives 8013 rows × 8 columns

Unnamed: 0,Year,Source,Published,Content,Immigration,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-09,the canadian government website for immigratio...,immigration,0.263566,0.736434,0.8934
1,2016,CBS,2016-11-18,chicago immigration hotlines be buzzing legal ...,immigration,0.415094,0.584906,0.8447
2,2016,CBS,2016-11-09,a donald trump pull off a stun upset to win th...,immigration,0.234483,0.765517,1.0000
3,2016,CBS,2016-11-09,after cast their vote american head home on tu...,immigration,0.659574,0.340426,-0.9820
4,2016,CBS,2016-12-08,berlin german chancellor angela merkels party ...,immigration,0.276382,0.723618,0.9981
...,...,...,...,...,...,...,...,...
8212,2020,NPR,2020-09-13,maria hinojosa have dedicate her career to tel...,xenophobia,0.457627,0.542373,0.9406
8213,2020,NPR,2020-01-26,the phone message app whatsapp be beloved by m...,xenophobia,0.536697,0.463303,-0.9744
8214,2020,NPR,2020-01-25,the senator and democratic presidential hopefu...,xenophobia,0.255814,0.744186,0.9991
8215,2020,NPR,2020-12-14,all four critic agree that michaela coels seri...,xenophobia,0.395349,0.604651,0.9998


# Load Socio_Eco articles

In [44]:
# Define a dictionary to map month names to their numeric values
month_dict = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Define the conversion function
def convert_article(article):
    source = article['source']
    
    published_date = pd.NaT  # Initialize with a "Not a Time" value
    if 'published_date' in article:
        published_date_data = article['published_date']
        year = published_date_data.get('year')
        month = month_dict.get(published_date_data.get('month', 'January'), 1)
        day = published_date_data.get('day')
        if year is not None and month is not None and day is not None:
            try:
                published_date = pd.Timestamp(year=int(year), month=int(month), day=int(day))
            except ValueError:
                pass
            
    content = article['article_content']
    bias = article['bias']

    return {
        'Source': source,
        'Published': published_date,
        'Content': content,
        'Socio_Eco': bias
    }

# Define a list to store the converted articles
all_articles = []

# Get a list of file paths matching the pattern
file_paths = glob.glob('socioeco_articles/socioeconomic_*.json')

# Loop through the file paths
for file_path in file_paths:
    # Load the JSON file
    with open(file_path) as file:
        json_data = json.load(file)
        articles = json_data['Articles']
        
        # Convert each article and append to the list
        for article in articles:
            converted_article = convert_article(article)
            all_articles.append(converted_article)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_articles)

# Convert the Published column to datetime
df['Published'] = pd.to_datetime(df['Published'])

# Fill missing month values in the Published column with 1
df['Published'].fillna(pd.to_datetime('2016-01-01'), inplace=True)

# Extract the year from the 'Published' column
df['Year'] = df['Published'].dt.year

# Print the resulting DataFrame
df

Unnamed: 0,Source,Published,Content,Socio_Eco,Year
0,CBS,2016-07-11,It has long been known that high-income people...,socioeconomic,2016
1,CBS,2016-11-10,"MENOMONIE, Wis. -- On election night, when Don...",poverty line,2016
2,CBS,2016-11-17,Vice President Joe Biden talked about why Demo...,working class,2016
3,CBS,2016-11-10,Sen. Bernie Sanders on Wednesday said he’s wil...,working class,2016
4,CBS,2016-11-09,NEW YORK -- The impending presidency of Donald...,working class,2016
...,...,...,...,...,...
2987,NPR,2020-01-29,"Overall, U.S. health spending is more than twi...",medicare,2020
2988,NPR,2020-01-24,Democratic senators were off the campaign trai...,medicare,2020
2989,NPR,2020-01-22,"Former Vice President Joe Biden, left, and Ver...",medicare,2020
2990,NPR,2020-01-14,As Democratic presidential candidates prepare ...,medicare,2020


In [45]:
# Check for NaN values in the DataFrame
nan_rows = df[df.isna().any(axis=1)]

# Print the rows with NaN values
print(nan_rows)

Empty DataFrame
Columns: [Source, Published, Content, Socio_Eco, Year]
Index: []


In [46]:
# Remove rows with empty "Content" column
df = df[df['Content'] != '']

# Print the updated DataFrame
df
# gives 

Unnamed: 0,Source,Published,Content,Socio_Eco,Year
0,CBS,2016-07-11,It has long been known that high-income people...,socioeconomic,2016
1,CBS,2016-11-10,"MENOMONIE, Wis. -- On election night, when Don...",poverty line,2016
2,CBS,2016-11-17,Vice President Joe Biden talked about why Demo...,working class,2016
3,CBS,2016-11-10,Sen. Bernie Sanders on Wednesday said he’s wil...,working class,2016
4,CBS,2016-11-09,NEW YORK -- The impending presidency of Donald...,working class,2016
...,...,...,...,...,...
2987,NPR,2020-01-29,"Overall, U.S. health spending is more than twi...",medicare,2020
2988,NPR,2020-01-24,Democratic senators were off the campaign trai...,medicare,2020
2989,NPR,2020-01-22,"Former Vice President Joe Biden, left, and Ver...",medicare,2020
2990,NPR,2020-01-14,As Democratic presidential candidates prepare ...,medicare,2020


In [47]:
# Check for duplicate rows with all column content the same
duplicate_rows = df.duplicated(keep=False)

# Print the duplicate rows
print(df[duplicate_rows])

     Source  Published                                            Content   
498     CBS 2020-02-24  Bernie Sanders had an impressive win last nigh...  \
499     CBS 2020-02-24  Bernie Sanders had an impressive win last nigh...   
1513    CNN 2020-03-04  A reinvigorated Joe Biden has a nine state Sup...   
1515    CNN 2020-03-04  A reinvigorated Joe Biden has a nine state Sup...   

          Socio_Eco  Year  
498   working class  2020  
499   working class  2020  
1513  working class  2020  
1515  working class  2020  


In [48]:
# Remove duplicate rows when present
df = df.drop_duplicates()

# Print the DataFrame without duplicates
df
# gives 2957 articles

Unnamed: 0,Source,Published,Content,Socio_Eco,Year
0,CBS,2016-07-11,It has long been known that high-income people...,socioeconomic,2016
1,CBS,2016-11-10,"MENOMONIE, Wis. -- On election night, when Don...",poverty line,2016
2,CBS,2016-11-17,Vice President Joe Biden talked about why Demo...,working class,2016
3,CBS,2016-11-10,Sen. Bernie Sanders on Wednesday said he’s wil...,working class,2016
4,CBS,2016-11-09,NEW YORK -- The impending presidency of Donald...,working class,2016
...,...,...,...,...,...
2987,NPR,2020-01-29,"Overall, U.S. health spending is more than twi...",medicare,2020
2988,NPR,2020-01-24,Democratic senators were off the campaign trai...,medicare,2020
2989,NPR,2020-01-22,"Former Vice President Joe Biden, left, and Ver...",medicare,2020
2990,NPR,2020-01-14,As Democratic presidential candidates prepare ...,medicare,2020


In [50]:
# Arrange columns: Move the Year column to the front
new_cols = ['Year', 'Source', 'Published', 'Content', 'Socio_Eco']
socio_data = df.reindex(columns=new_cols)
socio_data
# gives 2957 rows 5 columns

Unnamed: 0,Year,Source,Published,Content,Socio_Eco
0,2016,CBS,2016-07-11,It has long been known that high-income people...,socioeconomic
1,2016,CBS,2016-11-10,"MENOMONIE, Wis. -- On election night, when Don...",poverty line
2,2016,CBS,2016-11-17,Vice President Joe Biden talked about why Demo...,working class
3,2016,CBS,2016-11-10,Sen. Bernie Sanders on Wednesday said he’s wil...,working class
4,2016,CBS,2016-11-09,NEW YORK -- The impending presidency of Donald...,working class
...,...,...,...,...,...
2987,2020,NPR,2020-01-29,"Overall, U.S. health spending is more than twi...",medicare
2988,2020,NPR,2020-01-24,Democratic senators were off the campaign trai...,medicare
2989,2020,NPR,2020-01-22,"Former Vice President Joe Biden, left, and Ver...",medicare
2990,2020,NPR,2020-01-14,As Democratic presidential candidates prepare ...,medicare


### Clean the data: Estimated time
33 minutes on 2957 socio economic articles

In [51]:
# call cleaning functions
socio_data['Content'] = socio_data['Content'].apply(lambda i:word_correct(i))
socio_data['Content'] = socio_data['Content'].apply(lambda i:lemmatize_word(i))
print('Done')

Done


### Get sentiment scores

In [52]:
# Call sentiment function (This takes a few minutes)
socio_sentiment = get_sentiment_scores(socio_data)
socio_sentiment

Unnamed: 0,Year,Source,Published,Content,Socio_Eco,SentimentScores
0,2016,CBS,2016-07-11,it have long be know that highincome people in...,socioeconomic,"{'neg': 0.167, 'pos': 0.102}"
1,2016,CBS,2016-11-10,menomonie wi on election night when donald tru...,poverty line,"{'neg': 0.074, 'pos': 0.134}"
2,2016,CBS,2016-11-17,vice president joe biden talk about why democr...,working class,"{'neg': 0.058, 'pos': 0.075}"
3,2016,CBS,2016-11-10,sen bernie sander on wednesday say he willing ...,working class,"{'neg': 0.095, 'pos': 0.153}"
4,2016,CBS,2016-11-09,new york the impend presidency of donald trump...,working class,"{'neg': 0.071, 'pos': 0.183}"
...,...,...,...,...,...,...
2987,2020,NPR,2020-01-29,overall u health spending be more than twice t...,medicare,"{'neg': 0.053, 'pos': 0.126}"
2988,2020,NPR,2020-01-24,democratic senator be off the campaign trail w...,medicare,"{'neg': 0.035, 'pos': 0.129}"
2989,2020,NPR,2020-01-22,former vice president joe biden left and vermo...,medicare,"{'neg': 0.119, 'pos': 0.119}"
2990,2020,NPR,2020-01-14,a democratic presidential candidate prepare to...,medicare,"{'neg': 0.028, 'pos': 0.092}"


In [53]:
# Extract the sentiment scores from the 'SentimentScores' column to normalize them
socio_sentiment['Norm_Neg'] = socio_sentiment['SentimentScores'].apply(lambda x: x['neg'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
socio_sentiment['Norm_Pos'] = socio_sentiment['SentimentScores'].apply(lambda x: x['pos'] / (x['neg'] + x['pos']) if (x['neg'] + x['pos']) != 0 else 0)
socio_sentiment

Unnamed: 0,Year,Source,Published,Content,Socio_Eco,SentimentScores,Norm_Neg,Norm_Pos
0,2016,CBS,2016-07-11,it have long be know that highincome people in...,socioeconomic,"{'neg': 0.167, 'pos': 0.102}",0.620818,0.379182
1,2016,CBS,2016-11-10,menomonie wi on election night when donald tru...,poverty line,"{'neg': 0.074, 'pos': 0.134}",0.355769,0.644231
2,2016,CBS,2016-11-17,vice president joe biden talk about why democr...,working class,"{'neg': 0.058, 'pos': 0.075}",0.436090,0.563910
3,2016,CBS,2016-11-10,sen bernie sander on wednesday say he willing ...,working class,"{'neg': 0.095, 'pos': 0.153}",0.383065,0.616935
4,2016,CBS,2016-11-09,new york the impend presidency of donald trump...,working class,"{'neg': 0.071, 'pos': 0.183}",0.279528,0.720472
...,...,...,...,...,...,...,...,...
2987,2020,NPR,2020-01-29,overall u health spending be more than twice t...,medicare,"{'neg': 0.053, 'pos': 0.126}",0.296089,0.703911
2988,2020,NPR,2020-01-24,democratic senator be off the campaign trail w...,medicare,"{'neg': 0.035, 'pos': 0.129}",0.213415,0.786585
2989,2020,NPR,2020-01-22,former vice president joe biden left and vermo...,medicare,"{'neg': 0.119, 'pos': 0.119}",0.500000,0.500000
2990,2020,NPR,2020-01-14,a democratic presidential candidate prepare to...,medicare,"{'neg': 0.028, 'pos': 0.092}",0.233333,0.766667


In [54]:
# run compound sentiment on All articles (This takes a few minutes)
socio_all_sentiment = get_compound_sentiment(socio_sentiment)
# Drop Sentiment Scores column
socio_all_sentiment = socio_sentiment.drop(['SentimentScores'], axis=1)
socio_all_sentiment

Unnamed: 0,Year,Source,Published,Content,Socio_Eco,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-07-11,it have long be know that highincome people in...,socioeconomic,0.620818,0.379182,{'compound': -0.9988}
1,2016,CBS,2016-11-10,menomonie wi on election night when donald tru...,poverty line,0.355769,0.644231,{'compound': 0.9973}
2,2016,CBS,2016-11-17,vice president joe biden talk about why democr...,working class,0.436090,0.563910,{'compound': -0.0272}
3,2016,CBS,2016-11-10,sen bernie sander on wednesday say he willing ...,working class,0.383065,0.616935,{'compound': 0.9777}
4,2016,CBS,2016-11-09,new york the impend presidency of donald trump...,working class,0.279528,0.720472,{'compound': 0.946}
...,...,...,...,...,...,...,...,...
2987,2020,NPR,2020-01-29,overall u health spending be more than twice t...,medicare,0.296089,0.703911,{'compound': 0.9981}
2988,2020,NPR,2020-01-24,democratic senator be off the campaign trail w...,medicare,0.213415,0.786585,{'compound': 0.9996}
2989,2020,NPR,2020-01-22,former vice president joe biden left and vermo...,medicare,0.500000,0.500000,{'compound': -0.8856}
2990,2020,NPR,2020-01-14,a democratic presidential candidate prepare to...,medicare,0.233333,0.766667,{'compound': 0.994}


In [55]:
# Extract to compound value from the dictionary
# Update the "Compound" column
socio_all_sentiment['Compound'] = socio_all_sentiment['Compound'].apply(lambda x: x['compound'])

# Convert the "Compound" column to a numeric data type
socio_all_sentiment['Compound'] = pd.to_numeric(socio_all_sentiment['Compound'])

# Print the updated DataFrame
socio_all_sentiment
# gives 2957 rows × 8 columns

Unnamed: 0,Year,Source,Published,Content,Socio_Eco,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-07-11,it have long be know that highincome people in...,socioeconomic,0.620818,0.379182,-0.9988
1,2016,CBS,2016-11-10,menomonie wi on election night when donald tru...,poverty line,0.355769,0.644231,0.9973
2,2016,CBS,2016-11-17,vice president joe biden talk about why democr...,working class,0.436090,0.563910,-0.0272
3,2016,CBS,2016-11-10,sen bernie sander on wednesday say he willing ...,working class,0.383065,0.616935,0.9777
4,2016,CBS,2016-11-09,new york the impend presidency of donald trump...,working class,0.279528,0.720472,0.9460
...,...,...,...,...,...,...,...,...
2987,2020,NPR,2020-01-29,overall u health spending be more than twice t...,medicare,0.296089,0.703911,0.9981
2988,2020,NPR,2020-01-24,democratic senator be off the campaign trail w...,medicare,0.213415,0.786585,0.9996
2989,2020,NPR,2020-01-22,former vice president joe biden left and vermo...,medicare,0.500000,0.500000,-0.8856
2990,2020,NPR,2020-01-14,a democratic presidential candidate prepare to...,medicare,0.233333,0.766667,0.9940


### Combine all the dataframes before loading into postgress

In [56]:
# combine all articles in one df
combine_df = pd.concat([abortion_all_sentiment, race_all_sentiment, immigration_all_sentiment, socio_all_sentiment], ignore_index=True, keys=['Abortion', 'Race', 'Immigration', 'Socio_Eco'])
combine_df

Unnamed: 0,Year,Source,Published,Content,Abortion,Norm_Neg,Norm_Pos,Compound,Race,Immigration,Socio_Eco
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,0.666667,0.333333,-0.9989,,,
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,0.455696,0.544304,0.8768,,,
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,0.595890,0.404110,-0.9698,,,
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,0.410448,0.589552,0.9830,,,
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,0.301980,0.698020,0.9893,,,
...,...,...,...,...,...,...,...,...,...,...,...
15909,2020,NPR,2020-01-29,overall u health spending be more than twice t...,,0.296089,0.703911,0.9981,,,medicare
15910,2020,NPR,2020-01-24,democratic senator be off the campaign trail w...,,0.213415,0.786585,0.9996,,,medicare
15911,2020,NPR,2020-01-22,former vice president joe biden left and vermo...,,0.500000,0.500000,-0.8856,,,medicare
15912,2020,NPR,2020-01-14,a democratic presidential candidate prepare to...,,0.233333,0.766667,0.9940,,,medicare


In [58]:
# Arrange columns: Move the Year column to the front
new_cols = ['Year', 'Source', 'Published', 'Content', 'Abortion', 'Race', 'Immigration', 'Socio_Eco', 'Norm_Neg', 'Norm_Pos', 'Compound']
articles_df = combine_df.reindex(columns=new_cols)
articles_df
# gives 15914 rows 5 columns

Unnamed: 0,Year,Source,Published,Content,Abortion,Race,Immigration,Socio_Eco,Norm_Neg,Norm_Pos,Compound
0,2016,CBS,2016-11-16,in the wake of the election the antidefamation...,planned parenthood,,,,0.666667,0.333333,-0.9989
1,2016,CBS,2016-12-15,u intelligence source be not just point the fi...,planned parenthood,,,,0.455696,0.544304,0.8768
2,2017,CBS,2017-06-20,tuesday special election in georgia 6th congre...,planned parenthood,,,,0.595890,0.404110,-0.9698
3,2017,CBS,2017-06-22,senate republican unveiled a discussion draft ...,planned parenthood,,,,0.410448,0.589552,0.9830
4,2017,CBS,2017-04-04,chelsea clinton on tuesday express doubt that ...,planned parenthood,,,,0.301980,0.698020,0.9893
...,...,...,...,...,...,...,...,...,...,...,...
15909,2020,NPR,2020-01-29,overall u health spending be more than twice t...,,,,medicare,0.296089,0.703911,0.9981
15910,2020,NPR,2020-01-24,democratic senator be off the campaign trail w...,,,,medicare,0.213415,0.786585,0.9996
15911,2020,NPR,2020-01-22,former vice president joe biden left and vermo...,,,,medicare,0.500000,0.500000,-0.8856
15912,2020,NPR,2020-01-14,a democratic presidential candidate prepare to...,,,,medicare,0.233333,0.766667,0.9940


# Load into PostgresSQL

In [59]:
# Replace the values below with your database credentials
DATABASE = "sentimentdb"
USER = "postgres"
PASSWORD = "Spanques&Gum6"
HOST = "localhost"
PORT = "5432"

# Connect to your PostgreSQL database
conn = psycopg2.connect(
    database=DATABASE,
    user=USER,
    password=PASSWORD,
    host=HOST,
    port=PORT
)

# Create a sqlalchemy engine
engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}")

articles_df.to_sql("articles", engine, if_exists="append", index=False)
print("Article table loaded successfully")

# Commit the changes to the database
conn.commit()

# Close the database connection
conn.close()

Article table loaded successfully


## View the data

In [61]:
# Replace the values below with your database credentials
DATABASE = "sentimentdb"
USER = "postgres"
PASSWORD = "Spanques&Gum6"
HOST = "localhost"
PORT = "5432"

# Connect to the PostgreSQL database
conn = psycopg2.connect(database=DATABASE, user=USER, password=PASSWORD, host=HOST, port=PORT)

# Query the table and store the results in a Pandas dataframe
df = pd.read_sql_query("SELECT * FROM articles", conn)

# Print the dataframe to verify that it contains data
print(df)

# Close the database connection
conn.close()
# gives [15914 rows x 11 columns] for all articles

  df = pd.read_sql_query("SELECT * FROM articles", conn)


       Year Source  Published   
0      2016    CBS 2016-11-16  \
1      2016    CBS 2016-12-15   
2      2017    CBS 2017-06-20   
3      2017    CBS 2017-06-22   
4      2017    CBS 2017-04-04   
...     ...    ...        ...   
15909  2020    NPR 2020-04-13   
15910  2020    NPR 2020-04-11   
15911  2020    NPR 2020-04-09   
15912  2020    NPR 2020-04-08   
15913  2020    NPR 2020-04-08   

                                                 Content            Abortion   
0      in the wake of the election the antidefamation...  planned parenthood  \
1      u intelligence source be not just point the fi...  planned parenthood   
2      tuesday special election in georgia 6th congre...  planned parenthood   
3      senate republican unveiled a discussion draft ...  planned parenthood   
4      chelsea clinton on tuesday express doubt that ...  planned parenthood   
...                                                  ...                 ...   
15909  sen bernie sander endorse his former

# Postgress Queries

To view the articles in postgres, it is advised to remove the 'Content' column since the articles are so long. The query might go like this.  
<code>
SELECT a."Year", a."Source", a."Published", a."Abortion", a."Norm_Neg", a."Norm_Pos", a."Compound"
FROM articles AS a
LIMIT 5;
</code>  

It is necessary to put the column name in quotations because they are capitalized column names.  
a."Year", a."Source", a."Published", a."Abortion", a."Race", a."Immigration", a."Socio_Eco", a."Norm_Neg", a."Norm_Pos", a."Compound"

The sentimentdb is 86 MB and the electiondb is 11 MB. Use this code to view size.  
<code>SELECT pg_size_pretty(pg_database_size('sentimentdb')) AS size;</code>
  
The articles table has 15914 rows.  
<code>SELECT COUNT(\*) FROM articles;</code>   

The twitter table is made from several notebooks organized by years. 
The twitter table has 21871 rows.   
<code>SELECT COUNT(\*) FROM twitter;</code>  