# importing Data and Loading Submission File

In [1]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\PresidentialRaceMemes_submissions'

# Call the function to read the zst folder into a DataFra
df_submissions = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


# Now you can work with the DataFrame (df)
df_submissions = df_submissions[['author', 'id','name', 'created_utc', 'subreddit_id', 'permalink', 'title', 'selftext', 'url', 'retrieved_on']]
df_submissions.head()

  from pandas.core import (


Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
0,[deleted],c6faxh,,1561693685,t5_17rom5,/r/PresidentialRaceMemes/comments/c6faxh/presidentialracememes_has_been_created/,PresidentialRaceMemes has been created,[deleted],https://www.reddit.com/r/PresidentialRaceMemes/comments/c6faxh/presidentialracememes_has_been_created/,1566774000.0
1,AlarmedScholar,c6fcl1,,1561693980,t5_17rom5,/r/PresidentialRaceMemes/comments/c6fcl1/no_one_marianne_williamson/,No one: ... Marianne Williamson:,,https://i.redd.it/8lpnaxncs0731.jpg,1566774000.0
2,AlarmedScholar,c6fefs,,1561694295,t5_17rom5,/r/PresidentialRaceMemes/comments/c6fefs/joe_biden_when_swalwell_asks_him_to_pass_the_torch/,Joe Biden when Swalwell asks him to pass the torch:,,https://i.redd.it/7yjwk9ydt0731.png,1566774000.0
3,AlarmedScholar,c6ffc7,,1561694448,t5_17rom5,/r/PresidentialRaceMemes/comments/c6ffc7/cocks_gun/,Cocks gun,,https://i.redd.it/6oxmbf9ut0731.jpg,1566774000.0
4,AlarmedScholar,c6ffz4,,1561694541,t5_17rom5,/r/PresidentialRaceMemes/comments/c6ffz4/williamson_with_the_wild_card/,Williamson with the wild card,,https://i.redd.it/akw72411u0731.jpg,1566774000.0


In [2]:
df_submissions.tail()

Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
39034,[deleted],zsyyli,t3_zsyyli,1671748489,t5_17rom5,/r/PresidentialRaceMemes/comments/zsyyli/message_from_your_whatever_you_wanna_call_this/,message from your whatever you wanna call this guy... not sure yet,,https://v.redd.it/45oxvfi11j7a1,1673176000.0
39035,Downtown-Dinner4996,zszeee,t3_zszeee,1671749624,t5_17rom5,/r/PresidentialRaceMemes/comments/zszeee/ah_ha_ha_ha/,Ah ha ha ha 😂😭,,https://i.redd.it/1t2gmgwe4j7a1.png,1673176000.0
39036,CorrectTank9957,ztciqs,t3_ztciqs,1671792871,t5_17rom5,/r/PresidentialRaceMemes/comments/ztciqs/democrat_voters_uhhh_guys_how_did_we_end_up_here/,"Democrat voters: uhhh, guys, how did we end up here?",,https://i.redd.it/0mshmg20pm7a1.png,1673176000.0
39037,ForceAffectionate379,ztvw31,t3_ztvw31,1671838273,t5_17rom5,/r/PresidentialRaceMemes/comments/ztvw31/if_joe_somehow_wins_i_will_drink_a_smoothie_full/,If Joe somehow wins I will drink a smoothie full of Corn pops and Sriracha,,https://i.redd.it/kp8o76e0gq7a1.png,1673175000.0
39038,[deleted],zv8vx3,t3_zv8vx3,1672008933,t5_17rom5,/r/PresidentialRaceMemes/comments/zv8vx3/weed/,Weed,[removed],,1673174000.0


In [3]:
df_submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39039 entries, 0 to 39038
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   author        39039 non-null  object 
 1   id            39039 non-null  object 
 2   name          17022 non-null  object 
 3   created_utc   39039 non-null  int64  
 4   subreddit_id  39039 non-null  object 
 5   permalink     39039 non-null  object 
 6   title         39039 non-null  object 
 7   selftext      39039 non-null  object 
 8   url           39031 non-null  object 
 9   retrieved_on  22560 non-null  float64
dtypes: float64(1), int64(1), object(8)
memory usage: 3.0+ MB


# Importing Data and Loading Comment File

In [4]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\PresidentialRaceMemes_comments'
# Call the function to read the zst folder into a DataFrame
df_comments = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)

# Now you can work with the DataFrame (df)
# df_comments.head()
df_comments = df_comments[['id', 'created_utc', 'subreddit', 'link_id', 'name', 'subreddit_id',
       'parent_id', 'retrieved_on', 'author', 'body', 'score']]
df_comments.head()


Unnamed: 0,id,created_utc,subreddit,link_id,name,subreddit_id,parent_id,retrieved_on,author,body,score
0,es8ozgy,1561707049,PresidentialRaceMemes,t3_c6fmh7,,t5_17rom5,t3_c6fmh7,1571047000.0,AnthonyTops,I think he actually did pretty well,3
1,es8uahl,1561715161,PresidentialRaceMemes,t3_c6fwkr,,t5_17rom5,t3_c6fwkr,1571050000.0,rrr598,spicy meme,3
2,es8xwwv,1561720304,PresidentialRaceMemes,t3_c6fm0i,,t5_17rom5,t3_c6fm0i,1571051000.0,Kernel_Forbin,Including N1: https://i.imgur.com/yyfr3JB.png\n\nSource: https://www.washingtonpost.com/graphics/2019/politics/who-spoke-most-at-democratic-debate-june/?utm_term=.89c5a80af260,19
3,es8y6rr,1561720649,PresidentialRaceMemes,t3_c6fgox,,t5_17rom5,t3_c6fgox,1571052000.0,Philatelismisdead,"This is exactly what happened, I'm sure haha",79
4,es8zh8e,1561722145,PresidentialRaceMemes,t3_c6fkoy,,t5_17rom5,t3_c6fkoy,1571052000.0,ImproveEveryDay1982,he's actually a really good president show candidate the only problem is he's not presidential in the way that he carries himself,35


In [5]:
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840663 entries, 0 to 840662
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            840663 non-null  object 
 1   created_utc   840663 non-null  int64  
 2   subreddit     840663 non-null  object 
 3   link_id       840663 non-null  object 
 4   name          25670 non-null   object 
 5   subreddit_id  840663 non-null  object 
 6   parent_id     840663 non-null  object 
 7   retrieved_on  825261 non-null  float64
 8   author        840663 non-null  object 
 9   body          840663 non-null  object 
 10  score         840663 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 70.6+ MB


In [6]:
# Merge final_subs_df and final_comments_df on "name" and "link_id"
df = df_submissions.merge(df_comments, how='inner', left_on='name', right_on='link_id')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389592 entries, 0 to 389591
Data columns (total 21 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   author_x        389592 non-null  object 
 1   id_x            389592 non-null  object 
 2   name_x          389592 non-null  object 
 3   created_utc_x   389592 non-null  int64  
 4   subreddit_id_x  389592 non-null  object 
 5   permalink       389592 non-null  object 
 6   title           389592 non-null  object 
 7   selftext        389592 non-null  object 
 8   url             389582 non-null  object 
 9   retrieved_on_x  10091 non-null   float64
 10  id_y            389592 non-null  object 
 11  created_utc_y   389592 non-null  int64  
 12  subreddit       389592 non-null  object 
 13  link_id         389592 non-null  object 
 14  name_y          25625 non-null   object 
 15  subreddit_id_y  389592 non-null  object 
 16  parent_id       389592 non-null  object 
 17  retrieved_

In [7]:
final_df = df.copy()

# Convert timestamp columns to datetime format
timestamp_columns = ['created_utc_x', 'created_utc_y']
for col in timestamp_columns:
    final_df[col] = pd.to_datetime(final_df[col], unit='s')
    
final_df.head()    

Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,retrieved_on_x,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,retrieved_on_y,author_y,body,score
0,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,,fp4b5ez,2020-05-01 01:09:03,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,1594725000.0,allan11011,But what about the other one?,20
1,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,,fp4b78g,2020-05-01 01:09:32,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,1594725000.0,HomephoneProductions,"I wish I didn't hate Trump, because I really want to be working on his campaign right now. There's so much stuff on Biden to go absolutely apeshit on him for.",36
2,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,,fp4sixx,2020-05-01 04:11:17,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,1594733000.0,dirigibalistic,An Obiden-BAMA Democrat,12
3,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,,fp50ceq,2020-05-01 05:52:19,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,1594737000.0,[deleted],[deleted],1
4,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,,fp58ln2,2020-05-01 08:01:51,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t1_fp4b78g,1594741000.0,Shopping_Penguin,Thats the problem with the two party system. Joe Biden and Trump are all beholden to their donors.\n\nIts a big club and you aren't in it jack.,19


In [8]:
final_df = final_df.drop(['retrieved_on_x', 'retrieved_on_y'], axis=1)

final_df.head()

Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,author_y,body,score
0,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,fp4b5ez,2020-05-01 01:09:03,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,allan11011,But what about the other one?,20
1,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,fp4b78g,2020-05-01 01:09:32,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,HomephoneProductions,"I wish I didn't hate Trump, because I really want to be working on his campaign right now. There's so much stuff on Biden to go absolutely apeshit on him for.",36
2,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,fp4sixx,2020-05-01 04:11:17,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,dirigibalistic,An Obiden-BAMA Democrat,12
3,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,fp50ceq,2020-05-01 05:52:19,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t3_gb8ln1,[deleted],[deleted],1
4,Leaf-Currency,gb8ln1,t3_gb8ln1,2020-05-01 01:08:04,t5_17rom5,/r/PresidentialRaceMemes/comments/gb8ln1/joe_biden_is_joe_biden_says_joe_biden_trying_his/,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,https://i.redd.it/inhgujhsz1w41.jpg,fp58ln2,2020-05-01 08:01:51,PresidentialRaceMemes,t3_gb8ln1,,t5_17rom5,t1_fp4b78g,Shopping_Penguin,Thats the problem with the two party system. Joe Biden and Trump are all beholden to their donors.\n\nIts a big club and you aren't in it jack.,19


In [9]:
final_df = final_df.drop(['author_x', 'id_x', 'name_x', 'subreddit_id_x',
                          'permalink', 'url', 'id_y', 'name_y', 'subreddit_id_y',
                          'parent_id', 'author_y'], axis=1)


In [10]:
final_df.columns

Index(['created_utc_x', 'title', 'selftext', 'created_utc_y', 'subreddit',
       'link_id', 'body', 'score'],
      dtype='object')

In [11]:
import re
import pandas as pd

# Define the dictionary mapping variations to the standard form
name_variations = {
  r'\b(?:george\s+w\.\s+bush|bush|george(?:\s+w\.\s+bush)?)\b': 'George W. Bush',
  r'\b(?:barack\s+obama|obama|barack(?:\s+obama)?)\b': 'Barack Obama',
  r'\b(?:bill\s+clinton|clinton|bill(?:\s+clinton)?)\b': 'Bill Clinton',
  r'\b(?:donald\s+trump|trump|donald(?:\s+trump)?)\b': 'Donald Trump',
  r'\b(?:joe\s+biden|biden|joe(?:\s+biden)?)\b': 'Joe Biden',  
}


# Function to replace variations with standard names
def standardize_names(text):
    for pattern, standard_name in name_variations.items():
        text = re.sub(pattern, standard_name, text, flags=re.IGNORECASE)
    return text


In [12]:
# Import necessary libraries
import spacy
from spacy import displacy
import pandas as pd
import re
from textblob import TextBlob

# Download the small English model for spaCy
spacy.cli.download("en_core_web_sm")

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract persons (already standardized)
def extract_persons(text):
    # Combine all standardized names into one pattern
    pattern = r'\b(?:' + '|'.join([re.escape(name) for name in name_variations.values()]) + r')\b'
    # Find all matches of persons' names in the text
    persons_found = re.findall(pattern, text, flags=re.IGNORECASE)
    # Return unique persons' names
    return list(set(persons_found))



# Apply the extract_persons function to 'title', 'selftext', and 'body' columns
final_df['persons_title'] = final_df['title'].apply(extract_persons)
final_df['persons_selftext'] = final_df['selftext'].apply(extract_persons)
final_df['persons_body'] = final_df['body'].apply(extract_persons)

# Filter the DataFrame to include only rows where either 'persons_title' or 'persons_selftext' or 'persons_body' is not empty
filtered_df = final_df[(final_df['persons_title'].apply(len) > 0) | 
                       (final_df['persons_selftext'].apply(len) > 0) |
                       (final_df['persons_body'].apply(len) > 0)]

# # Compute sentiments for 'title', 'selftext', and 'body'
# filtered_df['sentiment_title'] = filtered_df['title'].apply(get_sentiment)
# filtered_df['sentiment_selftext'] = filtered_df['selftext'].apply(get_sentiment)
# filtered_df['sentiment_body'] = filtered_df['body'].apply(get_sentiment)

# Display the filtered DataFrame with sentiments
filtered_df.head()


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,created_utc_x,title,selftext,created_utc_y,subreddit,link_id,body,score,persons_title,persons_selftext,persons_body
0,2020-05-01 01:08:04,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,2020-05-01 01:09:03,PresidentialRaceMemes,t3_gb8ln1,But what about the other one?,20,[Joe Biden],[],[]
1,2020-05-01 01:08:04,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,2020-05-01 01:09:32,PresidentialRaceMemes,t3_gb8ln1,"I wish I didn't hate Trump, because I really want to be working on his campaign right now. There's so much stuff on Biden to go absolutely apeshit on him for.",36,[Joe Biden],[],[]
2,2020-05-01 01:08:04,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,2020-05-01 04:11:17,PresidentialRaceMemes,t3_gb8ln1,An Obiden-BAMA Democrat,12,[Joe Biden],[],[]
3,2020-05-01 01:08:04,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,2020-05-01 05:52:19,PresidentialRaceMemes,t3_gb8ln1,[deleted],1,[Joe Biden],[],[]
4,2020-05-01 01:08:04,"""Joe Biden is Joe Biden"", says Joe Biden, trying his best not to forget",,2020-05-01 08:01:51,PresidentialRaceMemes,t3_gb8ln1,Thats the problem with the two party system. Joe Biden and Trump are all beholden to their donors.\n\nIts a big club and you aren't in it jack.,19,[Joe Biden],[],[Joe Biden]


In [13]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13809 entries, 0 to 389565
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_utc_x     13809 non-null  datetime64[ns]
 1   title             13809 non-null  object        
 2   selftext          13809 non-null  object        
 3   created_utc_y     13809 non-null  datetime64[ns]
 4   subreddit         13809 non-null  object        
 5   link_id           13809 non-null  object        
 6   body              13809 non-null  object        
 7   score             13809 non-null  int64         
 8   persons_title     13809 non-null  object        
 9   persons_selftext  13809 non-null  object        
 10  persons_body      13809 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 1.3+ MB


In [14]:
filtered_df.to_csv('C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\PresidentialRaceMemes_ner.csv')