# Importing Data and Loading Submission File

In [1]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    """Load every ``.zst`` newline-delimited JSON file in a folder into one DataFrame.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan (non-recursively). Only entries whose name ends
        with ``.zst`` are read; every non-blank line of a decompressed file
        is parsed as one JSON document.

    Returns
    -------
    pandas.DataFrame
        The records of all files, concatenated with a fresh integer index.
        An empty DataFrame is returned when the folder holds no ``.zst``
        files (``pd.concat`` would otherwise raise ``ValueError``).

    Raises
    ------
    OSError
        If ``folder_path`` cannot be listed or a file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    import io  # local import: only needed for the streaming text wrapper below

    dfs = []

    # Sort the listing so row order does not depend on the OS directory order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".zst"):
            continue
        file_path = os.path.join(folder_path, file_name)

        with open(file_path, 'rb') as f:
            # A generous window limit lets frames that were compressed with a
            # long window be decoded instead of being rejected.
            decompressor = zstandard.ZstdDecompressor(max_window_size=2**31)
            reader = decompressor.stream_reader(f)
            # Decode and parse line by line rather than materialising the whole
            # decompressed file as bytes, then str, then a list of lines.
            # newline='\n' keeps the original "split on \n only" semantics.
            with io.TextIOWrapper(reader, encoding='utf-8', newline='\n') as text_stream:
                data = [json.loads(line) for line in text_stream if line.strip()]

        dfs.append(pd.DataFrame(data))

    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)

# Folder containing the .zst submission dumps.
# Every backslash is doubled: the previous literal contained a lone "\D",
# which is an invalid escape sequence (SyntaxWarning on recent Python).
folder_path = 'C:\\Users\\HP\\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\Presidentialpoll_submissions'

# Read every .zst file in the folder into a single DataFrame
df_submissions = read_zst_folder(folder_path)

# Show all columns and never truncate cell text when rendering DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Keep only the submission fields of interest
df_submissions = df_submissions[['author', 'id','name', 'created_utc', 'subreddit_id', 'permalink', 'title', 'selftext', 'url', 'retrieved_on']]
df_submissions.head()

  from pandas.core import (


Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
0,Sokol84,kii7hl,t3_kii7hl,1608684120,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,
1,SupremeLeader-Snoke,kiipl8,t3_kiipl8,1608685793,t5_3ls3yv,/r/Presidentialpoll/comments/kiipl8/most_corrupt_president/,Most corrupt president?,\n\n[View Poll](https://www.reddit.com/poll/kiipl8),https://www.reddit.com/r/Presidentialpoll/comments/kiipl8/most_corrupt_president/,
2,SupremeLeader-Snoke,kijx6b,t3_kijx6b,1608689907,t5_3ls3yv,/r/Presidentialpoll/comments/kijx6b/these_3_are_generally_considered_the_worst/,These 3 are generally considered the worst consecutive presidents ever but who is the least bad ( or good is you have a very unpopular opinion ),\n\n[View Poll](https://www.reddit.com/poll/kijx6b),https://www.reddit.com/r/Presidentialpoll/comments/kijx6b/these_3_are_generally_considered_the_worst/,
3,Sokol84,kikt5q,t3_kikt5q,1608693103,t5_3ls3yv,/r/Presidentialpoll/comments/kikt5q/other_posts/,Other posts,"Although Polls are the primary purpose for creating this subreddit, other types of posts are allowed too.",https://www.reddit.com/r/Presidentialpoll/comments/kikt5q/other_posts/,
4,SupremeLeader-Snoke,kil1ik,t3_kil1ik,1608693917,t5_3ls3yv,/r/Presidentialpoll/comments/kil1ik/best_president_that_prevented_civil_war/,Best president that prevented Civil War,Civil War almost happened a bunch of times before it happened and once afterwards. Which president handled the threat of civil war the best?\n\n[View Poll](https://www.reddit.com/poll/kil1ik),https://www.reddit.com/r/Presidentialpoll/comments/kil1ik/best_president_that_prevented_civil_war/,


In [2]:
# Preview the last rows of the submissions table
df_submissions.tail()

Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
19224,Shamrock590602,zzxadk,t3_zzxadk,1672502762,t5_3ls3yv,/r/Presidentialpoll/comments/zzxadk/future_series_2032/,Future series 2032.,\n\n[View Poll](https://www.reddit.com/poll/zzxadk),https://www.reddit.com/r/Presidentialpoll/comments/zzxadk/future_series_2032/,1673168000.0
19225,Peacock-Raj,1000v9b,t3_1000v9b,1672512650,t5_3ls3yv,/r/Presidentialpoll/comments/1000v9b/william_morton_wheeler_1932_let_a_people/,William Morton Wheeler 1932! Let a people enlightened with the knowledge of the human superorganism overthrow the tyranny of the idle elements of society! | A House Divided,,https://freeimage.host/i/Hu7zc4s,1673168000.0
19226,Thunderousclaps,10035l2,t3_10035l2,1672519257,t5_3ls3yv,/r/Presidentialpoll/comments/10035l2/and_so_shall_our_liberty_be_secured_discord/,"And so, shall our liberty be secured. | Discord Alternate Elections","This time things went the wall General O'Mahoney wanted, after a relatively short debate with President Stephens and Secretary of war Sweeny he proposed to send troops from County Leitrim all the way up to Antrim, where Belfast is located, it was certainly a risky move, a gambit that could finish the war, but also one that was very dangerous, morale had certainly begun to lower among even the Orange Loyalists, but they still were proud supporters of the Crown, and while they certainly had fewer British soldiers in their territory, most had been transfered towards Dublin as the riots continued growing thanks to the continuous defeat of the British Army, the citizens in Ulster were certainly supportive of the Protestant United Kingdom, they were open on what they tought of the Catholic Republic lead by Stephens, they openly despised it and would probably never recognize it as their true homeland.\n\nNonetheless, they were Irish too, and the rebel leadership, while always aware of this fact, did believe that Ulster had to be freed from the hands of the Empire, even if the people of Ulster didn't think the British truly oppressed them and instead saw the rebels as nothing more than criminals...\n\nGeneral O'Mahoney would lead the rebel troops, if his plan were to be sucessful the entire war would soon end, and it would do it just before the 100th day since the beggining of the conflict, but he had to be careful, the forces of Ulster were going to be lead by a Conservative MP and war hero of the Crimean War, the **Liutenant General** **Thomas Henry Pakenham**, a native of County Antrim and a fervent opponent of the revolution, he and O'Mahoney would soon face each other in a battle that could possibly end the entire conflict, no one could deny that both men 
were aware of the stakes that this battle had, and just how much it could affect the future of Ireland and of Great Britain. \n\n\nThe forces lead by General John Francis O'Mahoney woulf first reach the towns of **Manorhamilton** on the the 9th of September and would reach **Kinlough** on the 10th. The first encounter between Republicans and Loyalists would take place on the 12th in the City of **Enniskillen**, right in front of **Lough Erne**, that encounter between 15.000 troops under O'Mahoney and 3.000 Loyalists lead by **Colonel Henry Arthur Cole**. The Republican forces under **Generals John Francis** **O****'Mahoney** and **John Charles O'Neill** would defeat Colonel Henry Arthur Cole, who saw his Loyalist forces lose around 900 men in battle, with a further 1300 surrendering as he and the rest tried to retreat towards **Irvinestown**.\n\nO'Mahoney would try to rest his troops while Cole continued with his retreat, additionally he began to shape a few different plans in case he had to face the forces of Ulster again in Irvinestown, on the other side of Lough Erne the Colonel tried to find a way to deal with the situation, given his clear numerical disadventage he was probably going to be forced into a retreat towards Belfast, where most Northern troops found themselves, and likely where the Republicans intended to go. \n\nUnder the orders of Colonel Henry Cole the British forces would start to retreat towards Belfast, and according to him it would be necessary for the fight to be as sucessful as possible for the Loyalist forces, as a defeat would solidify the power of the Irish Rebels, likely forcing the British Prime Minister, Palmerston, to accept a peace deal that granted Ireland it's independence. 
By the time they reached Belfast they saw that General Pakenham had been preparing the city with all the forces he had been able to find, the number of troops available to defend the city, counting both the forces that Pakenham himself had at his disposal, and the ones lead by Cole, would be of around 20.000 men, a clear adventage over O'Mahoney, however, many of those troops were unexperienced young soldiers who had enlisted when the rebellion began to turn specially problematic, and many troops were rather demoralized already, nonetheless, they could still win, that's certainly a fact.\n\nThe battle would finally take place on the 21st of September, being late September the climate was rather low, 54.3 Fahrenheit, the strategy, for the rebels, was to use their artillery to break down any barracks and fort that had been put up by the Loyalists, while the Loyalists would rely on the opposition that the Belfast citizens had over the idea of independence to try and force the Republicans to retreat, the battle would take for a longer time than expected, with the siege going for nearly 3 weeks, which was much more than expected for a battle in Ireland, and would finally come to and end with an amount of casualties that would shock both sides, nearly 17.000 people would perish, with 9841 Loyalists dying compared to 7159 Republicans, both the siege and the following battle would take countless lifes, and would leave Belfast as the city that had seen the bloodiest battle in Irish soil since the **Sack of Wexford** during the days of Cromwell, **with this battle having outdone the amount of casualties of Drogheda, Wexford and Rathmies all combined**, when the news reached the leaders of both nations, all that could be really tought was about how bloody the battle had been, and about the horror that Belfast had just experienced...\n\nThe loyalist forces would soon essentially collapse, they were absolutely unable to keep the Dublin riots under control, and with all the 
defeats, they were unable to know if they should continue fighting at all, the rebels now had one thing to do, they could either fully settle the conflict into the dust by advancing to Dublin, which was clearly not going to see a very major fight, most loyalists were essentially surrending already, or contact the British Prime Minister to talk for a peace deal, either way, Ireland had definetely won the war now.\n\n[View Poll](https://www.reddit.com/poll/10035l2)",https://www.reddit.com/r/Presidentialpoll/comments/10035l2/and_so_shall_our_liberty_be_secured_discord/,1673168000.0
19227,ShymArsenal-KZ,1004551,t3_1004551,1672522138,t5_3ls3yv,/r/Presidentialpoll/comments/1004551/franklin_buchanan_the_only_confederate_admiral/,"Franklin Buchanan, the only Confederate admiral.",,https://i.redd.it/0m7sh1nhxa9a1.jpg,1673168000.0
19228,spartachilles,1005ulv,t3_1005ulv,1672527227,t5_3ls3yv,/r/Presidentialpoll/comments/1005ulv/third_party_system_which_monetary_policy_would/,Third Party System: Which monetary policy would you have preferred?,\n\n[View Poll](https://www.reddit.com/poll/1005ulv),https://www.reddit.com/r/Presidentialpoll/comments/1005ulv/third_party_system_which_monetary_policy_would/,1673168000.0


In [3]:
# Summarise column dtypes and non-null counts for the submissions table
df_submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19229 entries, 0 to 19228
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   author        19229 non-null  object 
 1   id            19229 non-null  object 
 2   name          19229 non-null  object 
 3   created_utc   19229 non-null  int64  
 4   subreddit_id  19229 non-null  object 
 5   permalink     19229 non-null  object 
 6   title         19229 non-null  object 
 7   selftext      19229 non-null  object 
 8   url           19171 non-null  object 
 9   retrieved_on  14511 non-null  float64
dtypes: float64(1), int64(1), object(8)
memory usage: 1.5+ MB


# Importing Data and Loading Comment File

In [4]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    """Load every ``.zst`` newline-delimited JSON file in a folder into one DataFrame.

    NOTE: this notebook also defines ``read_zst_folder`` in the
    submissions-loading cell; this later definition shadows the earlier one.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan (non-recursively). Only entries whose name ends
        with ``.zst`` are read; every non-blank line of a decompressed file
        is parsed as one JSON document.

    Returns
    -------
    pandas.DataFrame
        The records of all files, concatenated with a fresh integer index.
        An empty DataFrame is returned when the folder holds no ``.zst``
        files (``pd.concat`` would otherwise raise ``ValueError``).

    Raises
    ------
    OSError
        If ``folder_path`` cannot be listed or a file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    import io  # local import: only needed for the streaming text wrapper below

    dfs = []

    # Sort the listing so row order does not depend on the OS directory order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".zst"):
            continue
        file_path = os.path.join(folder_path, file_name)

        with open(file_path, 'rb') as f:
            # A generous window limit lets frames that were compressed with a
            # long window be decoded instead of being rejected.
            decompressor = zstandard.ZstdDecompressor(max_window_size=2**31)
            reader = decompressor.stream_reader(f)
            # Decode and parse line by line rather than materialising the whole
            # decompressed file as bytes, then str, then a list of lines.
            # newline='\n' keeps the original "split on \n only" semantics.
            with io.TextIOWrapper(reader, encoding='utf-8', newline='\n') as text_stream:
                data = [json.loads(line) for line in text_stream if line.strip()]

        dfs.append(pd.DataFrame(data))

    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)

# Folder containing the .zst comment dumps.
# Every backslash is doubled: the previous literal contained a lone "\D",
# which is an invalid escape sequence (SyntaxWarning on recent Python).
folder_path = 'C:\\Users\\HP\\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\Presidentialpoll_comments'

# Read every .zst file in the folder into a single DataFrame
df_comments = read_zst_folder(folder_path)

# Show all columns when rendering DataFrames
pd.set_option('display.max_columns', None)

# Keep only the comment fields of interest
df_comments = df_comments[['id', 'created_utc', 'subreddit', 'link_id', 'name', 'subreddit_id',
       'parent_id', 'retrieved_on', 'author', 'body', 'score']]
df_comments.head()


Unnamed: 0,id,created_utc,subreddit,link_id,name,subreddit_id,parent_id,retrieved_on,author,body,score
0,ggr59gz,1608687601,Presidentialpoll,t3_kiipl8,,t5_3ls3yv,t3_kiipl8,1619608000.0,SuperSonicSam619,"They were all terrible in terms of corruption, but Nixon was the most involved in that corruption out of all of them",3
1,ggrarh4,1608690656,Presidentialpoll,t3_kijx6b,,t5_3ls3yv,t3_kijx6b,1619610000.0,Sokol84,"Millard Fillmore. Ah, just saying his name makes me want to throw up.",2
2,ggraw7k,1608690733,Presidentialpoll,t3_kijx6b,,t5_3ls3yv,t3_kijx6b,1619610000.0,SuperSonicSam619,"Fillmore. Not great, but not even close to the level of horrible Pierce and Buchanan are",3
3,ggrfwo5,1608693636,Presidentialpoll,t3_kiipl8,,t5_3ls3yv,t3_kiipl8,1619613000.0,dutchboi2951,It's hard between Harding and Nixon IMO it's Harding,2
4,ggrg1mg,1608693717,Presidentialpoll,t3_kiipl8,,t5_3ls3yv,t1_ggrfwo5,1619613000.0,SupremeLeader-Snoke,Harding was literally just a tool used by big corporations during his entire presidency.,3


In [5]:
# Summarise column dtypes and non-null counts for the comments table
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150680 entries, 0 to 150679
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            150680 non-null  object 
 1   created_utc   150680 non-null  int64  
 2   subreddit     150680 non-null  object 
 3   link_id       150680 non-null  object 
 4   name          149057 non-null  object 
 5   subreddit_id  150680 non-null  object 
 6   parent_id     150680 non-null  object 
 7   retrieved_on  123265 non-null  float64
 8   author        150680 non-null  object 
 9   body          150680 non-null  object 
 10  score         150680 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 12.6+ MB


In [6]:
# Attach each comment to its parent submission: the comment's `link_id` is
# matched against the submission's `name`. The inner join keeps only comments
# whose submission is present; columns that exist in both frames receive
# pandas' default `_x` (submission) / `_y` (comment) suffixes.
df = pd.merge(
    df_submissions,
    df_comments,
    how='inner',
    left_on='name',
    right_on='link_id',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150446 entries, 0 to 150445
Data columns (total 21 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   author_x        150446 non-null  object 
 1   id_x            150446 non-null  object 
 2   name_x          150446 non-null  object 
 3   created_utc_x   150446 non-null  int64  
 4   subreddit_id_x  150446 non-null  object 
 5   permalink       150446 non-null  object 
 6   title           150446 non-null  object 
 7   selftext        150446 non-null  object 
 8   url             150279 non-null  object 
 9   retrieved_on_x  121160 non-null  float64
 10  id_y            150446 non-null  object 
 11  created_utc_y   150446 non-null  int64  
 12  subreddit       150446 non-null  object 
 13  link_id         150446 non-null  object 
 14  name_y          148823 non-null  object 
 15  subreddit_id_y  150446 non-null  object 
 16  parent_id       150446 non-null  object 
 17  retrieved_

In [7]:
# Epoch-second columns (submission = _x, comment = _y) to turn into datetimes
timestamp_columns = ['created_utc_x', 'created_utc_y']

# `assign` returns a new frame, so the merged `df` itself is left untouched
final_df = df.assign(
    **{name: pd.to_datetime(df[name], unit='s') for name in timestamp_columns}
)

final_df.head()

Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,retrieved_on_x,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,retrieved_on_y,author_y,body,score
0,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,,ggrglkj,2020-12-23 03:27:24,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t3_kii7hl,1619613000.0,SupremeLeader-Snoke,Did James Buchanan do anything good? like even the worst of the worst like Johnson got us Alaska and Wilson got Women's rights. did we gain anything from Buchanan?,1
1,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,,ggrgtto,2020-12-23 03:29:41,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t1_ggrglkj,1619613000.0,Sokol84,"Well, uh, he fucked up our country so bad that we abolished slavery early?",2
2,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,,gh2clij,2020-12-26 11:21:50,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t3_kii7hl,1619796000.0,MichaelTheKing7,"Buchanan did one good thing, he invented the concept of gays being presidents. For that alone, he is the best president",2
3,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,,gh527b0,2020-12-27 04:47:46,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t3_kii7hl,1619842000.0,Leather-Trainer,Idk if he was gay. For some reason historians think if you didn’t date a girl you were gay,1
4,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,,gh555wv,2020-12-27 05:10:54,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t1_gh527b0,1619843000.0,Sokol84,"No, apparently they actually have letters from him saying that he tried to get with guys but nobody else was gay in the white house.",1


In [8]:
# Remove the retrieval timestamps of both the submission (_x) and comment (_y).
# errors='ignore' keeps this cell re-runnable: because it reassigns `final_df`,
# a second execution would otherwise raise KeyError for the missing columns.
final_df = final_df.drop(columns=['retrieved_on_x', 'retrieved_on_y'], errors='ignore')

final_df.head()

Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,author_y,body,score
0,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,ggrglkj,2020-12-23 03:27:24,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t3_kii7hl,SupremeLeader-Snoke,Did James Buchanan do anything good? like even the worst of the worst like Johnson got us Alaska and Wilson got Women's rights. did we gain anything from Buchanan?,1
1,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,ggrgtto,2020-12-23 03:29:41,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t1_ggrglkj,Sokol84,"Well, uh, he fucked up our country so bad that we abolished slavery early?",2
2,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,gh2clij,2020-12-26 11:21:50,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t3_kii7hl,MichaelTheKing7,"Buchanan did one good thing, he invented the concept of gays being presidents. For that alone, he is the best president",2
3,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,gh527b0,2020-12-27 04:47:46,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t3_kii7hl,Leather-Trainer,Idk if he was gay. For some reason historians think if you didn’t date a girl you were gay,1
4,Sokol84,kii7hl,t3_kii7hl,2020-12-23 00:42:00,t5_3ls3yv,/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,https://www.reddit.com/r/Presidentialpoll/comments/kii7hl/rpresidentialpoll_lounge/,gh555wv,2020-12-27 05:10:54,Presidentialpoll,t3_kii7hl,,t5_3ls3yv,t1_gh527b0,Sokol84,"No, apparently they actually have letters from him saying that he tried to get with guys but nobody else was gay in the white house.",1


In [9]:
# Drop author, identifier and link columns, leaving the text fields, the
# timestamps, `subreddit`, `link_id` and `score`.
# errors='ignore' keeps this cell re-runnable: because it reassigns `final_df`,
# a second execution would otherwise raise KeyError for the missing columns.
final_df = final_df.drop(
    columns=['author_x', 'id_x', 'name_x', 'subreddit_id_x',
             'permalink', 'url', 'id_y', 'name_y', 'subreddit_id_y',
             'parent_id', 'author_y'],
    errors='ignore',
)


In [10]:
# List the columns that remain after the drops
final_df.columns

Index(['created_utc_x', 'title', 'selftext', 'created_utc_y', 'subreddit',
       'link_id', 'body', 'score'],
      dtype='object')

In [11]:
import re
import pandas as pd

# Regex pattern -> canonical name. Each pattern accepts the full name, the
# bare first name, or the bare surname (matched case-insensitively by
# standardize_names).
#
# The optional surname is nested after the first name so that a full mention
# is consumed as ONE match. Previously "George Bush" / "George W Bush" matched
# the bare "george" alternative first and then "bush" separately, producing
# "George W. Bush George W. Bush".
#
# NOTE(review): the bare first-name / surname alternatives also fire on
# unrelated uses (e.g. "George" in "George Washington", the noun "bill",
# other people named Clinton or Bush) - confirm this is acceptable.
name_variations = {
    r'\b(?:george(?:\s+(?:w\.?\s+)?bush)?|bush)\b': 'George W. Bush',
    r'\b(?:barack(?:\s+obama)?|obama)\b': 'Barack Obama',
    r'\b(?:bill(?:\s+clinton)?|clinton)\b': 'Bill Clinton',
    r'\b(?:donald(?:\s+trump)?|trump)\b': 'Donald Trump',
    r'\b(?:joe(?:\s+biden)?|biden)\b': 'Joe Biden',
}


def standardize_names(text):
    """Rewrite every recognised name variation in ``text`` to its canonical form.

    Parameters
    ----------
    text : str
        Free text to normalise. Non-string values (e.g. ``None`` / NaN from a
        DataFrame column) are returned unchanged instead of raising TypeError.

    Returns
    -------
    str
        ``text`` with each match of a ``name_variations`` pattern replaced by
        the mapped canonical name. Matching ignores case; patterns are applied
        in the dictionary's insertion order.
    """
    if not isinstance(text, str):
        return text
    for pattern, standard_name in name_variations.items():
        text = re.sub(pattern, standard_name, text, flags=re.IGNORECASE)
    return text


In [12]:
# Import necessary libraries
import spacy
from spacy import displacy
import pandas as pd
import re
from textblob import TextBlob

# Load the small English spaCy model, downloading it only when it is not
# installed yet. Downloading unconditionally re-fetches the package on every
# run and prints a "restart to reload dependencies" notice each time.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Function to extract persons by their standardized names
def extract_persons(text):
    """Return the canonical names from ``name_variations`` mentioned in ``text``.

    Only the full canonical spellings (the dictionary's values) are searched
    for, ignoring case; ``text`` is not passed through ``standardize_names``
    here, so bare surnames or first names are not detected.

    Parameters
    ----------
    text : str
        Text to scan. Non-string values yield an empty list.

    Returns
    -------
    list of str
        Each mentioned name once, in canonical spelling, ordered by first
        appearance. (Previously the raw matched text was returned, so
        "joe biden" and "Joe Biden" counted as two different persons and the
        order was arbitrary.)
    """
    if not isinstance(text, str):
        return []

    # casefolded spelling -> canonical spelling, used to normalise raw matches
    canonical = {name.casefold(): name for name in name_variations.values()}

    # Combine all standardized names into one pattern
    pattern = r'\b(?:' + '|'.join(re.escape(name) for name in canonical.values()) + r')\b'

    # Find all matches of persons' names in the text
    persons_found = re.findall(pattern, text, flags=re.IGNORECASE)

    # Normalise the casing, then de-duplicate while preserving first-seen order.
    # The .get fallback covers exotic Unicode case-insensitive matches whose
    # casefold is not one of the keys.
    normalised = (canonical.get(match.casefold(), match) for match in persons_found)
    return list(dict.fromkeys(normalised))



# Apply the extract_persons function to the 'title', 'selftext' and 'body'
# columns, storing the result of each in a matching 'persons_<column>' column.
# NOTE(review): extract_persons only searches for the canonical names (the
# values of name_variations), and standardize_names is not applied to these
# columns in the visible cells, so a bare surname such as "Biden" is not
# detected -- confirm whether the text should be standardized first.
text_columns = ['title', 'selftext', 'body']
for column in text_columns:
    final_df[f'persons_{column}'] = final_df[column].apply(extract_persons)

# Keep only rows where at least one of the persons_* lists is non-empty.
person_columns = [f'persons_{column}' for column in text_columns]
has_mention = final_df[person_columns].apply(lambda col: col.map(len) > 0)
# .copy() makes filtered_df an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning against final_df.
filtered_df = final_df[has_mention.any(axis=1)].copy()

# TODO: sentiment columns for 'title', 'selftext' and 'body' are not computed
# yet; the earlier draft called a get_sentiment helper that is not defined in
# the visible cells.

# Display the first rows of the filtered DataFrame
filtered_df.head()


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,created_utc_x,title,selftext,created_utc_y,subreddit,link_id,body,score,persons_title,persons_selftext,persons_body
14,2020-12-23 00:42:00,r/Presidentialpoll Lounge,A place for members of r/Presidentialpoll to chat with each other,2021-01-22 00:20:19,Presidentialpoll,t3_kii7hl,I literally just realized that Joe Biden has been the only president born during FDR’s presidency.,1,[],[],[Joe Biden]
142,2020-12-26 11:07:23,"campaign 2: all of my promises and desires to work, as well as other stances.","Hello everyone, as you might know an election is coming soon in this subreddit and so with time passing quickly, I decided to make another post about what I plan to do. I hope everyone is doing alright and without drawing out the beginning, let's get started.\n\nNOTE: every move that I make if I become a moderator will be discussed with the person who created the sub and good friend of mine, u/Sokol84 so we can be sure if we can add these ideas. We will also ask the community about our ideas and plans so we know for sure what works and what doesn't. I will take the position seriously (if I win obviously). The following is simply what I want to do.\n\n1. **New Amendments**\n\nIf you don't know what I am talking about, in the description of the subreddit you will see: ,, Amendment 1: Other types of posts are allowed''. These amendments are basically rules and so, I think we need a few more to show by comers what this sub allows and doesn't allow. I think we need the basics first:\n\n*Amendment 2: no personal attacks*\n\nThis is very important and something that needs to be established. There are loads of people who are very toxic and their apperence here can ruin the reputation of the sub. If toxic people attack others people will talk about their opinions less and polls would be much more jarring. People will leave and no one would join and the sub would come to a close which would be sad because it has a lot of potential. This rule is also in r/Presidents and there are no problems there. I am not saying people are toxic, everyone here is chill and nice, I am talking about newcomers who might ruin the sub because of their unwill to respect others and blatant political positions that make others look bad.\n\n*Amendment 3: reposts are not allowed*\n\nAnother big deal in my opinion. Reposts can ruin a sub's reputation quickly and it takes a while to come back. 
Let's see why they are bad:\n\n\-One person posts a poll about the election of 1960\n\n\-He then proceeds to post it again 4x\n\n\-People will not vote on the extras\n\n\-Less discussion\n\n\-Less members joining\n\n\-Less posts\n\n\-Negative view for people who planned to join\n\nSee how a minor thing can destroy everything. Now I have to say this loud and clear: posting a lot isn't the issue. If you post a poll about the best president, then a election, then a custom election, then who is the prettiest... That is not bad. It gives variety to the sub it overall lands as a positive, but posting the exact same poll over and over again with no difference is not good.\n\n2. **What about other content that aren't polls?**\n\nI have an answer to that question, they are harmless. If the post is harmless aka it doesn't attack anyone and it brings some new conversation I don't mind. I do have to say, I will only not allow a few types of posts: memes, shitposts and pictures. These don't fit the sub. Memes and shitposts have an entire subreddit of their own so I believe it is pointless for them to be here. Pictures are posts with a picture as the post and most of the time a bit of information about it in the title. These posts are most often seen in r/USPresidentialHistory and so again, it brings nothing new to the table. Also, if many posts that aren't polls starting appearing constantly, we will have to restrict them.\n\n3. **Posts about Vice Presidents**\n\nI think that posts about VPs are allowed for a few reasons:\n\n1. it can bring new and interesting information and discussion to the table\n2. Most people know more about the president so these posts would not affect the sub at all\n3. This can be an chance for people to mix in both the presidents and the ViPs. For example: ,,Who do you believe is the better duo:\n\n\-George H.W. Bush/Dan Qyuale\n\n\-George W. 
Bush/Dick Cheney''\n\nObviously the example I used is shit because the first option would get all the votes, but you know what I mean. I think there are many ways this can be taken and that will make the sub rise. \n\n4. **What you should not do during these monthly elections**\n\nThere are things that I need to point out because if I do not, people will do whatever they want:\n\n1. Do not attack the other candidate or spread false information about them (if you dare do this, I will remove all of your comments and perma-ban you, that's how serious I am and this isn't a joke)\n2. Do not rig the election by voting with your alt accounts or getting you friends to join just so you or someone else can win. If you are caught, you and all of your other accounts will be banned.\n3. Be respectful if your candidate loses and don't make assumption immediatly for example, don't say the other person rigged the election unless there is actual evidence or valid suspition.\n\n I hope everyone understands this and doesn't do any of this...\n\nAnd that is it, I covered everything I wanted. I wish the best to Snoke and everyone else who wants to join the election. I hope no one will cheat and everything will be fair game. See you soon lovely readers and have a nice day.\n\n[My profile pic, which is why I am putting this here](https://preview.redd.it/sdc4k2blhi761.png?width=1266&amp;format=png&amp;auto=webp&amp;s=e580d78022c367010881cda858b0db9988956535)",2020-12-26 11:08:38,Presidentialpoll,t3_kkgy5t,*Whom* do you believe,-1,[],[George W. Bush],[]
143,2020-12-26 11:07:23,"campaign 2: all of my promises and desires to work, as well as other stances.","Hello everyone, as you might know an election is coming soon in this subreddit and so with time passing quickly, I decided to make another post about what I plan to do. I hope everyone is doing alright and without drawing out the beginning, let's get started.\n\nNOTE: every move that I make if I become a moderator will be discussed with the person who created the sub and good friend of mine, u/Sokol84 so we can be sure if we can add these ideas. We will also ask the community about our ideas and plans so we know for sure what works and what doesn't. I will take the position seriously (if I win obviously). The following is simply what I want to do.\n\n1. **New Amendments**\n\nIf you don't know what I am talking about, in the description of the subreddit you will see: ,, Amendment 1: Other types of posts are allowed''. These amendments are basically rules and so, I think we need a few more to show by comers what this sub allows and doesn't allow. I think we need the basics first:\n\n*Amendment 2: no personal attacks*\n\nThis is very important and something that needs to be established. There are loads of people who are very toxic and their apperence here can ruin the reputation of the sub. If toxic people attack others people will talk about their opinions less and polls would be much more jarring. People will leave and no one would join and the sub would come to a close which would be sad because it has a lot of potential. This rule is also in r/Presidents and there are no problems there. I am not saying people are toxic, everyone here is chill and nice, I am talking about newcomers who might ruin the sub because of their unwill to respect others and blatant political positions that make others look bad.\n\n*Amendment 3: reposts are not allowed*\n\nAnother big deal in my opinion. Reposts can ruin a sub's reputation quickly and it takes a while to come back. 
Let's see why they are bad:\n\n\-One person posts a poll about the election of 1960\n\n\-He then proceeds to post it again 4x\n\n\-People will not vote on the extras\n\n\-Less discussion\n\n\-Less members joining\n\n\-Less posts\n\n\-Negative view for people who planned to join\n\nSee how a minor thing can destroy everything. Now I have to say this loud and clear: posting a lot isn't the issue. If you post a poll about the best president, then a election, then a custom election, then who is the prettiest... That is not bad. It gives variety to the sub it overall lands as a positive, but posting the exact same poll over and over again with no difference is not good.\n\n2. **What about other content that aren't polls?**\n\nI have an answer to that question, they are harmless. If the post is harmless aka it doesn't attack anyone and it brings some new conversation I don't mind. I do have to say, I will only not allow a few types of posts: memes, shitposts and pictures. These don't fit the sub. Memes and shitposts have an entire subreddit of their own so I believe it is pointless for them to be here. Pictures are posts with a picture as the post and most of the time a bit of information about it in the title. These posts are most often seen in r/USPresidentialHistory and so again, it brings nothing new to the table. Also, if many posts that aren't polls starting appearing constantly, we will have to restrict them.\n\n3. **Posts about Vice Presidents**\n\nI think that posts about VPs are allowed for a few reasons:\n\n1. it can bring new and interesting information and discussion to the table\n2. Most people know more about the president so these posts would not affect the sub at all\n3. This can be an chance for people to mix in both the presidents and the ViPs. For example: ,,Who do you believe is the better duo:\n\n\-George H.W. Bush/Dan Qyuale\n\n\-George W. 
Bush/Dick Cheney''\n\nObviously the example I used is shit because the first option would get all the votes, but you know what I mean. I think there are many ways this can be taken and that will make the sub rise. \n\n4. **What you should not do during these monthly elections**\n\nThere are things that I need to point out because if I do not, people will do whatever they want:\n\n1. Do not attack the other candidate or spread false information about them (if you dare do this, I will remove all of your comments and perma-ban you, that's how serious I am and this isn't a joke)\n2. Do not rig the election by voting with your alt accounts or getting you friends to join just so you or someone else can win. If you are caught, you and all of your other accounts will be banned.\n3. Be respectful if your candidate loses and don't make assumption immediatly for example, don't say the other person rigged the election unless there is actual evidence or valid suspition.\n\n I hope everyone understands this and doesn't do any of this...\n\nAnd that is it, I covered everything I wanted. I wish the best to Snoke and everyone else who wants to join the election. I hope no one will cheat and everything will be fair game. See you soon lovely readers and have a nice day.\n\n[My profile pic, which is why I am putting this here](https://preview.redd.it/sdc4k2blhi761.png?width=1266&amp;format=png&amp;auto=webp&amp;s=e580d78022c367010881cda858b0db9988956535)",2020-12-26 11:10:01,Presidentialpoll,t3_kkgy5t,ok nerd,3,[],[George W. Bush],[]
144,2020-12-26 11:07:23,"campaign 2: all of my promises and desires to work, as well as other stances.","Hello everyone, as you might know an election is coming soon in this subreddit and so with time passing quickly, I decided to make another post about what I plan to do. I hope everyone is doing alright and without drawing out the beginning, let's get started.\n\nNOTE: every move that I make if I become a moderator will be discussed with the person who created the sub and good friend of mine, u/Sokol84 so we can be sure if we can add these ideas. We will also ask the community about our ideas and plans so we know for sure what works and what doesn't. I will take the position seriously (if I win obviously). The following is simply what I want to do.\n\n1. **New Amendments**\n\nIf you don't know what I am talking about, in the description of the subreddit you will see: ,, Amendment 1: Other types of posts are allowed''. These amendments are basically rules and so, I think we need a few more to show by comers what this sub allows and doesn't allow. I think we need the basics first:\n\n*Amendment 2: no personal attacks*\n\nThis is very important and something that needs to be established. There are loads of people who are very toxic and their apperence here can ruin the reputation of the sub. If toxic people attack others people will talk about their opinions less and polls would be much more jarring. People will leave and no one would join and the sub would come to a close which would be sad because it has a lot of potential. This rule is also in r/Presidents and there are no problems there. I am not saying people are toxic, everyone here is chill and nice, I am talking about newcomers who might ruin the sub because of their unwill to respect others and blatant political positions that make others look bad.\n\n*Amendment 3: reposts are not allowed*\n\nAnother big deal in my opinion. Reposts can ruin a sub's reputation quickly and it takes a while to come back. 
Let's see why they are bad:\n\n\-One person posts a poll about the election of 1960\n\n\-He then proceeds to post it again 4x\n\n\-People will not vote on the extras\n\n\-Less discussion\n\n\-Less members joining\n\n\-Less posts\n\n\-Negative view for people who planned to join\n\nSee how a minor thing can destroy everything. Now I have to say this loud and clear: posting a lot isn't the issue. If you post a poll about the best president, then a election, then a custom election, then who is the prettiest... That is not bad. It gives variety to the sub it overall lands as a positive, but posting the exact same poll over and over again with no difference is not good.\n\n2. **What about other content that aren't polls?**\n\nI have an answer to that question, they are harmless. If the post is harmless aka it doesn't attack anyone and it brings some new conversation I don't mind. I do have to say, I will only not allow a few types of posts: memes, shitposts and pictures. These don't fit the sub. Memes and shitposts have an entire subreddit of their own so I believe it is pointless for them to be here. Pictures are posts with a picture as the post and most of the time a bit of information about it in the title. These posts are most often seen in r/USPresidentialHistory and so again, it brings nothing new to the table. Also, if many posts that aren't polls starting appearing constantly, we will have to restrict them.\n\n3. **Posts about Vice Presidents**\n\nI think that posts about VPs are allowed for a few reasons:\n\n1. it can bring new and interesting information and discussion to the table\n2. Most people know more about the president so these posts would not affect the sub at all\n3. This can be an chance for people to mix in both the presidents and the ViPs. For example: ,,Who do you believe is the better duo:\n\n\-George H.W. Bush/Dan Qyuale\n\n\-George W. 
Bush/Dick Cheney''\n\nObviously the example I used is shit because the first option would get all the votes, but you know what I mean. I think there are many ways this can be taken and that will make the sub rise. \n\n4. **What you should not do during these monthly elections**\n\nThere are things that I need to point out because if I do not, people will do whatever they want:\n\n1. Do not attack the other candidate or spread false information about them (if you dare do this, I will remove all of your comments and perma-ban you, that's how serious I am and this isn't a joke)\n2. Do not rig the election by voting with your alt accounts or getting you friends to join just so you or someone else can win. If you are caught, you and all of your other accounts will be banned.\n3. Be respectful if your candidate loses and don't make assumption immediatly for example, don't say the other person rigged the election unless there is actual evidence or valid suspition.\n\n I hope everyone understands this and doesn't do any of this...\n\nAnd that is it, I covered everything I wanted. I wish the best to Snoke and everyone else who wants to join the election. I hope no one will cheat and everything will be fair game. See you soon lovely readers and have a nice day.\n\n[My profile pic, which is why I am putting this here](https://preview.redd.it/sdc4k2blhi761.png?width=1266&amp;format=png&amp;auto=webp&amp;s=e580d78022c367010881cda858b0db9988956535)",2020-12-26 15:04:43,Presidentialpoll,t3_kkgy5t,Maybe I should add an amendment banning grammar bots lol.,3,[],[George W. Bush],[]
145,2020-12-26 11:07:23,"campaign 2: all of my promises and desires to work, as well as other stances.","Hello everyone, as you might know an election is coming soon in this subreddit and so with time passing quickly, I decided to make another post about what I plan to do. I hope everyone is doing alright and without drawing out the beginning, let's get started.\n\nNOTE: every move that I make if I become a moderator will be discussed with the person who created the sub and good friend of mine, u/Sokol84 so we can be sure if we can add these ideas. We will also ask the community about our ideas and plans so we know for sure what works and what doesn't. I will take the position seriously (if I win obviously). The following is simply what I want to do.\n\n1. **New Amendments**\n\nIf you don't know what I am talking about, in the description of the subreddit you will see: ,, Amendment 1: Other types of posts are allowed''. These amendments are basically rules and so, I think we need a few more to show by comers what this sub allows and doesn't allow. I think we need the basics first:\n\n*Amendment 2: no personal attacks*\n\nThis is very important and something that needs to be established. There are loads of people who are very toxic and their apperence here can ruin the reputation of the sub. If toxic people attack others people will talk about their opinions less and polls would be much more jarring. People will leave and no one would join and the sub would come to a close which would be sad because it has a lot of potential. This rule is also in r/Presidents and there are no problems there. I am not saying people are toxic, everyone here is chill and nice, I am talking about newcomers who might ruin the sub because of their unwill to respect others and blatant political positions that make others look bad.\n\n*Amendment 3: reposts are not allowed*\n\nAnother big deal in my opinion. Reposts can ruin a sub's reputation quickly and it takes a while to come back. 
Let's see why they are bad:\n\n\-One person posts a poll about the election of 1960\n\n\-He then proceeds to post it again 4x\n\n\-People will not vote on the extras\n\n\-Less discussion\n\n\-Less members joining\n\n\-Less posts\n\n\-Negative view for people who planned to join\n\nSee how a minor thing can destroy everything. Now I have to say this loud and clear: posting a lot isn't the issue. If you post a poll about the best president, then a election, then a custom election, then who is the prettiest... That is not bad. It gives variety to the sub it overall lands as a positive, but posting the exact same poll over and over again with no difference is not good.\n\n2. **What about other content that aren't polls?**\n\nI have an answer to that question, they are harmless. If the post is harmless aka it doesn't attack anyone and it brings some new conversation I don't mind. I do have to say, I will only not allow a few types of posts: memes, shitposts and pictures. These don't fit the sub. Memes and shitposts have an entire subreddit of their own so I believe it is pointless for them to be here. Pictures are posts with a picture as the post and most of the time a bit of information about it in the title. These posts are most often seen in r/USPresidentialHistory and so again, it brings nothing new to the table. Also, if many posts that aren't polls starting appearing constantly, we will have to restrict them.\n\n3. **Posts about Vice Presidents**\n\nI think that posts about VPs are allowed for a few reasons:\n\n1. it can bring new and interesting information and discussion to the table\n2. Most people know more about the president so these posts would not affect the sub at all\n3. This can be an chance for people to mix in both the presidents and the ViPs. For example: ,,Who do you believe is the better duo:\n\n\-George H.W. Bush/Dan Qyuale\n\n\-George W. 
Bush/Dick Cheney''\n\nObviously the example I used is shit because the first option would get all the votes, but you know what I mean. I think there are many ways this can be taken and that will make the sub rise. \n\n4. **What you should not do during these monthly elections**\n\nThere are things that I need to point out because if I do not, people will do whatever they want:\n\n1. Do not attack the other candidate or spread false information about them (if you dare do this, I will remove all of your comments and perma-ban you, that's how serious I am and this isn't a joke)\n2. Do not rig the election by voting with your alt accounts or getting you friends to join just so you or someone else can win. If you are caught, you and all of your other accounts will be banned.\n3. Be respectful if your candidate loses and don't make assumption immediatly for example, don't say the other person rigged the election unless there is actual evidence or valid suspition.\n\n I hope everyone understands this and doesn't do any of this...\n\nAnd that is it, I covered everything I wanted. I wish the best to Snoke and everyone else who wants to join the election. I hope no one will cheat and everything will be fair game. See you soon lovely readers and have a nice day.\n\n[My profile pic, which is why I am putting this here](https://preview.redd.it/sdc4k2blhi761.png?width=1266&amp;format=png&amp;auto=webp&amp;s=e580d78022c367010881cda858b0db9988956535)",2020-12-26 15:11:17,Presidentialpoll,t3_kkgy5t,"Both good amendments. For amendment 3, I would probably limit reposting to somewhere between a week gap and a month gap.",2,[],[George W. Bush],[]


In [13]:
# Print the index range, per-column non-null counts, dtypes and memory usage
# of filtered_df.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4461 entries, 14 to 149850
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_utc_x     4461 non-null   datetime64[ns]
 1   title             4461 non-null   object        
 2   selftext          4461 non-null   object        
 3   created_utc_y     4461 non-null   datetime64[ns]
 4   subreddit         4461 non-null   object        
 5   link_id           4461 non-null   object        
 6   body              4461 non-null   object        
 7   score             4461 non-null   int64         
 8   persons_title     4461 non-null   object        
 9   persons_selftext  4461 non-null   object        
 10  persons_body      4461 non-null   object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 418.2+ KB


In [14]:
# Write the filtered frame (including its index) to CSV.
# The path is a raw string: the previous literal contained the invalid escape
# sequence "\D" (in "HP\Desktop"), which Python only tolerates with a
# DeprecationWarning/SyntaxWarning. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory.
output_csv_path = r'C:\Users\HP\Desktop\Middlesex Course Content\Giovanni Proposed Projects\US Political Subreddits Selection (19-07-2024)\Selected Subreddits\Whole Subreddits\Presidentialpoll_ner.csv'
filtered_df.to_csv(output_csv_path)