# importing Data and Loading Submission File

In [1]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\uspolitics_submissions'

# Call the function to read the zst folder into a DataFra
df_submissions = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


# Now you can work with the DataFrame (df)
df_submissions = df_submissions[['author', 'id','name', 'created_utc', 'subreddit_id', 'permalink', 'title', 'selftext', 'url', 'retrieved_on']]
df_submissions.head()

  from pandas.core import (


Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
0,nikdahl,8d0ox,,1239904057,t5_2qwlq,/r/uspolitics/comments/8d0ox/schuster_tea_bagging_puns_nuts_whipped_out/,"Schuster: Tea bagging puns (""nuts"", ""whipped out"", ""toothless"", ""full throated"", ""tongue lashing"", ""lick"", ""under"", ""nutshell"", ""tight lipped"", ""taste"" and ""you're gonna need a Dick Armey"") [VID]",,http://www.youtube.com/watch?v=8i-OWDjOQfI,1522790000.0
1,scientologist2,8d5ns,,1239945414,t5_2qwlq,/r/uspolitics/comments/8d5ns/citing_real_estate_market_conditions_port/,"Citing real estate market conditions, Port Authority of New York proposes indefinitely putting off building of WTC towers",,http://www.forbes.com/feeds/ap/2009/04/16/ap6298639.html,1522790000.0
2,[deleted],8d7u9,,1239967209,t5_2qwlq,/r/uspolitics/comments/8d7u9/stiglitz_says_white_house_ties_to_wall_street/,Stiglitz Says White House Ties to Wall Street Doom Bank Rescue - Bloomberg.com,[deleted],http://bloomberg.com/apps/news?pid=20601087&amp;sid=ahnPchOxZMh8&amp;refer=home,1522790000.0
3,davega7,8d99h,,1239978426,t5_2qwlq,/r/uspolitics/comments/8d99h/hillary_clinton_needs_moneypimps_out_bill/,Hillary Clinton needs money;pimps out Bill,,http://news.bbc.co.uk/2/hi/americas/8004069.stm,1522790000.0
4,scientologist2,8dckb,,1239998550,t5_2qwlq,/r/uspolitics/comments/8dckb/a_texas_secession_is_actually_only_a_meaningful/,"A Texas secession is actually only a meaningful threat to the Republican party, as the absence of Texas would put them in permanent minority status.",,http://www.fivethirtyeight.com/2009/04/hey-rick-can-we-talk.html,1522790000.0


In [2]:
df_submissions.tail()

Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
115023,stankmanly,zzvgk2,t3_zzvgk2,1672497495,t5_2qwlq,/r/uspolitics/comments/zzvgk2/former_doj_officials_are_asked_under_oath_if_any/,Former DOJ Officials Are Asked Under Oath If Any of Trump's Election Fraud Claims Were Found Credible: 'No.',,https://people.com/politics/former-doj-officials-testify-about-pressure-to-find-election-fraud-after-2020-election,1673168000.0
115024,shallah,zzw3wi,t3_zzw3wi,1672499416,t5_2qwlq,/r/uspolitics/comments/zzw3wi/epa_issues_clean_water_rule_that_repeals_trump/,EPA issues clean water rule that repeals Trump administration changes,,https://www.cnbc.com/2022/12/30/epa-issues-clean-water-rule-that-repeals-trump-administration-changes.html,1673168000.0
115025,Mud_666,zzz9qf,t3_zzz9qf,1672508225,t5_2qwlq,/r/uspolitics/comments/zzz9qf/washington_blames_record_migration_on_communism/,Washington Blames Record Migration on ‘Communism’ When the Causes Are Closer to Home – Orinoco Tribune,,https://orinocotribune.com/washington-blames-record-migration-on-communism-when-the-causes-are-closer-to-home/,1673168000.0
115026,factotum4stu,1001wak,t3_1001wak,1672515630,t5_2qwlq,/r/uspolitics/comments/1001wak/jobless_claims_rose_slightly_last_week/,Jobless Claims Rose Slightly Last Week,,https://www.usnews.com/news/economy/articles/2022-12-29/jobless-claims-rose-slightly-last-week,1673168000.0
115027,Weebertarian1,1003c1x,t3_1003c1x,1672519780,t5_2qwlq,/r/uspolitics/comments/1003c1x/the_trump_challengers_who_could_fight_him_for_the/,The Trump challengers who could fight him for the Republican nomination,,https://www.newsweek.com/donald-trump-challengers-who-could-fight-him-republican-nomination-1770544,1673168000.0


In [3]:
df_submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115028 entries, 0 to 115027
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        115028 non-null  object 
 1   id            115028 non-null  object 
 2   name          75405 non-null   object 
 3   created_utc   115028 non-null  object 
 4   subreddit_id  115028 non-null  object 
 5   permalink     115028 non-null  object 
 6   title         115028 non-null  object 
 7   selftext      115028 non-null  object 
 8   url           114902 non-null  object 
 9   retrieved_on  93739 non-null   float64
dtypes: float64(1), object(9)
memory usage: 8.8+ MB


# Importing Data and Loading Comment File

In [4]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\uspolitics_comments'
# Call the function to read the zst folder into a DataFrame
df_comments = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)

# Now you can work with the DataFrame (df)
# df_comments.head()
df_comments = df_comments[['id', 'created_utc', 'subreddit', 'link_id', 'name', 'subreddit_id',
       'parent_id', 'retrieved_on', 'author', 'body', 'score']]
df_comments.head()


Unnamed: 0,id,created_utc,subreddit,link_id,name,subreddit_id,parent_id,retrieved_on,author,body,score
0,c08wh9z,1239908626,uspolitics,t3_8d0ox,t1_c08wh9z,t5_2qwlq,t3_8d0ox,1425954000.0,FokkeNews,"What I hate is all those subs in which you can't comment. Personally, I just vote them down unless it's a particularly good story.\n\nHow is this social news if you can't comment?",1
1,c08x3vy,1239978762,uspolitics,t3_8d99h,t1_c08x3vy,t5_2qwlq,t3_8d99h,1425955000.0,klonkk,oh my God ...,2
2,c0hlcb3,1261509829,uspolitics,t3_ahi7n,t1_c0hlcb3,t5_2qwlq,t3_ahi7n,1426164000.0,EarBucket,"Wait, I can't tell if I'm supposed to be excited or outraged about this.",1
3,c0hmy39,1261585639,uspolitics,t3_ahi7n,t1_c0hmy39,t5_2qwlq,t1_c0hlcb3,1426165000.0,testu_nagouchi,Giving anyone immunity from the law is generally a hugely bad idea.,1
4,c0hnndr,1261609488,uspolitics,t3_ahi7n,t1_c0hnndr,t5_2qwlq,t1_c0hmy39,1426165000.0,EarBucket,"That's my point; there's no context here to understand what we're supposed to be excited/outraged about. What's this executive order do, specifically?",1


In [5]:
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235329 entries, 0 to 235328
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            235329 non-null  object 
 1   created_utc   235329 non-null  object 
 2   subreddit     235329 non-null  object 
 3   link_id       235329 non-null  object 
 4   name          118402 non-null  object 
 5   subreddit_id  235329 non-null  object 
 6   parent_id     235329 non-null  object 
 7   retrieved_on  216630 non-null  float64
 8   author        235329 non-null  object 
 9   body          235329 non-null  object 
 10  score         235329 non-null  int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 19.7+ MB


In [6]:
# Merge final_subs_df and final_comments_df on "name" and "link_id"
df = df_submissions.merge(df_comments, how='inner', left_on='name', right_on='link_id')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165091 entries, 0 to 165090
Data columns (total 21 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   author_x        165091 non-null  object 
 1   id_x            165091 non-null  object 
 2   name_x          165091 non-null  object 
 3   created_utc_x   165091 non-null  object 
 4   subreddit_id_x  165091 non-null  object 
 5   permalink       165091 non-null  object 
 6   title           165091 non-null  object 
 7   selftext        165091 non-null  object 
 8   url             164922 non-null  object 
 9   retrieved_on_x  107797 non-null  float64
 10  id_y            165091 non-null  object 
 11  created_utc_y   165091 non-null  object 
 12  subreddit       165091 non-null  object 
 13  link_id         165091 non-null  object 
 14  name_y          114390 non-null  object 
 15  subreddit_id_y  165091 non-null  object 
 16  parent_id       165091 non-null  object 
 17  retrieved_

In [7]:
final_df = df.copy()

# Convert timestamp columns to datetime format
timestamp_columns = ['created_utc_x', 'created_utc_y']
for col in timestamp_columns:
    final_df[col] = pd.to_datetime(final_df[col], unit='s')
    
final_df.head()    

  final_df[col] = pd.to_datetime(final_df[col], unit='s')
  final_df[col] = pd.to_datetime(final_df[col], unit='s')


Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,retrieved_on_x,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,retrieved_on_y,author_y,body,score
0,tonybeme,ft828,t3_ft828,2011-02-26 16:33:55,t5_2qwlq,/r/uspolitics/comments/ft828/gates_warns_against_wars_like_iraq_and/,Gates Warns Against Wars Like Iraq and Afghanistan - Truthdig,,http://www.truthdig.com/eartotheground/item/gates_warns_against_wars_like_iraq_and_afghanistan_20110225/,,c1ift4a,2011-02-26 16:58:38,uspolitics,t3_ft828,t1_c1ift4a,t5_2qwlq,t3_ft828,1426946000.0,[deleted],"&gt; You fool! You fell victim to one of the classic blunders - The most famous of which is ""never get involved in a land war in Asia""...",1
1,tonybeme,ft828,t3_ft828,2011-02-26 16:33:55,t5_2qwlq,/r/uspolitics/comments/ft828/gates_warns_against_wars_like_iraq_and/,Gates Warns Against Wars Like Iraq and Afghanistan - Truthdig,,http://www.truthdig.com/eartotheground/item/gates_warns_against_wars_like_iraq_and_afghanistan_20110225/,,c1pb7mc,2011-04-14 02:26:10,uspolitics,t3_ft828,t1_c1pb7mc,t5_2qwlq,t3_ft828,1427065000.0,TheTruthHurtsU,He forgot to say Libya,1
2,tonybeme,fzo4e,t3_fzo4e,2011-03-08 10:05:03,t5_2qwlq,/r/uspolitics/comments/fzo4e/walker_wont_meet_dems_at_the_border_truthdig/,Walker Won’t Meet Dems at the Border - Truthdig,,http://www.truthdig.com/eartotheground/item/walker_wont_meet_dems_at_the_border_20110307/,,c1k05j1,2011-03-09 15:48:31,uspolitics,t3_fzo4e,t1_c1k05j1,t5_2qwlq,t3_fzo4e,1426973000.0,Rixar13,"The posse of Wisconsin state senators who left the building—not to mention Wisconsin—last month to thwart Gov. Scott Walker’s campaign to quash state employees’ collective bargaining powers proposed a meeting with the governor somewhere by the Illinois-Wisconsin border, but Walker called the missing Democrats’ idea “ridiculous” and refused to budge Monday. —KA\r\n\r\n",1
3,sanity,fzhzh,t3_fzhzh,2011-03-08 03:37:19,t5_2qwlq,/r/uspolitics/comments/fzhzh/enforcing_a_nofly_zone_in_libya_would_be_an_act/,"Enforcing a no-fly zone in Libya would be an act of war against that country. I'm not saying that the US/NATO shouldn't do it, but are the people of the US ready to get into a war what wasn't even on the radar just a few weeks ago? This stuff is mind-boggling.",,http://www.reddit.com/r/uspolitics/comments/fzhzh/enforcing_a_nofly_zone_in_libya_would_be_an_act/,,c1pbzfg,2011-04-14 04:58:46,uspolitics,t3_fzhzh,t1_c1pbzfg,t5_2qwlq,t3_fzhzh,1427065000.0,bestbeforeMar91,What war? Obama said it would only last a few days. It's all over now.,1
4,tonybeme,g3s86,t3_g3s86,2011-03-14 17:18:48,t5_2qwlq,/r/uspolitics/comments/g3s86/state_dept_spokesman_quits_over_comments_truthdig/,State Dept. Spokesman Quits Over Comments - Truthdig,,http://www.truthdig.com/eartotheground/item/state_spokesman_resigns_over_comments_20110313/,,c1pc43q,2011-04-14 05:28:46,uspolitics,t3_g3s86,t1_c1pc43q,t5_2qwlq,t3_g3s86,1427066000.0,bestbeforeMar91,"Old news, just like ethics.",1


In [8]:
final_df = final_df.drop(['retrieved_on_x', 'retrieved_on_y'], axis=1)

final_df.head()

Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,author_y,body,score
0,tonybeme,ft828,t3_ft828,2011-02-26 16:33:55,t5_2qwlq,/r/uspolitics/comments/ft828/gates_warns_against_wars_like_iraq_and/,Gates Warns Against Wars Like Iraq and Afghanistan - Truthdig,,http://www.truthdig.com/eartotheground/item/gates_warns_against_wars_like_iraq_and_afghanistan_20110225/,c1ift4a,2011-02-26 16:58:38,uspolitics,t3_ft828,t1_c1ift4a,t5_2qwlq,t3_ft828,[deleted],"&gt; You fool! You fell victim to one of the classic blunders - The most famous of which is ""never get involved in a land war in Asia""...",1
1,tonybeme,ft828,t3_ft828,2011-02-26 16:33:55,t5_2qwlq,/r/uspolitics/comments/ft828/gates_warns_against_wars_like_iraq_and/,Gates Warns Against Wars Like Iraq and Afghanistan - Truthdig,,http://www.truthdig.com/eartotheground/item/gates_warns_against_wars_like_iraq_and_afghanistan_20110225/,c1pb7mc,2011-04-14 02:26:10,uspolitics,t3_ft828,t1_c1pb7mc,t5_2qwlq,t3_ft828,TheTruthHurtsU,He forgot to say Libya,1
2,tonybeme,fzo4e,t3_fzo4e,2011-03-08 10:05:03,t5_2qwlq,/r/uspolitics/comments/fzo4e/walker_wont_meet_dems_at_the_border_truthdig/,Walker Won’t Meet Dems at the Border - Truthdig,,http://www.truthdig.com/eartotheground/item/walker_wont_meet_dems_at_the_border_20110307/,c1k05j1,2011-03-09 15:48:31,uspolitics,t3_fzo4e,t1_c1k05j1,t5_2qwlq,t3_fzo4e,Rixar13,"The posse of Wisconsin state senators who left the building—not to mention Wisconsin—last month to thwart Gov. Scott Walker’s campaign to quash state employees’ collective bargaining powers proposed a meeting with the governor somewhere by the Illinois-Wisconsin border, but Walker called the missing Democrats’ idea “ridiculous” and refused to budge Monday. —KA\r\n\r\n",1
3,sanity,fzhzh,t3_fzhzh,2011-03-08 03:37:19,t5_2qwlq,/r/uspolitics/comments/fzhzh/enforcing_a_nofly_zone_in_libya_would_be_an_act/,"Enforcing a no-fly zone in Libya would be an act of war against that country. I'm not saying that the US/NATO shouldn't do it, but are the people of the US ready to get into a war what wasn't even on the radar just a few weeks ago? This stuff is mind-boggling.",,http://www.reddit.com/r/uspolitics/comments/fzhzh/enforcing_a_nofly_zone_in_libya_would_be_an_act/,c1pbzfg,2011-04-14 04:58:46,uspolitics,t3_fzhzh,t1_c1pbzfg,t5_2qwlq,t3_fzhzh,bestbeforeMar91,What war? Obama said it would only last a few days. It's all over now.,1
4,tonybeme,g3s86,t3_g3s86,2011-03-14 17:18:48,t5_2qwlq,/r/uspolitics/comments/g3s86/state_dept_spokesman_quits_over_comments_truthdig/,State Dept. Spokesman Quits Over Comments - Truthdig,,http://www.truthdig.com/eartotheground/item/state_spokesman_resigns_over_comments_20110313/,c1pc43q,2011-04-14 05:28:46,uspolitics,t3_g3s86,t1_c1pc43q,t5_2qwlq,t3_g3s86,bestbeforeMar91,"Old news, just like ethics.",1


In [9]:
final_df = final_df.drop(['author_x', 'id_x', 'name_x', 'subreddit_id_x',
                          'permalink', 'url', 'id_y', 'name_y', 'subreddit_id_y',
                          'parent_id', 'author_y'], axis=1)


In [10]:
final_df.columns

Index(['created_utc_x', 'title', 'selftext', 'created_utc_y', 'subreddit',
       'link_id', 'body', 'score'],
      dtype='object')

In [11]:
import re
import pandas as pd

# Define the dictionary mapping variations to the standard form
name_variations = {
  r'\b(?:george\s+w\.\s+bush|bush|george(?:\s+w\.\s+bush)?)\b': 'George W. Bush',
  r'\b(?:barack\s+obama|obama|barack(?:\s+obama)?)\b': 'Barack Obama',
  r'\b(?:bill\s+clinton|clinton|bill(?:\s+clinton)?)\b': 'Bill Clinton',
  r'\b(?:donald\s+trump|trump|donald(?:\s+trump)?)\b': 'Donald Trump',
  r'\b(?:joe\s+biden|biden|joe(?:\s+biden)?)\b': 'Joe Biden',  
}


# Function to replace variations with standard names
def standardize_names(text):
    for pattern, standard_name in name_variations.items():
        text = re.sub(pattern, standard_name, text, flags=re.IGNORECASE)
    return text


In [12]:
# Import necessary libraries
import spacy
from spacy import displacy
import pandas as pd
import re
from textblob import TextBlob

# Download the small English model for spaCy
spacy.cli.download("en_core_web_sm")

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract persons (already standardized)
def extract_persons(text):
    # Combine all standardized names into one pattern
    pattern = r'\b(?:' + '|'.join([re.escape(name) for name in name_variations.values()]) + r')\b'
    # Find all matches of persons' names in the text
    persons_found = re.findall(pattern, text, flags=re.IGNORECASE)
    # Return unique persons' names
    return list(set(persons_found))



# Apply the extract_persons function to 'title', 'selftext', and 'body' columns
final_df['persons_title'] = final_df['title'].apply(extract_persons)
final_df['persons_selftext'] = final_df['selftext'].apply(extract_persons)
final_df['persons_body'] = final_df['body'].apply(extract_persons)

# Filter the DataFrame to include only rows where either 'persons_title' or 'persons_selftext' or 'persons_body' is not empty
filtered_df = final_df[(final_df['persons_title'].apply(len) > 0) | 
                       (final_df['persons_selftext'].apply(len) > 0) |
                       (final_df['persons_body'].apply(len) > 0)]

# # Compute sentiments for 'title', 'selftext', and 'body'
# filtered_df['sentiment_title'] = filtered_df['title'].apply(get_sentiment)
# filtered_df['sentiment_selftext'] = filtered_df['selftext'].apply(get_sentiment)
# filtered_df['sentiment_body'] = filtered_df['body'].apply(get_sentiment)

# Display the filtered DataFrame with sentiments
filtered_df.head()


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,created_utc_x,title,selftext,created_utc_y,subreddit,link_id,body,score,persons_title,persons_selftext,persons_body
9,2011-04-12 19:09:48,Democratic senator wants Internet sales taxes,,2011-04-29 03:19:33,uspolitics,t3_gofnk,"Hey, he's willing to increase taxes. Bold move. Future Presidential Timber. etc etc.\n\nNever mind that he wants the most regressive tax possible assessed on the lowest income people. The people who buy over the net because they can't afford to pay local sales taxes.\n\nYou don't really think that he is going to propose sales taxes be paid on Donald Trump's next $50 million yacht built in Greece, do you?\n\nThe man is a politician, if his lips move he's lying.",1,[],[],[Donald Trump]
31,2011-04-27 20:46:59,"This 2007 Washington Post article is hilarious: ""Bush Budget Projects A Surplus by 2012""",,2011-04-28 02:00:40,uspolitics,t3_gypfc,john boehner is sooooo smart. \n\ni wonder if he and donald trump go tanning together.,1,[],[],[donald trump]
43,2011-05-08 22:53:26,"We might ask ourselves how we would be reacting if Iraqi commandos landed at George W. Bush’s compound, assassinated him, and dumped his body in the Atlantic.",,2011-05-08 23:20:49,uspolitics,t3_h6wdn,Chomsky is a backseat driver who commentates from the safety of a university office.,2,[George W. Bush],[],[]
44,2011-05-08 22:53:26,"We might ask ourselves how we would be reacting if Iraqi commandos landed at George W. Bush’s compound, assassinated him, and dumped his body in the Atlantic.",,2011-05-08 23:20:52,uspolitics,t3_h6wdn,I wouldn't like the airspace violation. But they can have Bush.,1,[George W. Bush],[],[]
45,2011-05-08 22:53:26,"We might ask ourselves how we would be reacting if Iraqi commandos landed at George W. Bush’s compound, assassinated him, and dumped his body in the Atlantic.",,2011-05-09 00:14:47,uspolitics,t3_h6wdn,\nThis being Reddit I suspect that would get a certain amount of upvotes...,0,[George W. Bush],[],[]


In [13]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13584 entries, 9 to 165032
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_utc_x     13584 non-null  datetime64[ns]
 1   title             13584 non-null  object        
 2   selftext          13584 non-null  object        
 3   created_utc_y     13584 non-null  datetime64[ns]
 4   subreddit         13584 non-null  object        
 5   link_id           13584 non-null  object        
 6   body              13584 non-null  object        
 7   score             13584 non-null  int64         
 8   persons_title     13584 non-null  object        
 9   persons_selftext  13584 non-null  object        
 10  persons_body      13584 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 1.2+ MB


In [14]:
filtered_df.to_csv('C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\uspolitics_ner.csv')