# importing Data and Loading Submission File

In [1]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\AmericanPolitics_submissions'

# Call the function to read the zst folder into a DataFra
df_submissions = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


# Now you can work with the DataFrame (df)
df_submissions = df_submissions[['author', 'id','name', 'created_utc', 'subreddit_id', 'permalink', 'title', 'selftext', 'url', 'retrieved_on']]
df_submissions.head()

  from pandas.core import (


Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
0,BravoLima,6lndu,,1212360395,t5_2qied,/r/AmericanPolitics/comments/6lndu/the_security_and_prosperity_partnership_agreement/,The Security and Prosperity Partnership Agreement: NAFTA Plus Homeland Security,,http://www.infowars.com/?p=2487,1522694000.0
1,BravoLima,6lnfr,,1212362014,t5_2qied,/r/AmericanPolitics/comments/6lnfr/us_border_agents_lured_by_the_dark_side/,US Border Agents Lured by the Dark Side,,http://www.infowars.com/?p=2484,1522694000.0
2,[deleted],6lnjd,,1212364284,t5_2qied,/r/AmericanPolitics/comments/6lnjd/albuquerque_cop_attacks_news_videographer_bravely/,Albuquerque Cop Attacks News Videographer - 'Bravely' From Behind (video),,http://www.infowars.com/?p=2481,1522694000.0
3,BravoLima,6lnkd,,1212364877,t5_2qied,/r/AmericanPolitics/comments/6lnkd/rupert_faux_newz_murdoch_goes_after_condi_urges/,Rupert 'Faux Newz' Murdoch Goes After Condi &amp; Urges Blockade Against Iran,,http://www.infowars.com/?p=2485,1522694000.0
4,BravoLima,6lnkw,,1212365181,t5_2qied,/r/AmericanPolitics/comments/6lnkw/tom_brokaws_disturbing_defense_of_the_media_and/,Tom Brokaw's Disturbing Defense of the Media and Iraq,,http://www.infowars.com/?p=2478,1522694000.0


In [2]:
df_submissions.tail()

Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
58748,Mud_666,zzz9k9,t3_zzz9k9,1672508211,t5_2qied,/r/AmericanPolitics/comments/zzz9k9/washington_blames_record_migration_on_communism/,Washington Blames Record Migration on ‘Communism’ When the Causes Are Closer to Home – Orinoco Tribune,,https://orinocotribune.com/washington-blames-record-migration-on-communism-when-the-causes-are-closer-to-home/,1673168000.0
58749,factotum4stu,zzzyst,t3_zzzyst,1672510116,t5_2qied,/r/AmericanPolitics/comments/zzzyst/msnbcs_chris_hayes_blasted_for_shrugging_off/,MSNBC’s Chris Hayes blasted for shrugging off Biden’s lies while ripping Santos,,https://www.washingtontimes.com/news/2022/dec/28/msnbcs-chris-hayes-blasted-shrugging-bidens-lies-w/,1673168000.0
58750,Dismal-Ad-3066,1001ldy,t3_1001ldy,1672514747,t5_2qied,/r/AmericanPolitics/comments/1001ldy/happy_new_year_to_friends/,Happy New Year to friends,,https://youtube.com/watch?v=Onk8kBvzBgs&amp;feature=share,1673168000.0
58751,Dismal-Ad-3066,1001lpm,t3_1001lpm,1672514772,t5_2qied,/r/AmericanPolitics/comments/1001lpm/happy_new_year_to_friends/,Happy New Year to friends,,https://youtube.com/watch?v=Onk8kBvzBgs&amp;feature=share,1673168000.0
58752,Dismal-Ad-3066,1001m29,t3_1001m29,1672514803,t5_2qied,/r/AmericanPolitics/comments/1001m29/happy_new_year_to_friends/,Happy New Year to friends,,https://youtube.com/watch?v=Onk8kBvzBgs&amp;feature=share,1673168000.0


In [3]:
df_submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58753 entries, 0 to 58752
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   author        58753 non-null  object 
 1   id            58753 non-null  object 
 2   name          33057 non-null  object 
 3   created_utc   58753 non-null  object 
 4   subreddit_id  58753 non-null  object 
 5   permalink     58753 non-null  object 
 6   title         58753 non-null  object 
 7   selftext      58753 non-null  object 
 8   url           58725 non-null  object 
 9   retrieved_on  46421 non-null  float64
dtypes: float64(1), object(9)
memory usage: 4.5+ MB


# Importing Data and Loading Comment File

In [4]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\AmericanPolitics_comments'
# Call the function to read the zst folder into a DataFrame
df_comments = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)

# Now you can work with the DataFrame (df)
# df_comments.head()
df_comments = df_comments[['id', 'created_utc', 'subreddit', 'link_id', 'name', 'subreddit_id',
       'parent_id', 'retrieved_on', 'author', 'body', 'score']]
df_comments.head()


Unnamed: 0,id,created_utc,subreddit,link_id,name,subreddit_id,parent_id,retrieved_on,author,body,score
0,c047om0,1212441799,AmericanPolitics,t3_6lsfk,t1_c047om0,t5_2qied,t3_6lsfk,1425855000.0,BravoLima,"Please don't dance, Karl!",1
1,c0480kk,1212512013,AmericanPolitics,t3_6lsfk,t1_c0480kk,t5_2qied,t1_c047om0,1425855000.0,sakebomb69,"Hey little Mussolini, how's it going?\n\nAnybody else you've ""banned"" from stating an opposing viewpoint?\n\nOh, the irony. For all of your little bullshit 9/11 conspiracy cover up crap, You're trying to bury your opposition the same way you accuse the government. \n\n",1
2,c049fa3,1212774934,AmericanPolitics,t3_6mbh5,t1_c049fa3,t5_2qied,t3_6mbh5,1425856000.0,[deleted],Steppin' Up &amp; Speakin' Out - OOOOYAAAHH! My new 'Hottie' Hero!,0
3,c04aa8p,1212969927,AmericanPolitics,t3_6mjpb,t1_c04aa8p,t5_2qied,t3_6mjpb,1425857000.0,BravoLima,A Nazi in the (pocket) is worth four in the Bush (family) \r\nhttp://www.informationclearinghouse.info/article3255.htm\r\n,1
4,c04ajos,1213030813,AmericanPolitics,t3_6mmwd,t1_c04ajos,t5_2qied,t3_6mmwd,1425857000.0,MyaloMark,Anyone unwilling to do the fighting themselves should just shut the fuck up. This goes double for Liz Cheney and family.,1


In [5]:
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93165 entries, 0 to 93164
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            93165 non-null  object 
 1   created_utc   93165 non-null  object 
 2   subreddit     93165 non-null  object 
 3   link_id       93165 non-null  object 
 4   name          43341 non-null  object 
 5   subreddit_id  93165 non-null  object 
 6   parent_id     93165 non-null  object 
 7   retrieved_on  87739 non-null  float64
 8   author        93165 non-null  object 
 9   body          93165 non-null  object 
 10  score         93165 non-null  int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 7.8+ MB


In [6]:
# Merge final_subs_df and final_comments_df on "name" and "link_id"
df = df_submissions.merge(df_comments, how='inner', left_on='name', right_on='link_id')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59148 entries, 0 to 59147
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   author_x        59148 non-null  object 
 1   id_x            59148 non-null  object 
 2   name_x          59148 non-null  object 
 3   created_utc_x   59148 non-null  object 
 4   subreddit_id_x  59148 non-null  object 
 5   permalink       59148 non-null  object 
 6   title           59148 non-null  object 
 7   selftext        59148 non-null  object 
 8   url             59135 non-null  object 
 9   retrieved_on_x  35557 non-null  float64
 10  id_y            59148 non-null  object 
 11  created_utc_y   59148 non-null  object 
 12  subreddit       59148 non-null  object 
 13  link_id         59148 non-null  object 
 14  name_y          35905 non-null  object 
 15  subreddit_id_y  59148 non-null  object 
 16  parent_id       59148 non-null  object 
 17  retrieved_on_y  53722 non-null 

In [7]:
final_df = df.copy()

# Convert timestamp columns to datetime format
timestamp_columns = ['created_utc_x', 'created_utc_y']
for col in timestamp_columns:
    final_df[col] = pd.to_datetime(final_df[col], unit='s')
    
final_df.head()    

  final_df[col] = pd.to_datetime(final_df[col], unit='s')
  final_df[col] = pd.to_datetime(final_df[col], unit='s')


Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,retrieved_on_x,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,retrieved_on_y,author_y,body,score
0,democracy101,ew8wk,t3_ew8wk,2011-01-04 22:36:37,t5_2qied,/r/AmericanPolitics/comments/ew8wk/why_did_the_republicans_not_wait_for_the_new/,Why did the Republicans not wait for the new Congress when they would have had a majority instead of reaching a compromise with Obama now?,,http://www.nullifynow.com/2011/01/why-didnt-they-wait/,,c1bggr0,2011-01-05 03:57:49,AmericanPolitics,t3_ew8wk,t1_c1bggr0,t5_2qied,t3_ew8wk,1426672000.0,RubyBlye,They didn't have to wait. They got what they wanted; extended tax cuts for the rich.,1
1,quasiperiodic,ex8o1,t3_ex8o1,2011-01-06 14:06:50,t5_2qied,/r/AmericanPolitics/comments/ex8o1/end_of_life_planning_axed_so_that_americans_can/,"end of life planning axed so that americans can die slowly, painfully, in senility, while paying big pharma big bucks.",,http://www.prospect.org/csnc/blogs/tapped_archive?month=01&amp;year=2011&amp;base_name=white_house_cowardice_on_healt,,c1bo686,2011-01-06 18:20:10,AmericanPolitics,t3_ex8o1,t1_c1bo686,t5_2qied,t3_ex8o1,1426676000.0,[deleted],"you mean he removed the mandatory death panels for seniors on government healthcare? victory for america, more like it!\n\n... /s",1
2,[deleted],ex65h,t3_ex65h,2011-01-06 10:08:23,t5_2qied,/r/AmericanPolitics/comments/ex65h/huck_finn_is_past_copyright_you_can_make_your_own/,"HUCK FINN is past copyright. You can make your \r\nown version using the ""N"" word or saying they are \r\n""Zombies"".",,http://www.reddit.com/r/AmericanPolitics/comments/ex65h/huck_finn_is_past_copyright_you_can_make_your_own/,,c1bmqf9,2011-01-06 10:13:31,AmericanPolitics,t3_ex65h,t1_c1bmqf9,t5_2qied,t3_ex65h,1426675000.0,rcadestaint,"This is just some publisher trying to make a politically correct version, and trying to get people all pissed off so he/she can sell copies. You can make your own version. If you need a reference copy, let me know. I think I have four or five copies sitting on my bookshelf.",1
3,shallah,ez8pz,t3_ez8pz,2011-01-10 01:33:39,t5_2qied,/r/AmericanPolitics/comments/ez8pz/nothing_to_see_here_folks_just_this_is_an/,Nothing to see here folks ... Just this is an astonishing list of violent rhetoric and political violence over the past two years.,,http://digbysblog.blogspot.com/2011/01/nothing-to-see-here-folks.html,,c1c4x9g,2011-01-10 06:27:56,AmericanPolitics,t3_ez8pz,t1_c1c4x9g,t5_2qied,t3_ez8pz,1426684000.0,[deleted],"when do we finally classify the tea party as a domestic terror organization? \n\nthe only thing make me want to do is buy a bunch of guns and ammo to protect myself from them, not from the government.",2
4,[deleted],ezwgp,t3_ezwgp,2011-01-11 01:06:47,t5_2qied,/r/AmericanPolitics/comments/ezwgp/the_christian_science_monitor_is_essentially_a/,"The Christian Science monitor is essentially a right wing (feigned centrism, neutrality) magazine. Not only do they use the Tucson shooting as an excuse to perpetuate false equivalencies about Right v. Left, they also demonize Marijuana",,http://news.yahoo.com/s/csm/20110110/cm_csm/355327;_ylt=AsHWAWG31LmPrfLaAplF1AGs0NUE;_ylu=X3oDMTFlb20wOXZvBHBvcwMyMTgEc2VjA2FjY29yZGlvbl9vcGluaW9uBHNsawNhcml6b25hc2hvb3Q-,,c1c8i22,2011-01-11 01:07:21,AmericanPolitics,t3_ezwgp,t1_c1c8i22,t5_2qied,t3_ezwgp,1426685000.0,[deleted],[deleted],1


In [8]:
final_df = final_df.drop(['retrieved_on_x', 'retrieved_on_y'], axis=1)

final_df.head()

Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,author_y,body,score
0,democracy101,ew8wk,t3_ew8wk,2011-01-04 22:36:37,t5_2qied,/r/AmericanPolitics/comments/ew8wk/why_did_the_republicans_not_wait_for_the_new/,Why did the Republicans not wait for the new Congress when they would have had a majority instead of reaching a compromise with Obama now?,,http://www.nullifynow.com/2011/01/why-didnt-they-wait/,c1bggr0,2011-01-05 03:57:49,AmericanPolitics,t3_ew8wk,t1_c1bggr0,t5_2qied,t3_ew8wk,RubyBlye,They didn't have to wait. They got what they wanted; extended tax cuts for the rich.,1
1,quasiperiodic,ex8o1,t3_ex8o1,2011-01-06 14:06:50,t5_2qied,/r/AmericanPolitics/comments/ex8o1/end_of_life_planning_axed_so_that_americans_can/,"end of life planning axed so that americans can die slowly, painfully, in senility, while paying big pharma big bucks.",,http://www.prospect.org/csnc/blogs/tapped_archive?month=01&amp;year=2011&amp;base_name=white_house_cowardice_on_healt,c1bo686,2011-01-06 18:20:10,AmericanPolitics,t3_ex8o1,t1_c1bo686,t5_2qied,t3_ex8o1,[deleted],"you mean he removed the mandatory death panels for seniors on government healthcare? victory for america, more like it!\n\n... /s",1
2,[deleted],ex65h,t3_ex65h,2011-01-06 10:08:23,t5_2qied,/r/AmericanPolitics/comments/ex65h/huck_finn_is_past_copyright_you_can_make_your_own/,"HUCK FINN is past copyright. You can make your \r\nown version using the ""N"" word or saying they are \r\n""Zombies"".",,http://www.reddit.com/r/AmericanPolitics/comments/ex65h/huck_finn_is_past_copyright_you_can_make_your_own/,c1bmqf9,2011-01-06 10:13:31,AmericanPolitics,t3_ex65h,t1_c1bmqf9,t5_2qied,t3_ex65h,rcadestaint,"This is just some publisher trying to make a politically correct version, and trying to get people all pissed off so he/she can sell copies. You can make your own version. If you need a reference copy, let me know. I think I have four or five copies sitting on my bookshelf.",1
3,shallah,ez8pz,t3_ez8pz,2011-01-10 01:33:39,t5_2qied,/r/AmericanPolitics/comments/ez8pz/nothing_to_see_here_folks_just_this_is_an/,Nothing to see here folks ... Just this is an astonishing list of violent rhetoric and political violence over the past two years.,,http://digbysblog.blogspot.com/2011/01/nothing-to-see-here-folks.html,c1c4x9g,2011-01-10 06:27:56,AmericanPolitics,t3_ez8pz,t1_c1c4x9g,t5_2qied,t3_ez8pz,[deleted],"when do we finally classify the tea party as a domestic terror organization? \n\nthe only thing make me want to do is buy a bunch of guns and ammo to protect myself from them, not from the government.",2
4,[deleted],ezwgp,t3_ezwgp,2011-01-11 01:06:47,t5_2qied,/r/AmericanPolitics/comments/ezwgp/the_christian_science_monitor_is_essentially_a/,"The Christian Science monitor is essentially a right wing (feigned centrism, neutrality) magazine. Not only do they use the Tucson shooting as an excuse to perpetuate false equivalencies about Right v. Left, they also demonize Marijuana",,http://news.yahoo.com/s/csm/20110110/cm_csm/355327;_ylt=AsHWAWG31LmPrfLaAplF1AGs0NUE;_ylu=X3oDMTFlb20wOXZvBHBvcwMyMTgEc2VjA2FjY29yZGlvbl9vcGluaW9uBHNsawNhcml6b25hc2hvb3Q-,c1c8i22,2011-01-11 01:07:21,AmericanPolitics,t3_ezwgp,t1_c1c8i22,t5_2qied,t3_ezwgp,[deleted],[deleted],1


In [9]:
final_df = final_df.drop(['author_x', 'id_x', 'name_x', 'subreddit_id_x',
                          'permalink', 'url', 'id_y', 'name_y', 'subreddit_id_y',
                          'parent_id', 'author_y'], axis=1)


In [10]:
final_df.columns

Index(['created_utc_x', 'title', 'selftext', 'created_utc_y', 'subreddit',
       'link_id', 'body', 'score'],
      dtype='object')

In [11]:
import re
import pandas as pd

# Define the dictionary mapping variations to the standard form
name_variations = {
  r'\b(?:george\s+w\.\s+bush|bush|george(?:\s+w\.\s+bush)?)\b': 'George W. Bush',
  r'\b(?:barack\s+obama|obama|barack(?:\s+obama)?)\b': 'Barack Obama',
  r'\b(?:bill\s+clinton|clinton|bill(?:\s+clinton)?)\b': 'Bill Clinton',
  r'\b(?:donald\s+trump|trump|donald(?:\s+trump)?)\b': 'Donald Trump',
  r'\b(?:joe\s+biden|biden|joe(?:\s+biden)?)\b': 'Joe Biden',  
}


# Function to replace variations with standard names
def standardize_names(text):
    for pattern, standard_name in name_variations.items():
        text = re.sub(pattern, standard_name, text, flags=re.IGNORECASE)
    return text


In [12]:
# Import necessary libraries
import spacy
from spacy import displacy
import pandas as pd
import re
from textblob import TextBlob

# Download the small English model for spaCy
spacy.cli.download("en_core_web_sm")

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract persons (already standardized)
def extract_persons(text):
    # Combine all standardized names into one pattern
    pattern = r'\b(?:' + '|'.join([re.escape(name) for name in name_variations.values()]) + r')\b'
    # Find all matches of persons' names in the text
    persons_found = re.findall(pattern, text, flags=re.IGNORECASE)
    # Return unique persons' names
    return list(set(persons_found))



# Apply the extract_persons function to 'title', 'selftext', and 'body' columns
final_df['persons_title'] = final_df['title'].apply(extract_persons)
final_df['persons_selftext'] = final_df['selftext'].apply(extract_persons)
final_df['persons_body'] = final_df['body'].apply(extract_persons)

# Filter the DataFrame to include only rows where either 'persons_title' or 'persons_selftext' or 'persons_body' is not empty
filtered_df = final_df[(final_df['persons_title'].apply(len) > 0) | 
                       (final_df['persons_selftext'].apply(len) > 0) |
                       (final_df['persons_body'].apply(len) > 0)]

# # Compute sentiments for 'title', 'selftext', and 'body'
# filtered_df['sentiment_title'] = filtered_df['title'].apply(get_sentiment)
# filtered_df['sentiment_selftext'] = filtered_df['selftext'].apply(get_sentiment)
# filtered_df['sentiment_body'] = filtered_df['body'].apply(get_sentiment)

# Display the filtered DataFrame with sentiments
filtered_df.head()


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,created_utc_x,title,selftext,created_utc_y,subreddit,link_id,body,score,persons_title,persons_selftext,persons_body
92,2011-02-14 20:35:35,Should Donald Trump run for president?,,2011-02-14 22:41:26,AmericanPolitics,t3_flcgc,"Should the billionaire that made millions run for president?\n\nHe wouldn't win, not with a comb-over like that.",1,[Donald Trump],[],[]
255,2011-04-20 01:18:06,"Donald Trump is such an embarrassment to American Conservatives, they're accusing him of being a plant by the Democrats (video)",,2011-04-20 12:55:47,AmericanPolitics,t3_gu2an,donald trump is an robot controlled by an alien that looks like a toupee,2,[Donald Trump],[],[donald trump]
256,2011-04-20 01:18:06,"Donald Trump is such an embarrassment to American Conservatives, they're accusing him of being a plant by the Democrats (video)",,2011-04-20 20:22:13,AmericanPolitics,t3_gu2an,Everyone no matter what your party should vote for Ron Paul,1,[Donald Trump],[],[]
440,2011-06-28 08:54:25,Borrowing and spending the GOP way -- The big deficit facing the U.S. is mostly Republican in origin,,2011-06-28 12:13:43,AmericanPolitics,t3_ib5c3,"&gt;Consider the two signature GOP policies of George W. Bush's presidency: the wars and the tax cuts. Including debt service costs, Bush's wars have cost about $1.7 trillion to date. \n\nNow that's odd. They left out Bush's medicare prescription drug benefits: \n&gt;The White House released budget figures yesterday indicating that the new Medicare prescription drug benefit **will cost more than $1.2 trillion in the coming decade**, a much higher price tag than President Bush suggested when he narrowly won passage of the law in late 2003. \n[source](http://www.washingtonpost.com/wp-dyn/articles/A9328-2005Feb8.html) (written in 2005)\n\n That's a huge expense.\n\nNow I wonder why they would leave that out of the article? Aren't Republicans proud of Medicare Part D? Shouldn't the people know what other big expenses are adding to the deficit?\n\nMaybe this is just another one of those ""blame the Republicans"" articles so that the Democrats will feel like *their* guys are doing something that should be supported?",2,[],[],[George W. Bush]
453,2011-06-30 17:13:34,MSNBC suspends journalist over Barack Obama insult -- US cable news channel takes action after Mark Halperin calls US president 'kind of a dick',,2011-06-30 17:40:27,AmericanPolitics,t3_iderp,"So is it the fact that he said ""dick"" instead of saying ""I felt the president was being a little condescending and abrasive?""\n\nI think that is kind of ridiculous. We all know what ""kind of a dick"" means, and it's not as much of an insult as it is a criticism of his interactions with reporters in this particular press conference.",3,[Barack Obama],[],[]


In [13]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4536 entries, 92 to 59084
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_utc_x     4536 non-null   datetime64[ns]
 1   title             4536 non-null   object        
 2   selftext          4536 non-null   object        
 3   created_utc_y     4536 non-null   datetime64[ns]
 4   subreddit         4536 non-null   object        
 5   link_id           4536 non-null   object        
 6   body              4536 non-null   object        
 7   score             4536 non-null   int64         
 8   persons_title     4536 non-null   object        
 9   persons_selftext  4536 non-null   object        
 10  persons_body      4536 non-null   object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 425.2+ KB


In [14]:
filtered_df.to_csv('C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\AmericanPolitics_ner.csv')