# importing Data and Loading Submission File

In [1]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\obama_submissions'

# Call the function to read the zst folder into a DataFra
df_submissions = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


# Now you can work with the DataFrame (df)
df_submissions = df_submissions[['author', 'id','name', 'created_utc', 'subreddit_id', 'permalink', 'title', 'selftext', 'url', 'retrieved_on']]
df_submissions.head()

  from pandas.core import (


Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
0,hakku,1a8q8,,1173894779,t5_1a8ah,/r/obama/comments/1a8q8/obama_to_serve_as_snhus_2007_commencement_speaker/,Obama to Serve as SNHU’s 2007 Commencement Speaker,,http://www.snhu.edu/6736.asp,1522508000.0
1,r0gue,1a8qk,,1173894868,t5_1a8ah,/r/obama/comments/1a8qk/obama_on_the_senate_floor_speaking_on_military/,Obama on the Senate floor speaking on military tribunal bill [vid],,http://www.youtube.com/watch?v=dc6I3jnTRe0,1522508000.0
2,r0gue,1a8ss,,1173895255,t5_1a8ah,/r/obama/comments/1a8ss/obama_in_front_of_20000_supporters_in_austin_vid/,"Obama in front of 20,000+ supporters in Austin [vid]",,http://youtube.com/watch?v=NLvBXa-MONQ,1522508000.0
3,arisbe,1a8tq,,1173895364,t5_1a8ah,/r/obama/comments/1a8tq/historians_unearth_obamas_irish_roots/,Historians unearth Obama’s Irish roots,,http://www.timesonline.co.uk/tol/news/world/us_and_americas/article1512094.ece,1522508000.0
4,arisbe,1a8tu,,1173895390,t5_1a8ah,/r/obama/comments/1a8tu/will_you_leave_obama_alone_already/,Will you leave Obama alone already?,,http://www.haaretz.com/hasen/pages/rosnerBlog.jhtml?itemNo=837813&amp;contrassID=25&amp;subContrassID=0&amp;sbSubContrassID=1&amp;listSrc=Y&amp;art=1,1522508000.0


In [2]:
df_submissions.tail()

Unnamed: 0,author,id,name,created_utc,subreddit_id,permalink,title,selftext,url,retrieved_on
40180,BlankVerse,zxixjn,t3_zxixjn,1672256722,t5_1a8ah,/r/obama/comments/zxixjn/twitter_reacts_to_barack_obamas_favourite_music/,Twitter Reacts to Barack Obama's Favourite Music of 2022,,https://exclaim.ca/music/article/twitter_reacts_to_barack_obamas_favourite_music_of_2022,1673171000.0
40181,[deleted],zxldfv,t3_zxldfv,1672262446,t5_1a8ah,/r/obama/comments/zxldfv/trump_said_black_people_built_america/,Trump said black people Built America,[removed],,1673171000.0
40182,BlankVerse,zyce89,t3_zyce89,1672338467,t5_1a8ah,/r/obama/comments/zyce89/michelle_obama_could_not_stand_barack_during/,Michelle Obama 'could not stand' Barack during first 10 years of raising 'terrorist' children,,https://www.telegraph.co.uk/world-news/2022/12/29/michelle-obama-hated-barack-first-10-years-raising-terrorist/,1673170000.0
40183,BlankVerse,zz694h,t3_zz694h,1672421484,t5_1a8ah,/r/obama/comments/zz694h/barack_obama_mourns_pelé_in_emotional_condolence/,"Barack Obama Mourns Pelé in Emotional Condolence Message: ""Everyone Loved and Admired Him""",,https://www.tuko.co.ke/sports/football/488802-barack-obama-mourns-pele-emotional-condolence-message-loved-admired/,1673169000.0
40184,BlankVerse,zzyrka,t3_zzyrka,1672506838,t5_1a8ah,/r/obama/comments/zzyrka/michelle_obama_shares_kiss_with_barack_under/,Michelle Obama Shares Kiss with Barack Under Mistletoe in Christmas Photo,,https://people.com/politics/michelle-obama-shares-kiss-under-mistletoe-barack-christmas-photo/,1673168000.0


In [3]:
df_submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40185 entries, 0 to 40184
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   author        40185 non-null  object 
 1   id            40185 non-null  object 
 2   name          14135 non-null  object 
 3   created_utc   40185 non-null  object 
 4   subreddit_id  40185 non-null  object 
 5   permalink     40185 non-null  object 
 6   title         40185 non-null  object 
 7   selftext      40185 non-null  object 
 8   url           40176 non-null  object 
 9   retrieved_on  33029 non-null  float64
dtypes: float64(1), object(9)
memory usage: 3.1+ MB


# Importing Data and Loading Comment File

In [4]:
import pandas as pd
import zstandard
import os
import json

def read_zst_folder(folder_path):
    # Initialize an empty list to store DataFrame objects
    dfs = []

    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".zst"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read and decompress the zst file
            with open(file_path, 'rb') as f:
                decompressor = zstandard.ZstdDecompressor()
                with decompressor.stream_reader(f) as reader:
                    decompressed_data = reader.read()

            # Decode the decompressed data as JSON
            json_data = decompressed_data.decode('utf-8')
            
            # Parse the JSON data into a Python object (list of dictionaries)
            data = [json.loads(line) for line in json_data.split('\n') if line.strip()]

            # Convert the data to a DataFrame
            df = pd.DataFrame(data)
            
            # Append the DataFrame to the list
            dfs.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

# Specify the path to the folder containing zst files
folder_path = 'C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\obama_comments'
# Call the function to read the zst folder into a DataFrame
df_comments = read_zst_folder(folder_path)
# Set display option to show all columns
pd.set_option('display.max_columns', None)

# Now you can work with the DataFrame (df)
# df_comments.head()
df_comments = df_comments[['id', 'created_utc', 'subreddit', 'link_id', 'name', 'subreddit_id',
       'parent_id', 'retrieved_on', 'author', 'body', 'score']]
df_comments.head()


Unnamed: 0,id,created_utc,subreddit,link_id,name,subreddit_id,parent_id,retrieved_on,author,body,score
0,c1at2q,1174085570,obama,t3_1at2j,,t5_1a8ah,t3_1at2j,1473736000.0,r0gue,"We know Hillary is going to have deep pockets in this race, but ties to Texas aren't winning the hearts and minds of any Dems these days.",1
1,c1lwbq,1177855675,obama,t3_1ev8t,,t5_1a8ah,t3_1ev8t,1473744000.0,[deleted],[removed],0
2,c1lwe9,1177856082,obama,t3_1lwd6,,t5_1a8ah,t3_1lwd6,1473744000.0,[deleted],[deleted],-9
3,c1lwox,1177858807,obama,t3_1lwd6,,t5_1a8ah,t3_1lwd6,1473744000.0,hillbilly1,Can someone explain to me why someone who is in favor of free markets does not like NAFTA?,7
4,c1lwum,1177860295,obama,t3_1lwd6,,t5_1a8ah,t1_c1lwox,1473744000.0,reallythateasy,The proliferation of human rights abuses and the de-industrialization of the US might factor into his decision. From a theory standpoint globalization would work a lot more smoothly for the little guy if the different populations involved were closer in terms of education and income. Occasionally you will see a free market supporter criticize the speed at which policies supporting international trade are being adopted.,2


In [5]:
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139860 entries, 0 to 139859
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            139860 non-null  object 
 1   created_utc   139860 non-null  object 
 2   subreddit     139860 non-null  object 
 3   link_id       139860 non-null  object 
 4   name          126727 non-null  object 
 5   subreddit_id  139860 non-null  object 
 6   parent_id     139860 non-null  object 
 7   retrieved_on  139173 non-null  float64
 8   author        139860 non-null  object 
 9   body          139860 non-null  object 
 10  score         139860 non-null  int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 11.7+ MB


In [6]:
# Merge final_subs_df and final_comments_df on "name" and "link_id"
df = df_submissions.merge(df_comments, how='inner', left_on='name', right_on='link_id')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44981 entries, 0 to 44980
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   author_x        44981 non-null  object 
 1   id_x            44981 non-null  object 
 2   name_x          44981 non-null  object 
 3   created_utc_x   44981 non-null  object 
 4   subreddit_id_x  44981 non-null  object 
 5   permalink       44981 non-null  object 
 6   title           44981 non-null  object 
 7   selftext        44981 non-null  object 
 8   url             44979 non-null  object 
 9   retrieved_on_x  15151 non-null  float64
 10  id_y            44981 non-null  object 
 11  created_utc_y   44981 non-null  object 
 12  subreddit       44981 non-null  object 
 13  link_id         44981 non-null  object 
 14  name_y          38025 non-null  object 
 15  subreddit_id_y  44981 non-null  object 
 16  parent_id       44981 non-null  object 
 17  retrieved_on_y  44294 non-null 

In [7]:
final_df = df.copy()

# Convert timestamp columns to datetime format
timestamp_columns = ['created_utc_x', 'created_utc_y']
for col in timestamp_columns:
    final_df[col] = pd.to_datetime(final_df[col], unit='s')
    
final_df.head()    

  final_df[col] = pd.to_datetime(final_df[col], unit='s')
  final_df[col] = pd.to_datetime(final_df[col], unit='s')


Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,retrieved_on_x,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,retrieved_on_y,author_y,body,score
0,[deleted],ev8uu,t3_ev8uu,2011-01-03 04:40:14,t5_1a8ah,/r/obama/comments/ev8uu/darrell_issa_takes_aim_at_obama_white_house_one/,Darrell Issa Takes Aim At Obama White House: 'One Of The Most Corrupt Administrations',,http://www.huffingtonpost.com/2011/01/02/darrell-issa-obama-corrupt_n_803331.html,,c1b8fxz,2011-01-03 05:17:00,obama,t3_ev8uu,t1_c1b8fxz,t5_1a8ah,t3_ev8uu,1426668000.0,tactlesswonder,WTF. You know you pissed off some people in power when your tagged as corrupt in your second year.,0
1,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,,c1bg0xd,2011-01-05 01:05:41,obama,t3_ew0j9,t1_c1bg0xd,t5_1a8ah,t3_ew0j9,1426672000.0,[deleted],I wish they'd conduct polls at my office. I'd like to know what my job approval rating is. I wonder how low it could get before I'd be fired.,6
2,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,,c1bg15v,2011-01-05 01:07:21,obama,t3_ew0j9,t1_c1bg15v,t5_1a8ah,t3_ew0j9,1426672000.0,Popperian,He lost me as of irradiate &amp; grope (which he approved directly).\n\nI'm dying to know who the 2012 Republican is going to be. Anyone have any idea?,-1
3,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,,c1bg2yu,2011-01-05 01:21:26,obama,t3_ew0j9,t1_c1bg2yu,t5_1a8ah,t1_c1bg15v,1426672000.0,eviljack,"Last I heard, something like 40% of republicans were hoping it would be Sarah Palin, whereas 100% of democrats were hoping it would be Sarah Palin.",6
4,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,,c1bg3kr,2011-01-05 01:29:03,obama,t3_ew0j9,t1_c1bg3kr,t5_1a8ah,t1_c1bg2yu,1426672000.0,Popperian,"I'd be surprised if it's Sarah Palin. They already tried the ""woman-like-me"" approach and it was a pretty huge failure.\n\nWho would be #2?",1


In [8]:
final_df = final_df.drop(['retrieved_on_x', 'retrieved_on_y'], axis=1)

final_df.head()

Unnamed: 0,author_x,id_x,name_x,created_utc_x,subreddit_id_x,permalink,title,selftext,url,id_y,created_utc_y,subreddit,link_id,name_y,subreddit_id_y,parent_id,author_y,body,score
0,[deleted],ev8uu,t3_ev8uu,2011-01-03 04:40:14,t5_1a8ah,/r/obama/comments/ev8uu/darrell_issa_takes_aim_at_obama_white_house_one/,Darrell Issa Takes Aim At Obama White House: 'One Of The Most Corrupt Administrations',,http://www.huffingtonpost.com/2011/01/02/darrell-issa-obama-corrupt_n_803331.html,c1b8fxz,2011-01-03 05:17:00,obama,t3_ev8uu,t1_c1b8fxz,t5_1a8ah,t3_ev8uu,tactlesswonder,WTF. You know you pissed off some people in power when your tagged as corrupt in your second year.,0
1,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,c1bg0xd,2011-01-05 01:05:41,obama,t3_ew0j9,t1_c1bg0xd,t5_1a8ah,t3_ew0j9,[deleted],I wish they'd conduct polls at my office. I'd like to know what my job approval rating is. I wonder how low it could get before I'd be fired.,6
2,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,c1bg15v,2011-01-05 01:07:21,obama,t3_ew0j9,t1_c1bg15v,t5_1a8ah,t3_ew0j9,Popperian,He lost me as of irradiate &amp; grope (which he approved directly).\n\nI'm dying to know who the 2012 Republican is going to be. Anyone have any idea?,-1
3,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,c1bg2yu,2011-01-05 01:21:26,obama,t3_ew0j9,t1_c1bg2yu,t5_1a8ah,t1_c1bg15v,eviljack,"Last I heard, something like 40% of republicans were hoping it would be Sarah Palin, whereas 100% of democrats were hoping it would be Sarah Palin.",6
4,joelrw,ew0j9,t3_ew0j9,2011-01-04 15:11:22,t5_1a8ah,/r/obama/comments/ew0j9/obama_job_approval_reaches_50_for_first_time/,Obama Job Approval Reaches 50% for First Time Since Spring,,http://www.gallup.com/poll/145442/Obama-Job-Approval-Reaches-First-Time-Spring.aspx?utm_source=add%2Bthis&amp;utm_medium=addthis.com&amp;utm_campaign=sharing&amp;utm_term=Obama-Job-Approval-Reaches-First-Time-Spring,c1bg3kr,2011-01-05 01:29:03,obama,t3_ew0j9,t1_c1bg3kr,t5_1a8ah,t1_c1bg2yu,Popperian,"I'd be surprised if it's Sarah Palin. They already tried the ""woman-like-me"" approach and it was a pretty huge failure.\n\nWho would be #2?",1


In [9]:
final_df = final_df.drop(['author_x', 'id_x', 'name_x', 'subreddit_id_x',
                          'permalink', 'url', 'id_y', 'name_y', 'subreddit_id_y',
                          'parent_id', 'author_y'], axis=1)


In [10]:
final_df.columns

Index(['created_utc_x', 'title', 'selftext', 'created_utc_y', 'subreddit',
       'link_id', 'body', 'score'],
      dtype='object')

In [11]:
import re
import pandas as pd

# Define the dictionary mapping variations to the standard form
name_variations = {
  r'\b(?:george\s+w\.\s+bush|bush|george(?:\s+w\.\s+bush)?)\b': 'George W. Bush',
  r'\b(?:barack\s+obama|obama|barack(?:\s+obama)?)\b': 'Barack Obama',
  r'\b(?:bill\s+clinton|clinton|bill(?:\s+clinton)?)\b': 'Bill Clinton',
  r'\b(?:donald\s+trump|trump|donald(?:\s+trump)?)\b': 'Donald Trump',
  r'\b(?:joe\s+biden|biden|joe(?:\s+biden)?)\b': 'Joe Biden',  
}


# Function to replace variations with standard names
def standardize_names(text):
    for pattern, standard_name in name_variations.items():
        text = re.sub(pattern, standard_name, text, flags=re.IGNORECASE)
    return text


In [12]:
# Import necessary libraries
import spacy
from spacy import displacy
import pandas as pd
import re
from textblob import TextBlob

# Download the small English model for spaCy
spacy.cli.download("en_core_web_sm")

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract persons (already standardized)
def extract_persons(text):
    # Combine all standardized names into one pattern
    pattern = r'\b(?:' + '|'.join([re.escape(name) for name in name_variations.values()]) + r')\b'
    # Find all matches of persons' names in the text
    persons_found = re.findall(pattern, text, flags=re.IGNORECASE)
    # Return unique persons' names
    return list(set(persons_found))



# Apply the extract_persons function to 'title', 'selftext', and 'body' columns
final_df['persons_title'] = final_df['title'].apply(extract_persons)
final_df['persons_selftext'] = final_df['selftext'].apply(extract_persons)
final_df['persons_body'] = final_df['body'].apply(extract_persons)

# Filter the DataFrame to include only rows where either 'persons_title' or 'persons_selftext' or 'persons_body' is not empty
filtered_df = final_df[(final_df['persons_title'].apply(len) > 0) | 
                       (final_df['persons_selftext'].apply(len) > 0) |
                       (final_df['persons_body'].apply(len) > 0)]

# # Compute sentiments for 'title', 'selftext', and 'body'
# filtered_df['sentiment_title'] = filtered_df['title'].apply(get_sentiment)
# filtered_df['sentiment_selftext'] = filtered_df['selftext'].apply(get_sentiment)
# filtered_df['sentiment_body'] = filtered_df['body'].apply(get_sentiment)

# Display the filtered DataFrame with sentiments
filtered_df.head()


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,created_utc_x,title,selftext,created_utc_y,subreddit,link_id,body,score,persons_title,persons_selftext,persons_body
88,2011-01-07 02:09:14,"Put on your seatbelt, Mr. President",,2011-01-07 10:06:15,obama,t3_exmny,"But, from the article: \n \n&gt; Both the president’s and Vice President Joe Biden’s motorcades were involved in accidents last year.",1,[],[],[Joe Biden]
103,2011-01-08 10:31:29,"For President Obama, 2012 Is Here: After a rough two years in office from a popularity standpoint, Barack Obama is starting to remind us all why no politician in the modern United States has risen as quickly and in as unlikely a fashion as he has.",,2011-01-08 16:41:37,obama,t3_eyezp,Obama is more popular than Reagan or Clinton was at the start of his third year:\n http://online.wsj.com/public/resources/documents/info-presapp0605-31.html,6,[Barack Obama],[],[]
104,2011-01-08 10:31:29,"For President Obama, 2012 Is Here: After a rough two years in office from a popularity standpoint, Barack Obama is starting to remind us all why no politician in the modern United States has risen as quickly and in as unlikely a fashion as he has.",,2011-01-08 18:16:21,obama,t3_eyezp,Obama's power has been underestimation. Discount him at your own risk. He begs you.,4,[Barack Obama],[],[]
105,2011-01-08 10:31:29,"For President Obama, 2012 Is Here: After a rough two years in office from a popularity standpoint, Barack Obama is starting to remind us all why no politician in the modern United States has risen as quickly and in as unlikely a fashion as he has.",,2011-01-08 19:37:41,obama,t3_eyezp,Um..isn't his popularity at like 50% right now?,1,[Barack Obama],[],[]
154,2011-01-12 03:59:22,Obama to speak Wednesday in Arizona... but what to say?,,2011-01-13 00:17:39,obama,t3_f0o2k,"""**The words that commanders in chief choose in times of national distress can define their presidencies**. Historians cite Abraham Lincoln at Gettysburg in 1863; Ronald Reagan after the 1986 Challenger disaster; Bill Clinton after the Oklahoma City bombing in 1995; and **George W. Bush's National Cathedral speech in the aftermath of the Sept. 11, 2001**"" - I don't think that defined his presidency",1,[],[],"[Bill Clinton, George W. Bush]"


In [13]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5065 entries, 88 to 44979
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   created_utc_x     5065 non-null   datetime64[ns]
 1   title             5065 non-null   object        
 2   selftext          5065 non-null   object        
 3   created_utc_y     5065 non-null   datetime64[ns]
 4   subreddit         5065 non-null   object        
 5   link_id           5065 non-null   object        
 6   body              5065 non-null   object        
 7   score             5065 non-null   int64         
 8   persons_title     5065 non-null   object        
 9   persons_selftext  5065 non-null   object        
 10  persons_body      5065 non-null   object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 474.8+ KB


In [14]:
filtered_df.to_csv('C:\\Users\\HP\Desktop\\Middlesex Course Content\\Giovanni Proposed Projects\\US Political Subreddits Selection (19-07-2024)\\Selected Subreddits\\Whole Subreddits\\obama_ner.csv')