In [1]:
# Essentials
import datetime
import requests
import pandas as pd
import numpy as np
import re
import time

#sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
"""
Pulls up to 100 comments per day offset containing `query` from the
Pushshift comment search API.

Outputs as 'comment_list'; each entry is
[body, subreddit, created_utc, score]:

body        = comment text
subreddit   = subreddit the comment was posted in
created_utc = time of comment creation (epoch seconds)
score       = reddit score for comment

Failed requests are reported and skipped so one bad day does not abort
the whole scrape.
"""

query = "Biden"  # set your query
comment_list = []
days_to_scrape = 365

api_url = "https://api.pushshift.io/reddit/search/comment/"
comment_fields = ('body', 'subreddit', 'created_utc', 'score')
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the loop
request_pause = 0.5    # seconds between requests; 0.5s is working, could try lower

# range size determines # of days scraped
for i in range(0, days_to_scrape + 1):
    resp = None
    try:
        # params= lets requests URL-encode the query, so queries containing
        # spaces or '&' still produce a valid URL.
        # size capped at 100 per request
        resp = requests.get(
            api_url,
            params={'q': query, 'after': f"{i}d", 'size': 100},
            timeout=request_timeout,
        )
    except requests.RequestException as err:
        print(f"request for after={i}d failed: {err}")

    if resp is not None:
        # resp.ok is True for any status code below 400, False for 4xx/5xx
        if resp.ok:
            try:
                api_data = resp.json().get('data', [])
            except ValueError:
                # 2xx response whose body is not valid JSON
                print(f"could not decode response for after={i}d")
                api_data = []
            for comment in api_data:
                comment_list.append([comment[field] for field in comment_fields])
        else:
            print(resp)

    # Throttle after every request, not only successful ones: skipping the
    # pause on errors would hammer the API exactly when it is rejecting us.
    time.sleep(request_pause)

    if i == round(days_to_scrape/2):
        print('50%')

    if i == round(days_to_scrape/1.33):
        print('75%')

50%
75%


In [3]:
"""
Load the scraped comments into a dataframe.

Light cleaning only: exact duplicate rows (seen in larger pulls) are removed.

"""

column_names = ['body', 'subreddit', 'created_utc', 'score']
data = pd.DataFrame(comment_list, columns=column_names)
# Cause of the duplicates is unknown -- possibly days with <100 comments
# are reaching into the day prior.
dupe_count = data.duplicated().sum()
print(f'number of dupes dropped:{dupe_count}')
data = data.drop_duplicates(ignore_index=True)
data

number of dupes dropped:0


Unnamed: 0,body,subreddit,created_utc,score
0,do tell us on how biden plans to raise the min...,WhitePeopleTwitter,1615924597,-7
1,&gt; IF the current administration ends up fog...,personalfinance,1615924618,1
2,"For the record, I didn't see anyone in the vid...",TikTokCringe,1615924624,1
3,Maybe “Biden” will pay for it,mercedes_benz,1615924638,1
4,Most of them are on paid leave of absence. Hav...,IRS,1615924645,1
...,...,...,...,...
495,Really funny that Trump basically did this as ...,PoliticalCompassMemes,1615579347,3
496,Do they even get that? Biden isn’t the one de...,Conservative,1615579364,-2
497,"Following in Trumps footsteps, Biden has not d...",politics,1615579377,1
498,[How many badly needed respirators has Biden e...,PoliticalHumor,1615579384,1


In [4]:
"""
Initialize Vader for sentiment analysis

Creates one SentimentIntensityAnalyzer instance and binds it to `analyzer`
so the scoring cell can reuse it for every comment.
"""
# constructed with no arguments
analyzer = SentimentIntensityAnalyzer()

In [5]:
"""
-Adding sentiment analysis to data
-Extracting date from utc format

Works on a copy of `data`, adding three columns:
sentiment = full score dict returned by analyzer.polarity_scores
compound  = the 'compound' entry of that dict
date      = calendar date (UTC) of created_utc
"""
df = data.copy()
# produce sentiment analysis on body
df['sentiment'] = df['body'].apply(analyzer.polarity_scores)
# extract compound sentiment as col
df['compound'] = df['sentiment'].apply(lambda scores: scores.get('compound'))
# produce date from epoch time
# created_utc is epoch seconds in UTC; convert explicitly in UTC so the date
# does not depend on the timezone of the machine running the notebook
# (datetime.date.fromtimestamp would use the local timezone).
df['date'] = pd.to_datetime(df['created_utc'], unit='s', utc=True).dt.date
In [6]:
# Saving data for future use.
#
# Disabled by default so that re-running the notebook never overwrites an
# existing file; set SAVE_DATA = True to write the CSV.
SAVE_DATA = False
OUTPUT_CSV = 'biden_data.csv'  # relative to the notebook's working directory

if SAVE_DATA:
    # index=False keeps the row index out of the file, so reading it back
    # does not add an extra 'Unnamed: 0' column.
    df.to_csv(OUTPUT_CSV, index=False)

'\nSaving data for future use.\n'

In [7]:
"""
Load the previously saved CSV instead of re-scraping, to save time.
"""

# NOTE(review): this rebinds `df`, replacing the frame built from the fresh
# scrape in the earlier cell -- confirm that is intended.
# NOTE(review): the displayed frame carries 'Unnamed: 0' / 'Unnamed: 0.1'
# columns, presumably row indexes written on earlier saves -- verify.
saved_csv = 'data.csv'
df = pd.read_csv(saved_csv)
df

Unnamed: 0.1,Unnamed: 0,body,subreddit,created_utc,score,sentiment,compound,date
0,0,I don't think that the founders had any idea w...,ParlerWatch,1615758489,1,"{'neg': 0.126, 'neu': 0.779, 'pos': 0.095, 'co...",-0.9146,2021-03-14
1,1,and every us president since except trump.,PoliticalHumor,1615758490,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,2021-03-14
2,2,"Trump is gone, Biden removed DFC, it is alread...",europe,1615758501,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,2021-03-14
3,3,"No I wasn't aware of that, I'm not a Biden fan...",AskReddit,1615758513,1,"{'neg': 0.148, 'neu': 0.736, 'pos': 0.116, 'co...",-0.7231,2021-03-14
4,4,They would not speak to trump either for the p...,worldnews,1615758515,1,"{'neg': 0.203, 'neu': 0.797, 'pos': 0.0, 'comp...",-0.6222,2021-03-14
...,...,...,...,...,...,...,...,...
35577,35577,It's been going on for several months but it s...,trump,1584309542,1,"{'neg': 0.17, 'neu': 0.692, 'pos': 0.138, 'com...",-0.8343,2020-03-15
35578,35578,Trump is my daddy.,wallstreetbets,1584309542,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,2020-03-15
35579,35579,Fuck trump. POS,EverythingScience,1584309543,1,"{'neg': 0.636, 'neu': 0.364, 'pos': 0.0, 'comp...",-0.5423,2020-03-15
35580,35580,&gt;It would be the official end of America as...,politics,1584309544,1,"{'neg': 0.0, 'neu': 0.78, 'pos': 0.22, 'compou...",0.7351,2020-03-15
