# Import packages

In [1]:
import pandas as pd
import requests
from psaw import PushshiftAPI
from datetime import datetime
import time

# Define function to retrieve comments from Pushshift API

In [212]:
def wsb_comments(start_date, num_year):
    month = 12
    year = int(num_year)
    start_dt = start_date # initial date
    dt_format = "%Y-%m-%d" # date format to convert
    df = pd.DataFrame()
    records = 0
    api = PushshiftAPI()
    
    for i in range(month*year):
        # calculate the end_date
        start_date = datetime.strptime(start_dt, dt_format)
        end_date = pd.Period(start_date,freq = 'M').end_time
    
        # convert end_date into str type
        end_dt = end_date.strftime(dt_format)
    
        # calculate start_epoch and end_epoch
        start_epoch = int(time.mktime(datetime.strptime(start_dt, dt_format).timetuple()))
        end_epoch = int(time.mktime(datetime.strptime(end_dt, dt_format).timetuple()))
    
        print("Start Time:", start_date, start_epoch)
        print("  End Time:", end_date, end_epoch)
        
        # extract comments from pushshift.io
        sub = list(api.search_comments(
                                        # define time range to search
                                        after=start_epoch,
                                        before=end_epoch,
                                        # define how the comments are sorted
                                        sort='desc',
                                        sort_type='score',
                                        # define subreddit to search
                                        subreddit='wallstreetbets',
                                        # define fields to return
                                        filter = ['author', 'body', 'created_utc', 'id', 'link_id', 
                                                  'score', 'stickied', 'subreddit', 'created', 'is_submitter'],
                                        limit=2000))
            
        # report total records collected
        records += len(sub)
        print("Extracted", records, "Comments.", '\n')
        
        # put it into a dataframe
        new_df = pd.DataFrame([thing.d_ for thing in sub])
        
         # combine records
        df = pd.concat([df, new_df], ignore_index = True)
    
        # add a month to start_date and turn it into str type
        start_date = start_date + pd.DateOffset(months = 1)
        start_dt = start_date.strftime(dt_format)
        
        # wait for 3 seconds
        time.sleep(3)
        
    # convert date format
    df['created_utc'] = pd.to_datetime(df['created_utc'],unit='s')
    df['created'] = df['created_utc'].dt.date
    
    return df

# Define function to retrieve submissions from Pushshift API

In [None]:
def wsb_submissions(start_date, num_year):
    month = 12
    year = int(num_year)
    start_dt = start_date # initial date
    dt_format = "%Y-%m-%d" # date format to convert
    df = pd.DataFrame()
    records = 0
    api = PushshiftAPI()
    
    for i in range(month*year):
        # calculate the end_date
        start_date = datetime.strptime(start_dt, dt_format)
        end_date = pd.Period(start_date,freq = 'M').end_time
    
        # convert end_date into str type
        end_dt = end_date.strftime(dt_format)
    
        # calculate start_epoch and end_epoch
        start_epoch = int(time.mktime(datetime.strptime(start_dt, dt_format).timetuple()))
        end_epoch = int(time.mktime(datetime.strptime(end_dt, dt_format).timetuple()))
    
        print("Start Time:", start_date, start_epoch)
        print("  End Time:", end_date, end_epoch)
        
        # extract submissions from pushshift.io
        sub = list(api.search_submissions(
                                        # define time range to search
                                        after=start_epoch,
                                        before=end_epoch,
                                        # define how the submissions are sorted
                                        sort='desc',
                                        sort_type='score',
                                        # define subreddit to search
                                        subreddit='wallstreetbets',
                                        # define fields to return
                                        filter = ['author', 'author_fullname', 'created_utc', 'full_link', 
                                                  'id','num_comments', 'selftext', 'stickied', 
                                                  'subreddit', 'title', 'created', 'removed_by_category'],
                                        limit=2000))
            
        # report total records collected
        records += len(sub)
        print("Extracted", records, "Submissions.", '\n')
        
        # put it into a dataframe
        new_df = pd.DataFrame([thing.d_ for thing in sub])
        
         # combine records
        df = pd.concat([df, new_df], ignore_index = True)
    
        # add a month to start_date and turn it into str type
        start_date = start_date + pd.DateOffset(months = 1)
        start_dt = start_date.strftime(dt_format)
        
        # wait for 3 seconds
        time.sleep(3)
        
    # convert date format
    df['created_utc'] = pd.to_datetime(df['created_utc'],unit='s')
    df['created'] = df['created_utc'].dt.date
    
    return df

# Retrieve 3 years of submissions and comments from r/wallstreetbets starting from 01/01/2019

In [213]:
submissions_19_21 = wsb_submissions('2019-01-01', 3)
comments_19_21 = wsb_comments('2019-01-01', 3)

Start Time: 2019-01-01 00:00:00 1546318800
  End Time: 2019-01-31 23:59:59.999999999 1548910800




Extracted 1703 Comments. 

Start Time: 2019-02-01 00:00:00 1548997200
  End Time: 2019-02-28 23:59:59.999999999 1551330000




Extracted 2888 Comments. 

Start Time: 2019-03-01 00:00:00 1551416400
  End Time: 2019-03-31 23:59:59.999999999 1554004800




Extracted 4195 Comments. 

Start Time: 2019-04-01 00:00:00 1554091200
  End Time: 2019-04-30 23:59:59.999999999 1556596800




Extracted 5044 Comments. 

Start Time: 2019-05-01 00:00:00 1556683200
  End Time: 2019-05-31 23:59:59.999999999 1559275200




Extracted 5815 Comments. 

Start Time: 2019-06-01 00:00:00 1559361600
  End Time: 2019-06-30 23:59:59.999999999 1561867200




Extracted 6749 Comments. 

Start Time: 2019-07-01 00:00:00 1561953600
  End Time: 2019-07-31 23:59:59.999999999 1564545600




Extracted 7452 Comments. 

Start Time: 2019-08-01 00:00:00 1564632000
  End Time: 2019-08-31 23:59:59.999999999 1567224000




Extracted 8669 Comments. 

Start Time: 2019-09-01 00:00:00 1567310400
  End Time: 2019-09-30 23:59:59.999999999 1569816000




Extracted 9710 Comments. 

Start Time: 2019-10-01 00:00:00 1569902400
  End Time: 2019-10-31 23:59:59.999999999 1572494400




Extracted 10794 Comments. 

Start Time: 2019-11-01 00:00:00 1572580800
  End Time: 2019-11-30 23:59:59.999999999 1575090000




Extracted 12360 Comments. 

Start Time: 2019-12-01 00:00:00 1575176400
  End Time: 2019-12-31 23:59:59.999999999 1577768400




Extracted 12950 Comments. 

Start Time: 2020-01-01 00:00:00 1577854800
  End Time: 2020-01-31 23:59:59.999999999 1580446800




Extracted 14007 Comments. 

Start Time: 2020-02-01 00:00:00 1580533200
  End Time: 2020-02-29 23:59:59.999999999 1582952400




Extracted 15042 Comments. 

Start Time: 2020-03-01 00:00:00 1583038800
  End Time: 2020-03-31 23:59:59.999999999 1585627200




Extracted 16587 Comments. 

Start Time: 2020-04-01 00:00:00 1585713600
  End Time: 2020-04-30 23:59:59.999999999 1588219200




Extracted 17706 Comments. 

Start Time: 2020-05-01 00:00:00 1588305600
  End Time: 2020-05-31 23:59:59.999999999 1590897600




Extracted 19449 Comments. 

Start Time: 2020-06-01 00:00:00 1590984000
  End Time: 2020-06-30 23:59:59.999999999 1593489600




Extracted 20998 Comments. 

Start Time: 2020-07-01 00:00:00 1593576000
  End Time: 2020-07-31 23:59:59.999999999 1596168000




Extracted 22392 Comments. 

Start Time: 2020-08-01 00:00:00 1596254400
  End Time: 2020-08-31 23:59:59.999999999 1598846400




Extracted 23898 Comments. 

Start Time: 2020-09-01 00:00:00 1598932800
  End Time: 2020-09-30 23:59:59.999999999 1601438400




Extracted 25027 Comments. 

Start Time: 2020-10-01 00:00:00 1601524800
  End Time: 2020-10-31 23:59:59.999999999 1604116800




Extracted 26134 Comments. 

Start Time: 2020-11-01 00:00:00 1604203200
  End Time: 2020-11-30 23:59:59.999999999 1606712400




Extracted 27871 Comments. 

Start Time: 2020-12-01 00:00:00 1606798800
  End Time: 2020-12-31 23:59:59.999999999 1609390800
Extracted 29171 Comments. 

Start Time: 2021-01-01 00:00:00 1609477200
  End Time: 2021-01-31 23:59:59.999999999 1612069200
Extracted 31168 Comments. 

Start Time: 2021-02-01 00:00:00 1612155600
  End Time: 2021-02-28 23:59:59.999999999 1614488400
Extracted 31940 Comments. 

Start Time: 2021-03-01 00:00:00 1614574800
  End Time: 2021-03-31 23:59:59.999999999 1617163200
Extracted 33922 Comments. 

Start Time: 2021-04-01 00:00:00 1617249600
  End Time: 2021-04-30 23:59:59.999999999 1619755200
Extracted 35818 Comments. 

Start Time: 2021-05-01 00:00:00 1619841600
  End Time: 2021-05-31 23:59:59.999999999 1622433600
Extracted 36829 Comments. 

Start Time: 2021-06-01 00:00:00 1622520000
  End Time: 2021-06-30 23:59:59.999999999 1625025600
Extracted 38146 Comments. 

Start Time: 2021-07-01 00:00:00 1625112000
  End Time: 2021-07-31 23:59:59.999999999 1627704000
Extracte



Extracted 40900 Comments. 

Start Time: 2021-09-01 00:00:00 1630468800
  End Time: 2021-09-30 23:59:59.999999999 1632974400
Extracted 41486 Comments. 

Start Time: 2021-10-01 00:00:00 1633060800
  End Time: 2021-10-31 23:59:59.999999999 1635652800
Extracted 42722 Comments. 

Start Time: 2021-11-01 00:00:00 1635739200
  End Time: 2021-11-30 23:59:59.999999999 1638248400




Extracted 43928 Comments. 

Start Time: 2021-12-01 00:00:00 1638334800
  End Time: 2021-12-31 23:59:59.999999999 1640926800




Extracted 44622 Comments. 



# Write retrieved comments and submissions to csv files

In [215]:
comments_19_21.to_csv('Csv files\wsb_hot_comments_19_21.csv', index=False)
submissions_19_21.to_csv('Csv files\wsb_hot_subs_19_21.csv', index=False)