[Summary presentation slides](https://drive.google.com/file/d/1aKTlZQGOroZ7We19O0MSVogCMj3kWG5g/view?usp=sharing)  
[R code used to conduct the analysis](http://rpubs.com/DCHOW99/940482)

# The following code was used to retrieve posts and comments from r/wallstreetbets via the Pushshift API using Python

# Import packages

In [1]:
import pandas as pd
import requests
from psaw import PushshiftAPI
from datetime import datetime
import time

# Define function to retrieve comments from Pushshift API

In [212]:
def wsb_comments(start_date, num_year):
    """Collect the highest-scored r/wallstreetbets comments, one month at a time.

    For each of ``12 * num_year`` consecutive monthly windows, queries the
    Pushshift API (through ``psaw``) for up to 2000 comments sorted by score
    in descending order, then stacks the monthly results into one DataFrame.

    Args:
        start_date: First day to retrieve, as a ``"YYYY-MM-DD"`` string. The
            first window runs from this date to the end of its month; every
            later window starts one month after the previous window's start.
        num_year: Number of years to cover (anything ``int()`` accepts).

    Returns:
        A pandas DataFrame with one row per comment. ``created_utc`` is
        converted from epoch seconds to a datetime and ``created`` holds the
        date part of it. An empty DataFrame is returned when no comments were
        retrieved.

    Raises:
        ValueError: If ``start_date`` does not match ``"YYYY-MM-DD"``.
    """
    dt_format = "%Y-%m-%d"  # expected format of ``start_date``
    window_start = pd.Timestamp(datetime.strptime(start_date, dt_format))
    frames = []  # one DataFrame per non-empty month, concatenated once at the end
    records = 0
    api = PushshiftAPI()

    for _ in range(12 * int(num_year)):
        # Upper bound is the first instant of the following month. Converting
        # the month end to a date string and back would truncate it to
        # midnight of the last day and silently drop that whole day.
        window_end = (pd.Period(window_start, freq='M') + 1).start_time

        # Interpret the calendar boundaries as UTC so the windows line up with
        # the UTC-based ``created_utc`` values, regardless of the machine's
        # local timezone.
        start_epoch = int(window_start.tz_localize('UTC').timestamp())
        end_epoch = int(window_end.tz_localize('UTC').timestamp())

        print("Start Time:", window_start, start_epoch)
        print("  End Time:", window_end, end_epoch)

        # extract comments from pushshift.io
        sub = list(api.search_comments(
            # define time range to search
            after=start_epoch,
            before=end_epoch,
            # define how the comments are sorted
            sort='desc',
            sort_type='score',
            # define subreddit to search
            subreddit='wallstreetbets',
            # define fields to return
            filter=['author', 'body', 'created_utc', 'id', 'link_id',
                    'score', 'stickied', 'subreddit', 'created', 'is_submitter'],
            limit=2000))

        # report total records collected
        records += len(sub)
        print("Extracted", records, "Comments.", '\n')

        # keep this month's records; months without results add nothing
        if sub:
            frames.append(pd.DataFrame([thing.d_ for thing in sub]))

        # move the window start forward by one month
        window_start = window_start + pd.DateOffset(months=1)

        # wait for 3 seconds between requests
        time.sleep(3)

    # Nothing retrieved: there is no 'created_utc' column to convert.
    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)

    # convert date format
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df['created'] = df['created_utc'].dt.date

    return df

# Define function to retrieve submissions from Pushshift API

In [None]:
def wsb_submissions(start_date, num_year):
    """Collect the highest-scored r/wallstreetbets submissions, one month at a time.

    For each of ``12 * num_year`` consecutive monthly windows, queries the
    Pushshift API (through ``psaw``) for up to 2000 submissions sorted by
    score in descending order, then stacks the monthly results into one
    DataFrame.

    Args:
        start_date: First day to retrieve, as a ``"YYYY-MM-DD"`` string. The
            first window runs from this date to the end of its month; every
            later window starts one month after the previous window's start.
        num_year: Number of years to cover (anything ``int()`` accepts).

    Returns:
        A pandas DataFrame with one row per submission. ``created_utc`` is
        converted from epoch seconds to a datetime and ``created`` holds the
        date part of it. An empty DataFrame is returned when no submissions
        were retrieved.

    Raises:
        ValueError: If ``start_date`` does not match ``"YYYY-MM-DD"``.
    """
    dt_format = "%Y-%m-%d"  # expected format of ``start_date``
    window_start = pd.Timestamp(datetime.strptime(start_date, dt_format))
    frames = []  # one DataFrame per non-empty month, concatenated once at the end
    records = 0
    api = PushshiftAPI()

    for _ in range(12 * int(num_year)):
        # Upper bound is the first instant of the following month. Converting
        # the month end to a date string and back would truncate it to
        # midnight of the last day and silently drop that whole day.
        window_end = (pd.Period(window_start, freq='M') + 1).start_time

        # Interpret the calendar boundaries as UTC so the windows line up with
        # the UTC-based ``created_utc`` values, regardless of the machine's
        # local timezone.
        start_epoch = int(window_start.tz_localize('UTC').timestamp())
        end_epoch = int(window_end.tz_localize('UTC').timestamp())

        print("Start Time:", window_start, start_epoch)
        print("  End Time:", window_end, end_epoch)

        # extract submissions from pushshift.io
        sub = list(api.search_submissions(
            # define time range to search
            after=start_epoch,
            before=end_epoch,
            # define how the submissions are sorted
            sort='desc',
            sort_type='score',
            # define subreddit to search
            subreddit='wallstreetbets',
            # define fields to return
            filter=['author', 'author_fullname', 'created_utc', 'full_link',
                    'id', 'num_comments', 'selftext', 'stickied',
                    'subreddit', 'title', 'created', 'removed_by_category'],
            limit=2000))

        # report total records collected
        records += len(sub)
        print("Extracted", records, "Submissions.", '\n')

        # keep this month's records; months without results add nothing
        if sub:
            frames.append(pd.DataFrame([thing.d_ for thing in sub]))

        # move the window start forward by one month
        window_start = window_start + pd.DateOffset(months=1)

        # wait for 3 seconds between requests
        time.sleep(3)

    # Nothing retrieved: there is no 'created_utc' column to convert.
    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)

    # convert date format
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df['created'] = df['created_utc'].dt.date

    return df

# Retrieve 3 years of submissions and comments from r/wallstreetbets starting from 01/01/2019

In [None]:
# Pull 36 monthly windows (3 years) beginning 2019-01-01: submissions first, then comments.
submissions_19_21 = wsb_submissions(start_date='2019-01-01', num_year=3)
comments_19_21 = wsb_comments(start_date='2019-01-01', num_year=3)

# Write retrieved comments and submissions to csv files

In [215]:
# Use a forward slash as the path separator: '\w' is an invalid string escape
# (a warning on current Python versions), and a backslash is only treated as a
# directory separator on Windows. The 'Csv files' directory must already exist.
comments_19_21.to_csv('Csv files/wsb_hot_comments_19_21.csv', index=False)
submissions_19_21.to_csv('Csv files/wsb_hot_subs_19_21.csv', index=False)