In [3]:
import pandas as pd
import numpy as np
import statistics, nltk, math, json, requests, itertools, time, praw, tqdm, csv, os

from praw.models import MoreComments
from datetime import datetime, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## Getting Stock Information

In [15]:
# --- Alpha Vantage download settings ---
size = 'compact' # 'full' for complete historical data, 'compact' for most recent 100
# NOTE(review): 'TWTTR' does not look like a listed symbol (Twitter traded as 'TWTR').
# The same spelling is used for the comment data further down, so it is left unchanged -- confirm.
ticker = ['GME', 'SPY', 'TWTTR', 'TSLA', 'AMD'] # stock tickers to search for
datatype = 'csv' # 'json' for JSON output, 'csv' for CSV output

# The API key is read from the environment so that it is not stored in the notebook.
# Set ALPHAVANTAGE_API_KEY before running this cell (a missing key raises KeyError here).
alphavantage_key = os.environ['ALPHAVANTAGE_API_KEY']

# Make sure the output folder exists before the first file is written
os.makedirs('data', exist_ok=True)

for stock in ticker:
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={stock}&outputsize={size}&datatype={datatype}&apikey={alphavantage_key}'
    # A timeout keeps a stalled connection from hanging the notebook
    response = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of saving the error page as "<ticker>.csv"
    response.raise_for_status()
    #Save CSV to file
    with open(f'data/{stock}.csv', 'wb') as file:
        file.write(response.content)

## Charting Data

In [5]:
# Read the AMD price table (looked up relative to the working directory) and display it
amd_data = pd.read_csv(filepath_or_buffer="AMD.csv")
amd_data

Unnamed: 0,timestamp,open,high,low,close,volume
0,2022-04-29,88.05,91.790,85.3800,85.52,82647701
1,2022-04-28,86.67,90.580,84.7800,89.64,91495449
2,2022-04-27,84.25,87.900,84.0200,84.91,83125054
3,2022-04-26,89.74,90.120,85.0800,85.16,87805574
4,2022-04-25,89.86,91.370,88.6100,90.69,93481042
...,...,...,...,...,...,...
95,2021-12-13,138.25,139.400,133.4150,133.80,42173963
96,2021-12-10,141.29,141.365,135.8200,138.55,42224275
97,2021-12-09,145.16,146.690,137.8000,138.10,53019926
98,2021-12-08,144.96,147.040,142.7000,145.24,40977478


## Getting Reddit Comment Data

In [6]:
# Reddit API credentials are read from environment variables so that no secret is
# stored in the notebook. Required: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
# REDDIT_USERNAME, REDDIT_PASSWORD (a missing one raises KeyError here).
# The username is stripped because stray whitespace would be sent as part of the login name.
reddit_username = os.environ["REDDIT_USERNAME"].strip()

reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    password=os.environ["REDDIT_PASSWORD"],
    user_agent=f"Big Data by u/{reddit_username}",
    username=reddit_username,
    # NOTE(review): PRAW documents its request timeout setting as `timeout`;
    # confirm that `prawcore_timeout` is actually honored.
    prawcore_timeout = 5
)

# Search terms (ticker symbols and company names) looked for in comment text
tickerlist = ['GME', 'Gamestop', 'SPY', 'TWTTR', 'Twitter', 'TSLA', 'Tesla', 'AMD']

## Making a function to return all comments that mention a stock based on a given date range

In [7]:
def make_request(uri, max_retries = 5):
    """
    GET `uri` and return its JSON body, retrying when an attempt fails.

    Adapted from the medium article:
    https://medium.com/@pasdan/how-to-scrap-reddit-using-pushshift-io-via-python-a3ebcc9b83f4

    Parameters:
        uri: full URL to request.
        max_retries: total number of attempts; at least one attempt is always made.

    Returns:
        The response body decoded with json.loads.

    Raises:
        requests.HTTPError: the final attempt answered with a status other than 200.
        Exception: any other error raised by the final attempt (connection problem,
            invalid JSON, ...) is propagated unchanged.
    """
    def fire_away(uri):
        # The timeout keeps a stalled connection from blocking the notebook forever
        response = requests.get(uri, timeout=30)
        # Explicit check instead of `assert`, which is removed when Python runs with -O
        if response.status_code != 200:
            raise requests.HTTPError(
                "unexpected status {} for {}".format(response.status_code, uri),
                response=response)
        return json.loads(response.content)

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        # Pause before every request to stay polite towards the API
        time.sleep(1)
        try:
            return fire_away(uri)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works
            if attempt == attempts:
                raise
            time.sleep(1)

def get_intervals(startDate, endDate, daysInInterval = 1):
    """
    Split the span from startDate to endDate into consecutive POSIX-time windows.

    Parameters:
        startDate: datetime marking the beginning of the span.
        endDate: datetime marking the end of the span.
        daysInInterval: length of each window in days (must be positive).

    Yields:
        (start, end) tuples of integer POSIX timestamps. Every window after the
        first starts one second after the previous window's end so that windows
        do not share a boundary second. The last window is clamped to endDate,
        so nothing past endDate is ever produced. If startDate is not before
        endDate, nothing is yielded.

    Raises:
        ValueError: daysInInterval is zero or negative (the windows would never advance).
    """
    if daysInInterval <= 0:
        raise ValueError("daysInInterval must be positive")

    # Converting start and end dates to POSIX:
    start_posix = math.floor(startDate.timestamp())
    end_posix = math.floor(endDate.timestamp())
    # 86,400 seconds in a day:
    period = 86400 * daysInInterval
    padding = 1

    boundary = start_posix      # end of the previous window (or the very start)
    window_start = start_posix
    while boundary < end_posix:
        # Never let a window run past the requested end date
        window_end = min(boundary + period, end_posix)
        yield int(window_start), int(window_end)
        boundary = window_end
        window_start = boundary + padding
    
    
def pull_posts_for(subreddit, start_at, end_at):
    """
    Collect the submissions of `subreddit` created between two POSIX timestamps.

    Adapted from the medium article:
    https://medium.com/@pasdan/how-to-scrap-reddit-using-pushshift-io-via-python-a3ebcc9b83f4

    Parameters:
        subreddit: subreddit name placed in the query string.
        start_at: lower bound, sent as the `after` query parameter.
        end_at: upper bound, sent as the `before` query parameter.

    Returns:
        A list of dicts with the keys 'id', 'created_utc' and 'prefix'.
        Because follow-up pages restart 10 seconds before the last post already
        seen, the list can contain the same post more than once.
    """
    def map_posts(posts):
        # Keep only the fields used downstream
        return [{
            'id': post['id'],
            'created_utc': post['created_utc'],
            'prefix': 't4_'
        } for post in posts]

    SIZE = 500
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'

    def fetch_page(after):
        # One request for up to SIZE posts created after `after` and before `end_at`
        uri = URI_TEMPLATE.format(subreddit, after, end_at, SIZE)
        return map_posts(make_request(uri)['data'])

    post_collections = fetch_page(start_at)
    n = len(post_collections)
    # A full page suggests there may be more posts, so keep paging until a short page arrives
    while n == SIZE:
        last = post_collections[-1]
        # Step back 10 seconds so posts sharing the boundary timestamp are not skipped
        new_start_at = last['created_utc'] - (10)

        more_posts = fetch_page(new_start_at)

        n = len(more_posts)
        post_collections.extend(more_posts)
    return post_collections

def get_comments_by_date (startDate, endDate, subreddit='wallstreetbets'):
    """
    Collect comment texts from popular posts of a subreddit within a time span.

    Basic structure taken from the medium article. No ticker filtering happens
    here: every collected comment body is returned.

    Parameters:
        startDate: datetime at which the span begins; also used to build the result key.
        endDate: datetime at which the span ends.
        subreddit: name of the subreddit to search.

    Returns:
        A dict with a single key "year-month-day" (taken from startDate, not
        zero-padded) mapping to a list of comment bodies.

    Limits applied to keep the run time bounded:
        * only the first 100 unique post ids (in sorted order) of the whole span are inspected,
        * only posts with more than 100 upvotes are used,
        * at most 100 comments are taken from each post.

    Caveat noted by the original author: this did not work when given today's
    date, because it attempted to fetch tomorrow's posts.
    """
    posts = []
    # This loop gets all of the posts in the given timeframe
    for interval in get_intervals(startDate, endDate):
        print("-- Fetching Posts From: ", datetime.fromtimestamp(interval[0]), " to ", datetime.fromtimestamp(interval[1]))
        posts.extend(pull_posts_for(subreddit, interval[0], interval[1]))

    reddit_comments = {}
    startIndex = "{}-{}-{}".format(startDate.year, startDate.month, startDate.day)
    reddit_comments[startIndex] = []

    # De-duplicate the post ids (paging can return a post twice) in a stable, sorted order
    unique_ids = sorted({post['id'] for post in posts})

    #  WARNING: only looking at the first 100 posts
    for sub_id in unique_ids[:100]:
        # Build the submission once and reuse it for the upvote check and the comments
        sub = reddit.submission(id=sub_id)
        # Only looking at posts with more than 100 upvotes to speed the process up
        if sub.ups > 100:
            # NOTE: limit=None expands every "load more comments" stub before the
            # first 100 comments are taken, which can be slow on large threads.
            sub.comments.replace_more(limit=None)
            fetched = [comment.body for comment in sub.comments.list()[:100]]
            reddit_comments[startIndex].extend(fetched)

            print("---- Fetched {} comments from post {}".format(len(fetched), sub_id))

    return reddit_comments

In [8]:
# Search terms (ticker symbols and company names) looked for in comment text
tickerlist = [
    'GME', 'Gamestop',
    'SPY',
    'TWTTR', 'Twitter',
    'TSLA', 'Tesla',
    'AMD',
]

def make_all_comments(date_range):
    """
    Gather comments for every day in `date_range` into one dictionary.

    For each datetime `day`, comments are requested from get_comments_by_date
    for the span from one day earlier up to `day`, and the per-day results are
    merged together.

    Parameters:
        date_range: iterable of datetime objects.

    Returns:
        A dict merging the dictionaries returned by get_comments_by_date;
        if two calls return the same key, the later one replaces the earlier one.
    """
    all_coms = {}

    for day in date_range:
        start_day = day - timedelta(days=1)
        # Merge into the local result (not into a notebook-level variable)
        all_coms.update(get_comments_by_date(start_day, day))

    return all_coms

def make_ticker_dictionary(tickerlist, comments_by_date=None):
    """
    Group comments by search term and by date.

    Parameters:
        tickerlist: iterable of search terms (ticker symbols or company names).
        comments_by_date: dict mapping a date key to a list of comment strings.
            When omitted, the notebook-level variable `all_comments` is used.

    Returns:
        A nested dict {term: {date_key: [comments containing the term]}}.
        Every term has an entry for every date key, possibly an empty list.
        Matching is a plain, case-sensitive substring test (`term in comment`).
    """
    if comments_by_date is None:
        # Fall back to the notebook-level dictionary for callers that pass only the terms
        comments_by_date = all_comments

    ticker_dict = {}
    for tick in tickerlist:
        # Adding the comments to their ticker and date
        ticker_dict[tick] = {
            key: [com for com in coms if tick in com]
            for key, coms in comments_by_date.items()
        }

    return ticker_dict

In [9]:
# Jan 11 - Jan 27 2021 For GME Boom
# Days to collect comments for: 10 Feb 2022 through 30 Apr 2022.
# (An earlier run used 11-27 Jan 2021 to cover the GME boom.)
dates = []

# 10-28 February 2022
dates.extend(datetime(2022, 2, day_of_month) for day_of_month in range(10, 29))

# 1-31 March 2022
dates.extend(datetime(2022, 3, day_of_month) for day_of_month in range(1, 32))

# 1-30 April 2022
dates.extend(datetime(2022, 4, day_of_month) for day_of_month in range(1, 31))

In [10]:
# Load the previously saved comment table instead of scraping again.
comment_data = pd.read_csv(filepath_or_buffer='data/2022-02_2022-03_comment_data.csv')

# Lines kept from the run that produced the file (they need `ticker_dict`):
# comment_data = pd.DataFrame.from_dict(ticker_dict)
# comment_data.to_csv('data/2022-02_2022-03_comment_data.csv')

## Making the Data Frames

<b>Sentiment analysis and dataframe construction:</b>

In [11]:
def get_sent_scores(com_list):
    """
    Score each comment with NLTK's VADER analyzer.

    Parameters:
        com_list: iterable of comment strings.

    Returns:
        A list holding the 'compound' polarity score of each comment, in input order.
    """
    vader = SentimentIntensityAnalyzer()
    return [vader.polarity_scores(text)['compound'] for text in com_list]

def build_ticker_df(com_data, ticker):
    """
    Build per-date sentiment statistics for one search term.

    Parameters:
        com_data: DataFrame with an "Unnamed: 0" column holding the date keys and
            one column per search term. Each cell is expected to hold the text
            form of a Python list of comments, as written by DataFrame.to_csv.
        ticker: name of the column to summarise.

    Returns:
        A DataFrame indexed by the "Unnamed: 0" values with the columns
        'Mention Count' (number of comments), 'Mean Sent Score' (mean compound
        score) and 'Scores StdDev' (sample standard deviation of the scores).
        Rows without comments get a count of 0, and rows with fewer than two
        comments get a standard deviation of 0; a row without comments also
        gets a mean score of 0.0.
    """
    import ast  # only needed to decode the stringified comment lists

    def parse_comments(cell):
        # Turn one CSV cell back into a list of comment strings
        if not isinstance(cell, str):
            # Missing value (NaN) -> no comments for that date
            return []
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(com) for com in parsed]
        # Not a list literal: fall back to treating the cell as comma-separated text
        return cell.split(',')

    df = pd.DataFrame()
    # Can be changed if not reading from a csv:
    index = com_data["Unnamed: 0"]
    df.index = index

    mean_scores = []
    scores_spread = []
    num_mentions = []

    for cell in com_data[ticker]:
        row_coms = parse_comments(cell)
        sent_scores = get_sent_scores(row_coms)
        # statistics.mean raises on empty data, so a date without comments scores 0.0
        mean_scores.append(statistics.mean(sent_scores) if sent_scores else 0.0)
        # The sample standard deviation needs at least two values
        if len(sent_scores) > 1:
            scores_spread.append(statistics.stdev(sent_scores))
        else:
            scores_spread.append(0)
        num_mentions.append(len(row_coms))

    df['Mention Count'] = num_mentions
    df['Mean Sent Score'] = mean_scores
    df['Scores StdDev'] = scores_spread

    return df

<b>Building dataframes for each ticker:</b>

In [12]:
# Terms that get their own sentiment dataframe
tickerlist = ['GME', 'SPY', 'TWTTR', 'Twitter', 'TSLA', 'Tesla', 'AMD']

# One dataframe per term, keyed by the term
ticker_dfs = {
    tick: build_ticker_df(comment_data, tick)
    for tick in tickerlist
}

In [14]:
# Write the GME sentiment table to the working directory.
# (Kept from an earlier version: final_data = pd.DataFrame.from_dict(ticker_dict))
ticker_dfs['GME'].to_csv(path_or_buf='gme_comment_data.csv')

In [29]:
# Load the merged GME price and sentiment table and display it
merged_df = pd.read_csv(filepath_or_buffer='data/merged_gme.csv')
merged_df

Unnamed: 0.2,Unnamed: 0,timestamp,open,high,low,close,volume,change,Unnamed: 0.1,Mention Count,Mean Sent Score,Scores StdDev
0,0,2022-01-31,97.87,109.8199,97.86,108.93,3499273,11.06,2022-1-31,11,-0.043973,0.298081
1,1,2022-02-01,113.01,116.65,108.2649,112.6,3461877,-0.41,2022-2-1,38,0.051024,0.259325
2,2,2022-02-02,110.35,111.8599,98.06,100.04,3279135,-10.31,2022-2-2,36,-0.039461,0.477875
3,3,2022-02-03,101.5,106.94,97.71,99.23,2677519,-2.27,2022-2-3,1,0.0,0.0
4,4,2022-02-04,99.01,104.0,95.08,102.34,1906791,3.33,2022-2-4,16,-0.064706,0.437461
5,5,2022-02-07,102.99,105.3849,98.77,102.34,1759360,-0.65,2022-2-5,21,-0.05009,0.304355
6,6,2022-02-08,101.755,117.6717,100.55,115.6,4959906,13.845,2022-2-6,32,-0.066828,0.422273
7,7,2022-02-09,114.09,124.71,113.0,124.29,3866895,10.2,2022-2-9,13,-0.027869,0.290657
8,8,2022-02-10,117.95,131.78,116.0,122.47,3706537,4.52,2022-2-10,7,0.193457,0.256715
9,9,2022-02-11,122.48,129.8668,120.7706,124.25,2824777,1.77,2022-2-11,1,0.0,0.0
