### Load the database

In [None]:
import pandas as pd
# Print the installed pandas version so results can be tied to a specific release.
print(pd.__version__)

In [None]:
from sociophysicsDataHandler import SociophysicsDataHandler
import pandas as pd
import matplotlib.pyplot as pl

# Switch between the default handler setup (True) and an explicit WebDAV base path (False).
student_config = True

file_target = 'asdz/platform2.2/20200428/ASDZ_Perron2.2_2020042815_trajectorie.parquet'

if not student_config:
    webdav_basepath = '/Crowdflow (Projectfolder)/ProRail_USE_LL_data'
    dh = SociophysicsDataHandler(basepath=webdav_basepath)
else:
    dh = SociophysicsDataHandler()

# Both configurations fetch the same target file once the handler exists.
dh.fetch_prorail_data_from_path(file_target)

# List everything stored under the reddit folder, one path per line.
print('The available files are the following:')
dh.list_files("econophysics/reddit/")
for reddit_file in dh.filelist['path']:
    print(reddit_file)

In [None]:
# targetPosts selects submissions (True) or comments (False) for the chosen ticker.
targetPosts = True
stock = 'GME'

if not targetPosts:
    filename = f'comments_{stock}.tar.gz'
elif stock == 'GME':
    # The GME submissions file carries a different end date in its name.
    filename = 'submissions_wallstreetbets_GME_start20200901_end20210624.json'
else:
    filename = f'submissions_wallstreetbets_{stock}_start20200901_end20210706.json'

dh.fetch_econophysics_data_from_path(f"econophysics/reddit/{filename}")
df = dh.df
# Show the loaded table.
df

# List the available column names, one per line.
for column_name in df.columns:
    print(column_name)

df

### Create time sorted reddit post df

In [None]:
# Index the posts by creation time and order them chronologically.
redditPosts = df.set_index('created_utc').sort_index()
# The raw index values are interpreted as seconds since the Unix epoch.
redditPosts.index = pd.to_datetime(redditPosts.index, origin='unix', unit='s')

redditPosts

# Count the posts inside a time-based rolling window of one hour ('1H'),
# evaluated at every post's timestamp.
redditRollingAmount = (
    redditPosts[['id']]
    .rolling(window='1H')
    .count()
    .rename(columns={'id': 'rolling_count'})
)

redditRollingAmount

### Stock preparation

In [None]:
# Fetch the hourly price table and display it.
# Note: this rebinds `df` to whatever the handler loaded last.
dh.fetch_econophysics_data_from_path("econophysics/prices/hourly_prices.csv")
df = dh.df
df

In [None]:
dh.fetch_econophysics_data_from_path("econophysics/prices/hourly_prices.csv")
prices_hour = dh.df
prices_hour.index = pd.to_datetime(prices_hour.index)  # make sure the index is a DatetimeIndex

# The times in the index of prices_hour are expressed in New York time (American Eastern Time),
# which is UTC-5 in autumn and winter and UTC-4 in spring and summer (daylight saving).
# They are converted to UTC here.
import datetime as dt
from dateutil import tz  # library to treat timezones
NYC = tz.gettz('America/New_York')  # define the New York timezone

# Vectorised conversion: the UTC offset is resolved for every individual
# timestamp (not for midnight of its date), so days on which daylight saving
# switches are handled correctly, and no Python-level loop is needed.
utc_index = (
    prices_hour.index
    .tz_localize(NYC)     # interpret the naive timestamps as New York wall-clock time
    .tz_convert('UTC')    # shift to UTC
    .tz_localize(None)    # drop the tz info again: the result stays timezone-naive
)
# Keep the index unnamed, so that a later reset_index() yields a column called 'index'.
utc_index.name = None

# set the new UTC index to the prices_hour dataframe:
prices_hour.index = utc_index

prices_hour  # prices indexed by UTC time

In [None]:
import matplotlib.pyplot as pl
stock = 'GME'
start_month = '2021-01-20'
end_month = '2021-01-31'

# Restrict both series to the same date range; rows without a price are dropped.
prices_toPlot = prices_hour[[stock]][start_month:end_month].dropna()
redditRollingAmount_toPlot = redditRollingAmount[['rolling_count']][start_month:end_month]

# Two stacked panels sharing one x axis: price on top, rolling post count below.
fig, axes = pl.subplots(nrows=2, ncols=1, sharex=True)

prices_toPlot.plot(ax=axes[0], color='g')
axes[0].set(
    xlabel='b',
    ylabel=f'Price of {stock}',
    title='GME price and post count comparison',
)
axes[0].get_legend().remove()

redditRollingAmount_toPlot.plot(ax=axes[1], color='r')
axes[1].set(xlabel='Time', ylabel='Rolling Post Count')
axes[1].get_legend().remove()

pl.show()

prices_toPlot

# Report the axis limits of the price panel.
axes[0].axis()

In [None]:
before_peak_start = '2021-01-01'
before_peak_end = '2021-01-22'
after_peak_start = '2021-01-23'
after_peak_end = '2021-01-30'

# Flair of every post in each period; posts without a flair are dropped.
redditPostsBeforePeak = redditPosts[['link_flair_text']][before_peak_start:before_peak_end].dropna()
redditPostsAfterPeak = redditPosts[['link_flair_text']][after_peak_start:after_peak_end].dropna()


def _count_flairs(posts, min_count):
    """Count posts per flair, keeping only flairs seen more than ``min_count`` times.

    Parameters:
        posts: DataFrame with a ``link_flair_text`` column.
        min_count: exclusive lower bound on the number of posts per flair.

    Returns:
        dict mapping flair -> number of posts, ordered alphabetically by flair.
    """
    per_flair = posts['link_flair_text'].value_counts()
    return {
        flair: int(amount)
        for flair, amount in sorted(per_flair.items())
        if amount > min_count
    }


# The two periods use different cut-offs for the flairs shown in the bar charts.
countFlairsBefore = _count_flairs(redditPostsBeforePeak, 150)
countFlairsAfter = _count_flairs(redditPostsAfterPeak, 800)

pl.barh(range(len(countFlairsBefore)), list(countFlairsBefore.values()), tick_label=list(countFlairsBefore.keys()))
pl.title('GME post category January 1st till 22nd')

In [None]:
# Horizontal bar chart of the flair counts for the second period.
after_labels = list(countFlairsAfter.keys())
after_counts = list(countFlairsAfter.values())
pl.barh(range(len(after_labels)), after_counts, tick_label=after_labels, color='orange')
pl.title('GME post category January 23rd till 30th')

## Sentiment analysis with VADER or TextBlob (VADER is used below)

In [None]:
#pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
def sentiment_classifier(sentence):
    """Print the VADER polarity scores of ``sentence`` and its overall label.

    The label is "Positive" for a compound score of at least 0.05,
    "Negative" for a compound score of at most -0.05, and "Neutral" otherwise.
    Nothing is returned; all results are written to standard output.
    """
    # polarity_scores returns a dict with the keys 'neg', 'neu', 'pos' and 'compound'.
    scores = SentimentIntensityAnalyzer().polarity_scores(sentence)

    print("Overall sentiment dictionary is : ", scores)
    for key, label in (('neg', 'Negative'), ('neu', 'Neutral'), ('pos', 'Positive')):
        print("sentence was rated as ", scores[key]*100, "% " + label)

    # Map the compound score onto one of the three labels.
    compound = scores['compound']
    if compound >= 0.05:
        verdict = "Positive"
    elif compound <= -0.05:
        verdict = "Negative"
    else:
        verdict = "Neutral"

    print("Sentence Overall Rated As", verdict)
        
def sentiment_scores(sentence):
    """Return the VADER compound polarity score of ``sentence``.

    The analyzer is created on the first call and then kept on the function
    object, so applying this function to a whole column does not construct a
    new ``SentimentIntensityAnalyzer`` for every single row.
    """
    analyzer = getattr(sentiment_scores, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_scores._analyzer = analyzer

    return analyzer.polarity_scores(sentence)['compound']

In [None]:
# Quick sanity check of the classifier on a hand-written example sentence.
sentence = "I love the TU/e. It may have it's downsides but generally it's pretty swell."

sentiment_classifier(sentence)

In [None]:
# Keep only the posts carrying one of these flairs.
relevant_flairs = ['Discussion', 'News', 'Loss', 'Gain']
# .copy() makes the selection an independent DataFrame, so adding a column below
# does not raise pandas' SettingWithCopyWarning.
redditpostsRelevant = redditPosts[redditPosts['link_flair_text'].isin(relevant_flairs)].copy()

# Score every title; this is slow on the full data set, so consider adding
# progress output and running it on a local machine.
redditpostsRelevant['text_sentiment_score'] = redditpostsRelevant['title'].apply(sentiment_scores)
redditpostsRelevant

# For now only the title is scored, because selftext may be missing (for example for memes).
# That might actually be an advantage: filtering out memes and removed posts could make
# the scoring more accurate.

In [None]:
# redditpostsRelevant.to_csv('redditpostsRevelant.csv')
# Kept commented out so that the saved file is not overwritten by accident.

# Getting short interest data

In [None]:
short_interest = pd.read_csv('short_interestFIZZ&LGND&AAPL&GME&AMC.csv')
short_interest = short_interest.drop(columns=['Unnamed: 0', 'Unnamed: 4'])

unique_vals = short_interest['Instrument'].unique()

# Group once and reuse the grouping, instead of regrouping the whole table
# for every instrument.
instrument_groups = short_interest.groupby(short_interest.Instrument)

# Print the rows of every instrument, in order of first appearance.
for unique_instr in unique_vals:
    print("\n" + unique_instr)
    print(instrument_groups.get_group(unique_instr))

# .copy() makes GME_shorts an independent DataFrame, so overwriting its "Date"
# column below does not raise pandas' SettingWithCopyWarning.
GME_shorts = instrument_groups.get_group("GME.N").copy()

# Parse the ISO-8601 date strings (e.g. 2021-01-15T00:00:00Z) into timestamps.
GME_shorts["Date"] = pd.to_datetime(GME_shorts['Date'], format='%Y-%m-%dT%H:%M:%SZ')

In [None]:
# Rename the date column to `ds` before indexing on it. Without the rename
# there is no `ds` column and set_index raises a KeyError on a clean run.
GME_shorts = GME_shorts.rename({'Date': 'ds'}, axis='columns').set_index('ds')
GME_shorts

In [None]:
# Drop the `Instrument` column, which is not needed in the steps that follow.
GME_shorts = GME_shorts.drop(columns=['Instrument'])

## Time-series test

In [None]:
#!pip install prophet

In [None]:
# Build a two-column frame for the GME price: the timestamp index becomes the
# column `ds` and the price becomes the column `y`.
prices_prophet = (
    prices_hour[['GME']]
    .reset_index(level=0)
    .rename({'index': 'ds', 'GME': 'y'}, axis='columns')
)
prices_prophet

In [None]:
# Remove the rows that contain a missing value.
prices_prophet = prices_prophet.dropna()
prices_prophet

In [None]:
# Attach to every price row the most recent short-interest row at or before its
# timestamp. direction='backward' never looks ahead in time, so no future
# information leaks into a row.
merge_short_n_price = pd.merge_asof(prices_prophet, GME_shorts, on='ds', direction='backward')
merge_short_n_price

In [None]:
# NOTE: the file name keeps its long-standing misspelling ("Revelant"); it has to
# match the name the file was saved under.
sentiment_values = pd.read_csv('redditpostsRevelant.csv')

# Keep only the timestamp and the score. assign() returns a new frame, so the
# timestamp conversion does not write into a column subset (which would raise
# pandas' SettingWithCopyWarning).
sentiment_values = sentiment_values[['created_utc', 'text_sentiment_score']].assign(
    created_utc=lambda frame: pd.to_datetime(frame['created_utc'])
)

In [None]:
# Rename the timestamp column to `ds`, index on it and sort chronologically.
sentiment_values = (
    sentiment_values
    .rename({'created_utc': 'ds'}, axis='columns')
    .set_index('ds')
    .sort_index()
)

sentiment_values
# The scores still need to be aggregated over time.

In [None]:
# Replace every score by the sum of the scores inside a time-based rolling
# window of one day ('1D').
sentiment_values = sentiment_values[['text_sentiment_score']].rolling(window = '1D').sum()

sentiment_values

# Next step: merge the aggregated sentiment with the price / short-interest table.

In [None]:
# Attach to every price row the most recent sentiment value at or before its timestamp.
merged_all = pd.merge_asof(merge_short_n_price, sentiment_values, on='ds', direction='backward')
merged_all
# For GME the sentiment data only runs until 06-24, so later rows reuse the last available value.

In [None]:
# Restrict the merged table to 2020-09-01 through 2021-01-10 (both inclusive).
partial_data = merged_all.set_index('ds')['2020-09-01':'2021-01-10']

In [None]:
# Turn the `ds` index back into a regular column.
partial_data = partial_data.reset_index()
partial_data

In [None]:
from prophet import Prophet
# Create the forecasting model with daily seasonality switched on.
prophet_model = Prophet(daily_seasonality=True)

In [None]:
# Register both extra regressors before fitting.
for regressor_name in ('Short Interest Pct', 'text_sentiment_score'):
    prophet_model.add_regressor(regressor_name)

# Fit on the selected training window.
prophet_model.fit(partial_data)

In [None]:
# Training dates plus 60 additional periods (make_future_dataframe uses its default frequency).
future = prophet_model.make_future_dataframe(periods=60)

# Attach the regressor values by timestamp. Plain column assignment would align
# on the positional row labels and hand the forecast dates values that belong
# to unrelated rows of merged_all. merge_asof instead gives every date in
# `future` the most recent known value at or before it.
regressor_columns = ['Short Interest Pct', 'text_sentiment_score']
regressor_history = merged_all[['ds'] + regressor_columns].sort_values('ds')
future = pd.merge_asof(future, regressor_history, on='ds', direction='backward')

forecast = prophet_model.predict(future)
figure1 = prophet_model.plot(forecast)
figure2 = prophet_model.plot_components(forecast)

In [None]:
# The result looks somewhat promising.