In [1]:
# Ignore this cell
import os
from google.colab import drive
# Colab-only setup: mount Google Drive so the dataset folder is reachable.
drive.mount('/content/drive')
# Every relative path used later in the notebook is resolved against this folder.
os.chdir('/content/drive/My Drive/datasets/finance_social/Dataset')
# Show the folder contents as the cell output.
os.listdir(os.curdir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['StockTwits',
 'Twitter',
 'ReutersNews',
 'twiters_label_data_Feb-02-2020.xlsx',
 'Stocktwits Label Model training with twiter data.ipynb',
 'github data',
 'INTENTS and UTTERANCE.yaml',
 'Keras_FNN_model.json',
 'finalized_Keras_model.sav',
 'Stocktwits Label Model training with twiter data - Final.ipynb',
 'Keras KNN test result 1.PNG',
 'Keras KNN test result 1a.PNG',
 'finalized_Keras_model_1.sav',
 'Keras_FNN_model_1.json',
 'Keras_FNN_model.h5',
 'Stocktwits Keras FNN for sentiment analysis-Final.ipynb',
 'finalized_SVM_model.sav',
 'chatbot_training.yaml',
 'stocktwits_sentiment_subjectivity_Feb-01-2020_p.xlsx',
 'twitter_ul.npy']

In [0]:
import os
from datetime import date,timedelta,datetime
from pathlib import Path

import numpy as np
import pandas as pd
import pandas_datareader.data as web

In [0]:
symbol = 'WIKI/AAPL'  # or 'AAPL.US'

# API credentials are read from the environment so that no secret is stored in
# the notebook (or in its version history). Set these variables before starting
# the kernel; a missing variable yields None.
QUANDL_API_KEY = os.environ.get('QUANDL_API_KEY')
AV_API_KEY = os.environ.get('ALPHAVANTAGE_API_KEY')
T_API_KEY = os.environ.get('TIINGO_API_KEY')

# Input pickles, relative to the current working directory.
stocktwits_file = Path('StockTwits/stocktwits_labelled_v1.pkl')
twitter_file = Path('Twitter/twitter_labelled_v1.pkl')

In [0]:
def fetch_returns(symbol,start_date,end_date,col='close'):
  '''
  Download Tiingo prices for symbol between start_date and end_date and return a
  DataFrame holding the requested price column (default 'close') together with
  its 1-, 3- and 5-row percentage changes ('1_day_return', '3_day_return',
  '5_day_return').

  The first index level of the downloaded frame is dropped before returning.
  Any exception is printed together with the request parameters and the
  function then returns None.
  '''
  try:
    prices = web.DataReader(symbol, 'tiingo', start_date, end_date,access_key=T_API_KEY)
    prices = prices.sort_index().loc[:,[col]]
    # pct_change over n rows, i.e. the change relative to the value n rows earlier
    for horizon in (1, 3, 5):
      prices[f'{horizon}_day_return'] = prices[col].pct_change(periods=horizon)
    prices.index = prices.index.droplevel()
    return prices
  except Exception as err:
    print(symbol, start_date, end_date, err)

In [0]:
def parse_date(date_str, format='%Y-%m-%d %H:%M:%S'):
  '''
  Parse date_str with the given strptime format and return only its
  datetime.date part.

  Every 'T' in date_str is first replaced by a space and every 'Z' is removed,
  so strings such as '2020-01-06T12:30:00Z' match the default format.
  '''
  cleaned = date_str.replace('T',' ').replace('Z','')
  return datetime.strptime(cleaned,format).date()

In [0]:
def find_stop_date(data, symbol, datecol, tickercol):
  '''
  Return the earliest value of data[datecol] among the rows whose
  data[tickercol] equals symbol.

  Anything after the first '.' in symbol is discarded before matching, so
  'AAPL.US' is looked up as 'AAPL'.
  '''
  # Body indentation must match the docstring; a deeper indent here makes the
  # whole cell fail with "IndentationError: unexpected indent".
  symbol = symbol.split('.')[0]
  dates = data.loc[data[tickercol] == symbol, datecol]
  return dates.min()

In [0]:
def get_return_labels(returns,fdate,ticker=None,verbose=False):
  '''
  Fetch the forward return labels for 1, 3 and 5 rows after fdate.

  returns : DataFrame indexed by date whose columns at positions 1, 2 and 3 hold
            the 1-, 3- and 5-period returns; may be None when no data is available.
  fdate   : date to label. If it is not in returns.index it is rolled forward one
            calendar day at a time until a date in the index is found.
  ticker  : only used in diagnostic messages.
  verbose : print every date that is rolled forward.

  Returns a pd.Series indexed by '1_day_return', '3_day_return', '5_day_return'.
  A label is NaN when the row it needs lies beyond the end of returns. All three
  labels are NaN when returns is None or empty, when fdate lies after the last
  date in returns, or when the lookup fails (the failure is printed).
  '''
  # dict for return period: col position in 'returns'
  periods = {1: 1,
             3: 2,
             5: 3}
  suffix = '_day_return'
  colnames = [str(p) + suffix for p in periods]
  # Returned whenever no label can be computed, so row-wise apply() always
  # receives a Series of the same shape.
  no_labels = pd.Series([np.nan] * len(colnames), index=colnames)

  if returns is None or len(returns) == 0:
    print(ticker, None, None, fdate, 'no returns available')
    return no_labels

  try:
    last_date = pd.Timestamp(returns.index.max())
    while fdate not in returns.index:
      # Stop rolling once fdate is past the last available date; without this
      # bound the loop only ends when the date type overflows.
      probe = pd.Timestamp(fdate)
      # Align tz-awareness so the comparison below is always defined.
      if probe.tzinfo is None and last_date.tzinfo is not None:
        probe = probe.tz_localize(last_date.tzinfo)
      elif probe.tzinfo is not None and last_date.tzinfo is None:
        probe = probe.tz_localize(None)
      if probe > last_date:
        if verbose:
          print(f'{fdate} is after the last available date {last_date}')
        return no_labels

      if verbose:
        print(f'{fdate}')
      fdate = fdate + timedelta(days=1)
      if verbose:
        print(f'{fdate} instead')

    dateidx = returns.index.get_loc(fdate)
    # The p-period return stored p rows after fdate is the return realised over
    # the p periods following fdate.
    get_return = lambda p, c: returns.iat[dateidx + p, c] if dateidx + p < len(returns) else np.nan
    return_labels = [get_return(per, col) for per, col in periods.items()]
    return pd.Series(return_labels,index=colnames)
  except Exception as e:
    print(ticker, returns.index.max(), returns.index.min(),fdate,e)
    return no_labels

In [0]:
def label_returns(data, datecol, tickercol, proxies=None):
  '''
  Return a copy of data with the columns 1_day_return, 3_day_return and
  5_day_return added, holding the future returns from row[datecol] for the stock
  with symbol row[tickercol]. A cell is NaN when the future return date lies
  beyond the last date for which prices were fetched.

  data      : DataFrame with one row per message.
  datecol   : name of the column holding the message date.
  tickercol : name of the column holding the ticker symbol.
  proxies   : optional dict mapping a ticker in data to the symbol whose prices
              should be fetched instead. Defaults to {'VIX': 'VIXY'}.
  '''
  if proxies is None:
    proxies = {'VIX': 'VIXY'}

  symbols = data[tickercol].unique()
  # min(max date + 10 days, day before yesterday)
  enddate = min(data[datecol].max() + timedelta(days=10), date.today() - timedelta(days=2))
  # Start 10 days before the first message of each symbol.
  startdates = {s: find_stop_date(data, s, datecol, tickercol) - timedelta(days=10) for s in symbols}
  # Prices are stored under the ticker used in data, even when a proxy symbol is
  # fetched. Looking the proxy up per symbol avoids a KeyError when a proxied
  # ticker such as 'VIX' does not occur in data.
  returns = {s: fetch_returns(proxies.get(s, s), startdates[s], enddate) for s in symbols}

  data_labels = data.apply(
      lambda row: get_return_labels(returns[row[tickercol]], row[datecol], row[tickercol]),
      axis=1)
  label_cols = data_labels.columns
  data_labelled = data.copy()
  data_labelled[label_cols] = data_labels
  return data_labelled

# Fetch return labels for Stocktwits

In [0]:
# Load the StockTwits messages from the pickle configured in stocktwits_file.
data = pd.read_pickle(stocktwits_file)
# YHOO not publicly traded since 2017, tweets not relevant to stock.
data = data.loc[~data['ticker'].eq('YHOO'), :]
# Show the available columns as the cell output.
data.columns

Index(['id', 'body', 'created_at', 'user', 'source', 'symbols',
       'mentioned_users', 'entities', 'filters', 'conversation', 'likes',
       'links', 'reshare_message', 'reshares', 'structurable', 'ticker',
       'user_followers', 'user_following', 'user_join_date', 'user_ideas',
       'user_identity', 'user_like_count', 'user_official',
       'user_wtchlst_count', 'username', 'sentiment', 'num_likes',
       'num_reshares', 'num_replies', 'day_counts', 'raw_body', 'char_length',
       'bearish_score', 'bullish_score', 'sentiment_pred'],
      dtype='object')

In [0]:
# Add the 1/3/5-day forward return columns to the StockTwits frame, keyed on each
# message's 'created_at' date and 'ticker'.
data_labelled = label_returns(data, datecol='created_at', tickercol='ticker')

In [0]:
# Persist the labelled StockTwits frame under a new file name (v3).
data_labelled.to_pickle(Path('StockTwits/stocktwits_labelled_v3.pkl'))

# Fetch return labels for Twitter

In [14]:
# Load the Twitter messages from the pickle configured in twitter_file.
data = pd.read_pickle(twitter_file)
# Normalise tickers: drop '$', trim surrounding whitespace, upper-case.
data['ticker'] = data['ticker'].apply(lambda raw: raw.replace('$', '').strip().upper())
# Keep only the calendar date of each entry in 'Date'.
data['Date'] = data['Date'].apply(lambda stamp: stamp.date())
# Show the available columns as the cell output.
data.columns

Index(['Date', 'Text', 'favorite_count', 'retweet_count', 'ticker',
       'bearish_score', 'bullish_score', 'sentiment_pred'],
      dtype='object')

In [0]:
# Add the 1/3/5-day forward return columns to the Twitter frame, keyed on each
# tweet's 'Date' and 'ticker'.
data_labelled = label_returns(data, datecol='Date', tickercol='ticker')

In [0]:
# Persist the labelled Twitter frame under a new file name (v3).
data_labelled.to_pickle(Path('Twitter/twitter_labelled_v3.pkl'))

# Rough work

In [0]:
# Spot check: fetch AAPL prices for 2020-01-01 .. 2020-01-13 and display the
# resulting close / return columns.
returns = fetch_returns('AAPL', date(2020,1,1), date(2020, 1,13))
returns

Unnamed: 0_level_0,close,1_day_return,3_day_return,5_day_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02 00:00:00+00:00,300.35,,,
2020-01-03 00:00:00+00:00,297.43,-0.009722,,
2020-01-06 00:00:00+00:00,299.8,0.007968,,
2020-01-07 00:00:00+00:00,298.39,-0.004703,-0.006526,
2020-01-08 00:00:00+00:00,303.19,0.016086,0.019366,
2020-01-09 00:00:00+00:00,309.63,0.021241,0.032789,0.030897
2020-01-10 00:00:00+00:00,310.33,0.002261,0.040015,0.043372
2020-01-13 00:00:00+00:00,316.96,0.021364,0.045417,0.057238


In [0]:
# Spot check: labels for 2020-01-06 looked up in the `returns` frame of the previous cell.
get_return_labels(returns, date(2020,1,6))

1_day_return   -0.004703
3_day_return    0.032789
5_day_return    0.057238
dtype: float64

In [17]:
# Fraction of missing values in each column of data_labelled.
data_labelled.isna().mean()

Date              0.000000
Text              0.000000
favorite_count    0.000000
retweet_count     0.000000
ticker            0.000000
bearish_score     0.000000
bullish_score     0.000000
sentiment_pred    0.000000
1_day_return      0.000000
3_day_return      0.000000
5_day_return      0.137736
dtype: float64

In [22]:
# Distinct dates of the rows that have no 5-day return label.
data_labelled.loc[data_labelled['5_day_return'].isna(),'Date'].unique()

array([datetime.date(2020, 3, 2), datetime.date(2020, 3, 1),
       datetime.date(2020, 2, 29)], dtype=object)