# Imports

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import os

# Gather Data

In [2]:
twitter = pd.read_json(r"C:\Programming\Python\Programs_NLP\MLFinalProject\src\data\twitter_data_with_sentiment.json")

In [3]:
twitter.timestamp = pd.to_datetime(twitter.timestamp)

In [7]:
stock_dict = {}
stock_dir = '../src/data/CompleteStockData'

In [12]:
for file in os.listdir('../src/data/CompleteStockData'):
    if file.endswith('.csv'):
        symbol = file.replace('_stocks.csv','')
        stock_dict[symbol] = pd.read_csv(os.sep.join([stock_dir,file]), index_col='date')

In [39]:
for key in stock_dict.keys():
    stock_dict[key].index = pd.to_datetime(stock_dict[key].index)

In [40]:
twitter.head()

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified,sentiment_compound,sentiment_neg,sentiment_pos,sentiment_neu
0,1019696670777503700,VIDEO: “I was in my office. I was minding my o...,2018-07-18 21:33:26,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True,0.0,0.0,0.0,1.0
1,1019709091038548000,The price of lumber $LB_F is down 22% since hi...,2018-07-18 22:22:47,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True,0.0,0.0,0.0,1.0
10,1019723137481617400,U.S. proposes expedited appeal in fight with A...,2018-07-18 23:18:36,Reuters,TWX,Time Warner,https://reut.rs/2O1Ao46,True,-0.3818,0.167,0.0,0.833
100,1016200281749643300,Short sale volume(not short interest) for $D o...,2018-07-09 06:00:01,shortvolumes,AEE,Ameren Corporation,http://shortvolumes.com/?t=D,False,0.0,0.0,0.0,1.0
1000,1016656436762329100,$MO $SKT $TGT $PNR $ED $T $KMB $BEN $PEP $ORI ...,2018-07-10 12:12:37,SeekingAlpha,ED,Consolidated Edison,https://seekingalpha.com/article/4186312-high-...,False,0.0,0.0,0.0,1.0


In [41]:
output_df = twitter[twitter.symbols.apply(lambda sym: sym.lower() in stock_dict)][
    ['id', 'timestamp', 'symbols','sentiment_compound', 'sentiment_neg', 'sentiment_pos',
       'sentiment_neu']].copy()

In [58]:
def get_stock_days(sym, ts, output_price='4. close', perc_change=False):
    sym = sym.lower()
    
    before_close = False
    if ts.hour < 16:
        before_close = True
    
    if before_close:
        before_df = stock_dict[sym].loc[:ts + pd.Timedelta(days=1)]
        after_df = stock_dict[sym].loc[ts + pd.Timedelta(days=1):]
    else:
        before_df = stock_dict[sym].loc[:ts ]
        after_df = stock_dict[sym].loc[ts:]
    
    if not perc_change:
        output_df = pd.concat([before_df.iloc[-3:],after_df.iloc[:4]])
    else:
        output_df = pd.concat([before_df.iloc[-4:],after_df.iloc[:4]])
        output_df[output_price] = output_df[output_price].pct_change()
        output_df = output_df[1:]
    return {'t{}'.format(str(idx)):val for idx,val in enumerate(output_df[output_price], -3)}

In [59]:
get_stock_days('gs', twitter.iloc[0].timestamp, perc_change=True)

{'t-3': 0.022216333200830274,
 't-2': -0.0018147251987555846,
 't-1': 0.0009522985022940578,
 't0': -0.006962463241653727,
 't1': 0.008970953272656068,
 't2': 0.008934351935776297,
 't3': 0.009069130732375141}

In [51]:
output_df['price_data_dict'] = output_df.apply(lambda row: get_stock_days(row['symbols'], row['timestamp']), axis=1)

In [60]:
output_df['price_data_dict_perc'] = output_df.apply(lambda row: get_stock_days(row['symbols'], row['timestamp'], perc_change=True), axis=1)

In [55]:
for time_slice in ['t{}'.format(str(idx)) for idx in range(-3,4)]:
    output_df[time_slice] = output_df.price_data_dict.apply(lambda d: d.get(time_slice))

In [61]:
for time_slice in ['t{}_perc'.format(str(idx)) for idx in range(-3,4)]:
    output_df[time_slice] = output_df.price_data_dict_perc.apply(lambda d: d.get(time_slice))

In [57]:
output_df.drop('price_data_dict', axis=1).to_csv(r'C:\Programming\Python\Programs_NLP\MLFinalProject\src\data\twitter_features.csv')

In [62]:
output_df

Unnamed: 0,id,timestamp,symbols,sentiment_compound,sentiment_neg,sentiment_pos,sentiment_neu,price_data_dict,t-3,t-2,...,t2,t3,price_data_dict_perc,t-3_perc,t-2_perc,t-1_perc,t0_perc,t1_perc,t2_perc,t3_perc
0,1019696670777503700,2018-07-18 21:33:26,GS,0.0000,0.000,0.000,1.000,"{'t-3': 231.44, 't-2': 231.02, 't-1': 231.24, ...",231.44,231.02,...,233.76,235.88,"{'t-3': 0.022216333200830274, 't-2': -0.001814...",,,,,,,
1,1019709091038548000,2018-07-18 22:22:47,M,0.0000,0.000,0.000,1.000,"{'t-3': 36.99, 't-2': 37.07, 't-1': 37.7, 't0'...",36.99,37.07,...,39.41,39.36,"{'t-3': 0.016488046166529324, 't-2': 0.0021627...",,,,,,,
10,1019723137481617400,2018-07-18 23:18:36,TWX,-0.3818,0.167,0.000,0.833,"{'t-3': 97.95, 't-2': 98.77, 't-1': 98.77, 't0...",97.95,98.77,...,,,"{'t-3': 0.017979630014550052, 't-2': 0.0083716...",,,,,,,
100,1016200281749643300,2018-07-09 06:00:01,AEE,0.0000,0.000,0.000,1.000,"{'t-3': 62.03, 't-2': 59.97, 't-1': 60.5, 't0'...",62.03,59.97,...,61.39,61.34,"{'t-3': 0.007798537774167391, 't-2': -0.033209...",,,,,,,
1000,1016656436762329100,2018-07-10 12:12:37,ED,0.0000,0.000,0.000,1.000,"{'t-3': 76.86, 't-2': 77.86, 't-1': 79.03, 't0...",76.86,77.86,...,79.20,78.96,"{'t-3': -0.033207547169811336, 't-2': 0.013010...",,,,,,,
10000,1018861362213195800,2018-07-16 14:14:12,HAS,0.5423,0.137,0.312,0.551,"{'t-3': 96.52, 't-2': 94.35, 't-1': 94.02, 't0...",96.52,94.35,...,93.93,106.04,"{'t-3': -0.009034907597536002, 't-2': -0.02248...",,,,,,,
10001,1018861387299242000,2018-07-16 14:14:18,BEN,0.4588,0.000,0.130,0.870,"{'t-3': 32.33, 't-2': 32.1, 't-1': 32.11, 't0'...",32.33,32.10,...,31.98,32.48,"{'t-3': -0.006148170919151652, 't-2': -0.00711...",,,,,,,
10002,1018861392009580500,2018-07-16 14:14:20,LH,-0.3612,0.200,0.000,0.800,"{'t-3': 187.21, 't-2': 186.44, 't-1': 184.85, ...",187.21,186.44,...,186.94,188.27,"{'t-3': 0.0065053763440861, 't-2': -0.00411302...",,,,,,,
10003,1018861455075094500,2018-07-16 14:14:35,EFX,0.0000,0.000,0.000,1.000,"{'t-3': 126.99, 't-2': 126.64, 't-1': 125.88, ...",126.99,126.64,...,126.51,127.08,"{'t-3': -0.012135355892648914, 't-2': -0.00275...",,,,,,,
10004,1018861455985131500,2018-07-16 14:14:35,CCL,0.0000,0.000,0.000,1.000,"{'t-3': 58.05, 't-2': 58.4, 't-1': 58.59, 't0'...",58.05,58.40,...,58.14,58.15,"{'t-3': -0.0059931506849315586, 't-2': 0.00602...",,,,,,,
