# Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import os

# Gather Data

In [2]:
twitter = pd.read_json(r"C:\Programming\Python\Programs_NLP\MLFinalProject\src\data\twitter_data_with_sentiment.json")

In [3]:
twitter.timestamp = pd.to_datetime(twitter.timestamp)

In [202]:
stock_dict = {}
stock_dir = '../src/data/CompleteStockData'

In [203]:
for file in os.listdir('../src/data/CompleteStockData'):
    if file.endswith('.csv'):
        symbol = file.replace('_stocks.csv','')
        stock_dict[symbol] = pd.read_csv(os.sep.join([stock_dir,file]), index_col='date')

In [204]:
for key in stock_dict.keys():
    stock_dict[key].index = pd.to_datetime(stock_dict[key].index)

In [205]:
twitter.head(2)

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified,sentiment_compound,sentiment_neg,sentiment_pos,sentiment_neu
0,1019696670777503700,VIDEO: “I was in my office. I was minding my o...,2018-07-18 21:33:26,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True,0.0,0.0,0.0,1.0
1,1019709091038548000,The price of lumber $LB_F is down 22% since hi...,2018-07-18 22:22:47,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True,0.0,0.0,0.0,1.0


In [206]:
output_df = twitter[twitter.symbols.apply(lambda sym: sym.lower() in stock_dict)][
    ['id', 'timestamp', 'symbols','sentiment_compound', 'sentiment_neg', 'sentiment_pos',
       'sentiment_neu']].copy()

In [236]:
def get_stock_days(sym, ts, output_price='4. close', perc_change=False, close_override=None):
    sym = sym.lower()
    
    if close_override is None:
        before_close = False
        if ts.hour < 16:
            before_close = True
    else:
        before_close = close_override
    
    if before_close:
        location = np.where(stock_dict[sym].index.map(pd.datetime.date)==(ts.date()+pd.Timedelta(days=1)))[0]
    else:
        location = np.where(stock_dict[sym].index.map(pd.datetime.date)==ts.date())[0]
    if len(location)==0:
        return {}
    location = location[0]
    


    if not perc_change:
        res_df = stock_dict[sym].iloc[location-3:location+4].copy()
    else:
        res_df = stock_dict[sym].iloc[location-4:location+4].copy()
        res_df[output_price] = res_df[output_price].pct_change()
        res_df = res_df.iloc[1:]
    return {'t{}'.format(str(idx)):val for idx,val in enumerate(res_df[output_price], -3)}

In [237]:
output_df['price_data_dict'] = output_df.apply(lambda row: get_stock_days(row['symbols'], row['timestamp']), axis=1)

In [238]:
output_df['price_data_dict_perc'] = output_df.apply(lambda row: get_stock_days(row['symbols'], row['timestamp'], perc_change=True), axis=1)

In [239]:
for time_slice in ['t{}'.format(str(idx)) for idx in range(-3,4)]:
    output_df[time_slice] = output_df.price_data_dict.apply(lambda d: d.get(time_slice))

In [240]:
for time_slice in ['t{}_perc'.format(str(idx)) for idx in range(-3,4)]:
    output_df[time_slice] = output_df.price_data_dict_perc.apply(lambda d: d.get(time_slice.replace('_perc','')))

In [247]:
output_df.drop(['price_data_dict','price_data_dict_perc'], axis=1).to_csv(r'C:\Programming\Python\Programs_NLP\MLFinalProject\src\data\twitter_features.csv')

# Grouping by symbol + day

In [259]:
twitter.columns

Index(['id', 'text', 'timestamp', 'source', 'symbols', 'company_names', 'url',
       'verified', 'sentiment_compound', 'sentiment_neg', 'sentiment_pos',
       'sentiment_neu'],
      dtype='object')

In [260]:
grouped_twitter = twitter[twitter.symbols.apply(lambda sym: sym.lower() in stock_dict)][
    ['timestamp','sentiment_neg', 'sentiment_pos',
       'sentiment_neu','symbols']].assign(timestamp=twitter.timestamp.apply(pd.datetime.date)).copy().groupby(['symbols','timestamp']).sum().reset_index()

In [264]:
grouped_twitter['price_data_dict_perc'] = grouped_twitter.apply(lambda row: get_stock_days(row['symbols'], 
                                                                                           pd.to_datetime(row['timestamp']), 
                                                                                           perc_change=True,
                                                                                          close_override=False), axis=1)

In [265]:
for time_slice in ['t{}_perc'.format(str(idx)) for idx in range(-3,4)]:
    grouped_twitter[time_slice] = grouped_twitter.price_data_dict_perc.apply(lambda d: d.get(time_slice.replace('_perc','')))

In [266]:
grouped_twitter['sentiment_simple_agg'] = grouped_twitter.sentiment_pos - grouped_twitter.sentiment_neg

# DOJ Data

In [280]:
doj = pd.read_json(r"C:\Programming\Python\Programs_NLP\MLFinalProject\src\data\doj_data_with_tags_and_industries_and_sentiment.json")

In [281]:
doj.head()

Unnamed: 0,all_orgs,clean_orgs,components,contents,date,id,industries,organizations,organizations_titles,sectors,tagged_companies,tagged_symbols,title,topics,sentiment_compound,sentiment_neg,sentiment_neu,sentiment_pos
100,"[Stanley Black & Decker Inc., Honeywell Intern...","[stanley black & decker, honeywell internation...",[Environment and Natural Resources Division],Another important step toward cleaning u...,2012-11-19 05:00:00,12-1384,"[Industrial Machinery/Components, Auto Parts:O...","[Stanley Black & Decker Inc., Honeywell Intern...",[],"[Capital Goods, Capital Goods, Energy, Energy,...","[STANLEY BLACK & DECKER INC, HONEYWELL INTERNA...","[swk, hon, xom, hes, bwa, txt, utx]",Agreement Furthers Cleanup of the Quanta Resou...,[],0.986,0.043,0.865,0.092
10000,"[Goldstar Property Management, , North ...","[goldstar property management, , north star, t...",[Tax Division],"Troy A. Beam of Shippensburg, Pa., was sen...",2012-04-10 04:00:00,12-453,[],"[Goldstar Property Management, , North ...",[],[],[BEAM INC],[beam],Pennsylvania Tax Defier Sentenced to More Than...,[],-0.9502,0.091,0.849,0.06
10007,"[the Bureau of Alcohol, Tobacco and Firearms, ...","[the bureau of alcohol, tobacco and firearms, ...","[Civil Rights Division, Civil Rights - Crimina...","WASHINGTON– Bobby Joe Rogers, 41, of Pensacola...",2012-02-23 05:00:00,12-247,[Major Chemicals],"[the Bureau of Alcohol, Tobacco and Firearms, ...",[American Family Planning Clinic],[Basic Industries],[ROGERS CORP],[rog],"Pensacola, Florida, Man Indicted for Arson at ...",[],-0.9588,0.169,0.734,0.097
10020,"[the Federal Food Drug and Cosmetic Act, Proto...","[the federal food drug and cosmetic act, proto...",[Civil Division],Pfizer Inc. will pay $55 million plus interest...,2012-12-12 05:00:00,12-1488,"[Major Pharmaceuticals, Major Pharmaceuticals]","[the Federal Food Drug and Cosmetic Act, Proto...","[Off-Label Use, IllegallyPromoting Protonix]","[Health Care, Health Care]","[PFIZER INC, WYETH, WYETH, PFIZER INC]","[pfe, wye, wye, pfe]",Pfizer Agrees to Pay $55 Million for Illegally...,[Consumer Protection],0.9931,0.069,0.814,0.117
10021,"[The Department of Justice, H.C.P., Pfizer H.C...","[the department of justice, h.c.p, pfizer h.c....",[Criminal Division],"WASHINGTON – Pfizer H.C.P. Corporation, an i...",2012-08-07 04:00:00,12-980,[Major Pharmaceuticals],"[The Department of Justice, H.C.P., Pfizer H.C...",[Pfizer H.C.P. Corp.],[Health Care],"[PFIZER INC, WYETH, WYETH]","[pfe, wye, wye]",Pfizer H.C.P. Corp. Agrees to Pay $15 Million ...,[],0.9954,0.059,0.806,0.135


In [304]:
doj_expanded = {c:[] for c in ['date','id','sentiment_compound', 'sentiment_neg', 'sentiment_pos', 'sentiment_neu','symbol']}

In [305]:
for _,date,doc_id,sentiment_compound, sentiment_neg, sentiment_pos, sentiment_neu, symbols in doj[
    ['date','id','sentiment_compound', 'sentiment_neg', 'sentiment_pos','sentiment_neu', 'tagged_symbols']].itertuples():
    
    for sym in symbols:
        if sym in stock_dict:
            
            doj_expanded['id'].append(doc_id)
            doj_expanded['date'].append(date)
            doj_expanded['sentiment_compound'].append(sentiment_compound)
            doj_expanded['sentiment_neg'].append(sentiment_neg)
            doj_expanded['sentiment_pos'].append(sentiment_pos)
            doj_expanded['sentiment_neu'].append(sentiment_neu)
            doj_expanded['symbol'].append(sym)
            
#             doj_expanded['price_data'] = get_stock_days(sym, date, perc_change=False)
#             doj_expanded['price_data_perc'] = get_stock_days(sym, date, perc_change=True)
                        
    

In [306]:
doj_expanded_df = pd.DataFrame.from_dict(doj_expanded)

In [307]:
doj_expanded_df['price_data_dict'] = doj_expanded_df.apply(lambda row: get_stock_days(row['symbol'], row['date']), axis=1)
doj_expanded_df['price_data_dict_perc'] = doj_expanded_df.apply(lambda row: get_stock_days(row['symbol'], row['date'], perc_change=True), axis=1)

In [308]:
for time_slice in ['t{}_perc'.format(str(idx)) for idx in range(-3,4)]:
    doj_expanded_df[time_slice] = doj_expanded_df.price_data_dict_perc.apply(lambda d: d.get(time_slice.replace('_perc','')))

In [309]:
doj_expanded_df.drop('price_data_dict_perc'], axis=1).to_csv(r'C:\Programming\Python\Programs_NLP\MLFinalProject\src\data\twitter_features.csv')

Unnamed: 0,date,id,sentiment_compound,sentiment_neg,sentiment_pos,sentiment_neu,symbol,price_data_dict,price_data_dict_perc,t-3_perc,t-2_perc,t-1_perc,t0_perc,t1_perc,t2_perc,t3_perc
0,2012-11-19 05:00:00,12-1384,0.9860,0.043,0.092,0.865,swk,"{'t-3': 66.77, 't-2': 67.41, 't-1': 69.86, 't0...","{'t-3': -0.006546644844517169, 't-2': 0.009585...",-0.006547,0.009585,0.036345,0.002863,-0.000428,0.019420,0.000420
1,2012-11-19 05:00:00,12-1384,0.9860,0.043,0.092,0.865,hon,"{'t-3': 59.33, 't-2': 59.15, 't-1': 60.44, 't0...","{'t-3': -0.01050700466977994, 't-2': -0.003033...",-0.010507,-0.003034,0.021809,0.002813,-0.000330,0.011058,-0.009794
2,2012-11-19 05:00:00,12-1384,0.9860,0.043,0.092,0.865,xom,"{'t-3': 86.14, 't-2': 86.45, 't-1': 87.67, 't0...","{'t-3': 0.0008132915069130053, 't-2': 0.003598...",0.000813,0.003599,0.014112,-0.001939,0.005829,0.012271,-0.005276
3,2012-11-19 05:00:00,12-1384,0.9860,0.043,0.092,0.865,hes,"{'t-3': 48.89, 't-2': 48.91, 't-1': 50.49, 't0...","{'t-3': -0.012921461740359352, 't-2': 0.000409...",-0.012921,0.000409,0.032304,-0.018618,0.011100,0.020359,-0.023865
4,2012-11-19 05:00:00,12-1384,0.9860,0.043,0.092,0.865,bwa,"{'t-3': 61.93, 't-2': 61.75, 't-1': 63.49, 't0...","{'t-3': 0.008139345596614111, 't-2': -0.002906...",0.008139,-0.002907,0.028178,0.000473,0.003149,0.024796,-0.007963
5,2012-11-19 05:00:00,12-1384,0.9860,0.043,0.092,0.865,txt,"{'t-3': 23.14, 't-2': 23.12, 't-1': 23.57, 't0...","{'t-3': -0.002586206896551646, 't-2': -0.00086...",-0.002586,-0.000864,0.019464,-0.000424,0.001273,0.008054,0.001262
6,2012-11-19 05:00:00,12-1384,0.9860,0.043,0.092,0.865,utx,"{'t-3': 74.84, 't-2': 75.25, 't-1': 76.58, 't0...","{'t-3': 0.0025452109845947035, 't-2': 0.005478...",0.002545,0.005478,0.017674,0.001436,0.008345,0.016552,0.000890
7,2012-12-12 05:00:00,12-1488,0.9931,0.069,0.117,0.814,pfe,"{'t-3': 25.41, 't-2': 25.64, 't-1': 25.51, 't0...","{'t-3': -0.005868544600938885, 't-2': 0.009051...",-0.005869,0.009052,-0.005070,-0.007056,-0.005922,0.006950,0.011240
8,2012-12-12 05:00:00,12-1488,0.9931,0.069,0.117,0.814,pfe,"{'t-3': 25.41, 't-2': 25.64, 't-1': 25.51, 't0...","{'t-3': -0.005868544600938885, 't-2': 0.009051...",-0.005869,0.009052,-0.005070,-0.007056,-0.005922,0.006950,0.011240
9,2012-08-07 04:00:00,12-980,0.9954,0.059,0.135,0.806,pfe,"{'t-3': 24.28, 't-2': 24.26, 't-1': 23.74, 't0...","{'t-3': 0.015050167224080147, 't-2': -0.000823...",0.015050,-0.000824,-0.021434,0.003791,0.001679,0.002933,-0.009190


# Some regressions

In [248]:
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [254]:
X = output_df[output_df.sentiment_pos>0].dropna()[['sentiment_pos','t-3_perc','t-2_perc','t-1_perc']]
y = output_df[output_df.sentiment_pos>0].dropna().t3_perc

In [255]:
X = sm.add_constant(X)

  return ptp(axis=axis, out=out, **kwargs)


In [256]:
model = sm.OLS(y,X)
results = model.fit()

In [257]:
results.summary()

0,1,2,3
Dep. Variable:,t3_perc,R-squared:,0.027
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,66.37
Date:,"Thu, 14 Mar 2019",Prob (F-statistic):,1.8e-55
Time:,22:47:35,Log-Likelihood:,26580.0
No. Observations:,9499,AIC:,-53150.0
Df Residuals:,9494,BIC:,-53110.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0003,0.000,0.959,0.337,-0.000,0.001
sentiment_pos,-0.0041,0.002,-2.427,0.015,-0.007,-0.001
t-3_perc,-0.0992,0.010,-10.421,0.000,-0.118,-0.081
t-2_perc,0.0378,0.010,3.656,0.000,0.018,0.058
t-1_perc,0.1198,0.010,12.145,0.000,0.100,0.139

0,1,2,3
Omnibus:,1979.332,Durbin-Watson:,1.848
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34462.52
Skew:,-0.527,Prob(JB):,0.0
Kurtosis:,12.272,Cond. No.,70.0


In [258]:
X.shape

(9499, 5)

In [272]:
X = grouped_twitter.dropna()[['sentiment_simple_agg','t-3_perc','t-2_perc','t-1_perc']]
y = grouped_twitter.dropna().t3_perc

In [273]:
X = sm.add_constant(X)

In [274]:
model = sm.OLS(y,X)
results = model.fit()

In [275]:
results.summary()

0,1,2,3
Dep. Variable:,t3_perc,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.983
Date:,"Thu, 14 Mar 2019",Prob (F-statistic):,0.416
Time:,22:51:41,Log-Likelihood:,5031.1
No. Observations:,1741,AIC:,-10050.0
Df Residuals:,1736,BIC:,-10020.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0006,0.000,-1.542,0.123,-0.001,0.000
sentiment_simple_agg,7.509e-05,0.000,0.381,0.703,-0.000,0.000
t-3_perc,-0.0201,0.022,-0.896,0.370,-0.064,0.024
t-2_perc,-0.0068,0.023,-0.298,0.766,-0.051,0.038
t-1_perc,0.0371,0.023,1.632,0.103,-0.008,0.082

0,1,2,3
Omnibus:,315.089,Durbin-Watson:,2.095
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7277.646
Skew:,0.007,Prob(JB):,0.0
Kurtosis:,13.016,Cond. No.,138.0


In [276]:
grouped_twitter

Unnamed: 0,symbols,timestamp,sentiment_neg,sentiment_pos,sentiment_neu,price_data_dict_perc,t-3_perc,t-2_perc,t-1_perc,t0_perc,t1_perc,t2_perc,t3_perc,sentiment_simple_agg
0,A,2018-07-16,0.000,0.000,1.000,"{'t-3': -0.012721847023716037, 't-2': 0.009385...",-0.012722,0.009386,-0.001103,-0.010413,0.005102,-0.000793,0.018416,0.000
1,A,2018-07-17,0.264,1.917,44.819,"{'t-3': 0.009385937002863498, 't-2': -0.001103...",0.009386,-0.001103,-0.010413,0.005102,-0.000793,0.018416,-0.004677,1.653
2,A,2018-07-18,0.298,1.672,36.030,"{'t-3': -0.0011032308904649346, 't-2': -0.0104...",-0.001103,-0.010413,0.005102,-0.000793,0.018416,-0.004677,-0.002036,1.374
3,AABA,2018-07-11,0.747,0.590,11.662,"{'t-3': 0.02740286298568506, 't-2': 0.00331740...",0.027403,0.003317,0.004761,-0.029090,0.011524,-0.001742,0.002417,-0.157
4,AABA,2018-07-12,0.309,1.063,12.629,"{'t-3': 0.003317409766454338, 't-2': 0.0047612...",0.003317,0.004761,-0.029090,0.011524,-0.001742,0.002417,0.009376,0.754
5,AABA,2018-07-13,0.000,0.488,3.512,"{'t-3': 0.004761274963629214, 't-2': -0.029090...",0.004761,-0.029090,0.011524,-0.001742,0.002417,0.009376,-0.009289,0.488
6,AABA,2018-07-14,0.318,0.000,0.682,{},,,,,,,,-0.318
7,AABA,2018-07-15,1.118,0.817,9.065,{},,,,,,,,-0.301
8,AABA,2018-07-16,0.000,0.370,4.630,"{'t-3': -0.029090430433065584, 't-2': 0.011523...",-0.029090,0.011524,-0.001742,0.002417,0.009376,-0.009289,-0.017412,0.370
9,AABA,2018-07-17,0.000,0.200,3.800,"{'t-3': 0.011523861171366612, 't-2': -0.001742...",0.011524,-0.001742,0.002417,0.009376,-0.009289,-0.017412,-0.001227,0.200
