In [30]:
from project_helper import TweetData, IntradayData, FuturesCloseData
import pandas as pd
import numpy as np
from pytz import timezone
import datetime
from datetime import timedelta  
from tqdm import tqdm
import copy
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Intraday S&P500 futures feature

In [3]:
tweet_data = TweetData()
tweet_data.raw_tweets.head()

Unnamed: 0_level_0,tweets
timestamp,Unnamed: 1_level_1
2019-11-17 19:57:12-06:00,"""Tell Jennifer Williams whoever that is to rea..."
2019-11-17 19:56:02-06:00,"""https://t.co/I3lO117SVh"
2019-11-17 19:49:47-06:00,"""Paul Krugman of @nytimes has been wrong about..."
2019-11-17 19:47:32-06:00,"""Schiff is a Corrupt Politician! https://t.co/..."
2019-11-17 19:30:09-06:00,""".@SteveScalise blew the nasty &amp; obnoxious..."


In [3]:
md = IntradayData()
fin_data = md.get_data()

In [4]:
# Number of tweets with exactly the same timestamp
len(tweet_data.raw_tweets.index) - len(set(tweet_data.raw_tweets.index))

321

In [5]:
print(fin_data.shape)
fin_data.head()

(1040156, 2)


Unnamed: 0_level_0,Open,Close
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-11-13 17:01:00-06:00,2183.0,2183.25
2016-11-13 17:02:00-06:00,2183.25,2182.0
2016-11-13 17:03:00-06:00,2182.0,2182.75
2016-11-13 17:04:00-06:00,2182.5,2182.5
2016-11-13 17:05:00-06:00,2182.75,2183.0


In [6]:
subset = (tweet_data.raw_tweets.index > fin_data.index[0]) & (tweet_data.raw_tweets.index < fin_data.index[-1])
sub_data = tweet_data.raw_tweets[subset]

In [7]:
sub_data.head()

Unnamed: 0_level_0,tweets
timestamp,Unnamed: 1_level_1
2019-11-08 03:08:53-06:00,"""https://t.co/z0I7wBsgTP"
2019-11-08 00:08:15-06:00,"""STATEMENT FROM PRESIDENT DONALD J. TRUMP http..."
2019-11-07 15:43:29-06:00,"""Stock Market up big today. A New Record. Enjoy!"
2019-11-07 15:41:53-06:00,"""The Radical Left Dems and LameStream Media ar..."
2019-11-07 15:27:57-06:00,"""The Amazon Washington Post and three lowlife ..."


In [8]:
ts_pre = [ time + datetime.timedelta(seconds = - time.second) for i, time in enumerate(sub_data.index)]
ts_post = [ time + datetime.timedelta(seconds = 60*1 - time.second) for i, time in enumerate(sub_data.index)]
ts_1min = [ time + datetime.timedelta(seconds = 60*2 - time.second) for i, time in enumerate(sub_data.index)]
ts_5min = [ time + datetime.timedelta(seconds = 60*6 - time.second) for i, time in enumerate(sub_data.index)]
ts_15min = [ time + datetime.timedelta(seconds = 60*16 - time.second) for i, time in enumerate(sub_data.index)]

In [9]:
min_dict = {}
for i, ts in enumerate(ts_post):
    try:
        min_dict[ts] = min_dict[ts] + sub_data.tweets.iloc[i]
    except KeyError:
        min_dict[ts] = sub_data.tweets.iloc[i]
        
data_min = pd.DataFrame(data = min_dict.values(), index = min_dict.keys())
data_min.columns = ['tweets']
data_min.index.name = 'timestamp'
data_min.head()

Unnamed: 0_level_0,tweets
timestamp,Unnamed: 1_level_1
2019-11-08 03:09:00-06:00,"""https://t.co/z0I7wBsgTP"
2019-11-08 00:09:00-06:00,"""STATEMENT FROM PRESIDENT DONALD J. TRUMP http..."
2019-11-07 15:44:00-06:00,"""Stock Market up big today. A New Record. Enjoy!"
2019-11-07 15:42:00-06:00,"""The Radical Left Dems and LameStream Media ar..."
2019-11-07 15:28:00-06:00,"""The Amazon Washington Post and three lowlife ..."


In [10]:
ts_dict = {a:b for a, b in zip(ts_post, sub_data.index)}

In [11]:
for ret, ts in zip(['ret_1', 'ret_5', 'ret_15'],[ts_1min,ts_5min,ts_15min] ):
    imp_open = fin_data.loc[ts_post]['Open']
    imp_close = fin_data.loc[ts]['Close']
    hl = (imp_open - imp_close.values)/imp_open
    hl = hl.loc[~hl.index.duplicated(keep='first')]
    data_min[ret] = hl.values
    
data_min = data_min.dropna()

In [12]:
data_min.head()

Unnamed: 0_level_0,tweets,ret_1,ret_5,ret_15
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-11-08 03:09:00-06:00,"""https://t.co/z0I7wBsgTP",0.000162,0.000406,0.000406
2019-11-08 00:09:00-06:00,"""STATEMENT FROM PRESIDENT DONALD J. TRUMP http...",0.000162,0.000244,0.000244
2019-11-07 15:44:00-06:00,"""Stock Market up big today. A New Record. Enjoy!",-8.1e-05,0.0,0.0
2019-11-07 15:42:00-06:00,"""The Radical Left Dems and LameStream Media ar...",0.0,0.0,0.0
2019-11-07 14:52:00-06:00,"""ÒWhat did Hunter Biden do for the money?Ó @Se...",0.00073,0.00073,-8.1e-05


In [13]:
after_4_tweets = data_min.index.hour >= 15
data_min['after4_date'] = data_min.index
data_min.after4_date[after_4_tweets] +=  timedelta(days=1)
data_min.after4_date =data_min.after4_date.dt.date

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
mini = data_min.groupby('after4_date').min()
maxi = data_min.groupby('after4_date').max()
features = pd.DataFrame(index = mini.index, columns=['intra_ret_1', 'intra_ret_5', 'intra_ret_15'] )

In [15]:
for ind in mini.index: 
    for ret in ['ret_1', 'ret_5', 'ret_15']:
        if abs(mini[ret].loc[ind]) > abs(maxi[ret].loc[ind]):
            features['intra_' + ret].loc[ind] = mini[ret].loc[ind]
        else:
            features['intra_' + ret].loc[ind] = maxi[ret].loc[ind]
            

In [16]:
features.index.name = 'timestamp'
features['intra_blend'] = features.mean(axis=1)
features['intra_blend'].to_csv('features/intraday.csv')

In [17]:
features.loc[datetime.date(2019, 8, 24)]

intra_ret_1    -0.00325332
intra_ret_5    -0.00633078
intra_ret_15   -0.00527565
intra_blend    -0.00495325
Name: 2019-08-24, dtype: object

In [18]:
data_min.tail()

Unnamed: 0_level_0,tweets,ret_1,ret_5,ret_15,after4_date
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-11-16 11:29:00-06:00,"""I am not trying to get """"top level security c...",-0.000114,-0.000571,-0.000114,2016-11-16
2016-11-16 02:56:00-06:00,"""Very organized process taking place as I deci...",0.000228,0.000683,0.000455,2016-11-16
2016-11-15 13:41:00-06:00,"""The Electoral College is actually genius in t...",0.0,-0.000456,-0.001141,2016-11-15
2016-11-15 13:35:00-06:00,"""If the election were based on total popular v...",-0.000114,-0.000571,-0.000685,2016-11-15
2016-11-13 18:47:00-06:00,"""The debates especially the second and third p...",-0.000114,-0.000343,-0.000685,2016-11-14


In [19]:
data_min_sort = data_min.dropna().sort_values(by = 'ret_5')
print(data_min_sort.shape)
data_min_sort.head()

(5923, 5)


Unnamed: 0_level_0,tweets,ret_1,ret_5,ret_15,after4_date
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-08-23 14:58:00-05:00,"""As usual the Fed did NOTHING! It is incredibl...",-0.001938,-0.00793,-0.006696,2019-08-23
2019-08-23 15:00:00-05:00,"""....all deliveries of Fentanyl from China (or...",-0.003253,-0.006331,-0.005276,2019-08-24
2018-12-07 11:54:00-06:00,"""....Foundation be listed at the top of the Re...",-0.002064,-0.003846,-0.002626,2018-12-07
2018-12-21 14:42:00-06:00,"""There has never been a president who has been...",-0.001752,-0.003608,-0.002165,2018-12-21
2018-12-19 14:45:00-06:00,"""The Trump Foundation has done great work and ...",-0.00179,-0.00358,-0.00169,2018-12-19


In [22]:
data_min_sort.to_csv('results/sorted_trump.csv')

In [24]:
tweet_data.clean_tweets

Unnamed: 0_level_0,tweets,timestamp,after4_date
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-11-17 19:57:12-06:00,tell jennifer williams whoever that is to read...,2019-11-17 19:57:12-06:00,2019-11-18
2019-11-17 19:56:02-06:00,,2019-11-17 19:56:02-06:00,2019-11-18
2019-11-17 19:49:47-06:00,paul krugman of has been wrong about me from t...,2019-11-17 19:49:47-06:00,2019-11-18
2019-11-17 19:47:32-06:00,schiff is a corrupt politician,2019-11-17 19:47:32-06:00,2019-11-18
2019-11-17 19:30:09-06:00,blew the nasty amp obnoxious chris wallace wil...,2019-11-17 19:30:09-06:00,2019-11-18
2019-11-17 19:26:04-06:00,blew the nasty amp obnoxious chris wallace wil...,2019-11-17 19:26:04-06:00,2019-11-18
2019-11-17 18:34:46-06:00,thanks eric,2019-11-17 18:34:46-06:00,2019-11-18
2019-11-17 18:10:59-06:00,,2019-11-17 18:10:59-06:00,2019-11-18
2019-11-17 18:10:19-06:00,,2019-11-17 18:10:19-06:00,2019-11-18
2019-11-17 17:54:12-06:00,,2019-11-17 17:54:12-06:00,2019-11-18


# Futures Data Source

In [1]:
import pandas as pd
futs = pd.read_csv('E:/data/quandl/futures/futures.csv')

In [2]:
fut_names = sorted(list(set(futs['name'])))

In [18]:
with open('fut_names.txt','w') as f:
    for i in range(len(fut_names) ):
        f.write(fut_names[i] + '\n')

In [11]:
print(sorted(list(set(futs['symbol']))))

['AD', 'AL', 'ATW', 'B', 'BO', 'BP', 'C', 'CC', 'CD', 'CL', 'CT', 'CU', 'DA', 'DX', 'EC', 'ED', 'ES', 'FBTP', 'FDAX', 'FESX', 'FF', 'FGBL', 'FGBM', 'FGBS', 'FOAT', 'FV', 'G', 'GC', 'HG', 'HO', 'I', 'JY', 'KC', 'KW', 'L', 'LB', 'LC', 'LN', 'M', 'MD', 'MP', 'MW', 'NE', 'NG', 'NK', 'NQ', 'O', 'OJ', 'PA', 'PB', 'PL', 'R', 'RB', 'RF', 'RR', 'RS1', 'RTY', 'RU', 'S', 'SB', 'SF', 'SI', 'SM', 'SP', 'SXF', 'T', 'TF', 'TU', 'TY', 'US', 'VX', 'W', 'YM', 'Z', 'ZN']


In [33]:
futs[futs.name == 'CME Euro FX Futures #1 (EC1) - Front Month - Unadjusted Prices, Roll on Open Interest Switch'].head()

Unnamed: 0,quandl_code,name,exchange,symbol,depth,method,date,open,high,low,settle,volume,prev_day_open_interest,front_contract
31996,CME_EC1_ON,CME Euro FX Futures #1 (EC1) - Front Month - U...,CME,EC,1,ON,2018-06-18,1.1676,1.1705,1.16445,1.16955,181774,461883,ECU2018
134844,CME_EC1_ON,CME Euro FX Futures #1 (EC1) - Front Month - U...,CME,EC,1,ON,2018-06-19,1.17005,1.1724,1.16095,1.1653,294547,460251,ECU2018
344786,CME_EC1_ON,CME Euro FX Futures #1 (EC1) - Front Month - U...,CME,EC,1,ON,2018-06-20,1.1664,1.16765,1.1613,1.1663,228246,464777,ECU2018
650755,CME_EC1_ON,CME Euro FX Futures #1 (EC1) - Front Month - U...,CME,EC,1,ON,2018-06-21,1.16485,1.17085,1.1582,1.1698,325658,469309,ECU2018
669085,CME_EC1_ON,CME Euro FX Futures #1 (EC1) - Front Month - U...,CME,EC,1,ON,2018-06-22,1.168,1.17495,1.16745,1.17375,233222,474422,ECU2018


In [63]:

sorted(futs[futs.name == 'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch'].date)

['2014-01-02',
 '2014-01-03',
 '2014-01-06',
 '2014-01-07',
 '2014-01-08',
 '2014-01-09',
 '2014-01-10',
 '2014-01-13',
 '2014-01-14',
 '2014-01-15',
 '2014-01-16',
 '2014-01-17',
 '2014-01-21',
 '2014-01-22',
 '2014-01-23',
 '2014-01-24',
 '2014-01-27',
 '2014-01-28',
 '2014-01-29',
 '2014-01-30',
 '2014-01-31',
 '2014-02-03',
 '2014-02-04',
 '2014-02-05',
 '2014-02-06',
 '2014-02-07',
 '2014-02-10',
 '2014-02-11',
 '2014-02-12',
 '2014-02-13',
 '2014-02-14',
 '2014-02-18',
 '2014-02-19',
 '2014-02-20',
 '2014-02-21',
 '2014-02-24',
 '2014-02-25',
 '2014-02-26',
 '2014-02-27',
 '2014-02-28',
 '2014-03-03',
 '2014-03-04',
 '2014-03-05',
 '2014-03-06',
 '2014-03-07',
 '2014-03-10',
 '2014-03-11',
 '2014-03-12',
 '2014-03-13',
 '2014-03-14',
 '2014-03-17',
 '2014-03-18',
 '2014-03-19',
 '2014-03-20',
 '2014-03-21',
 '2014-03-24',
 '2014-03-25',
 '2014-03-26',
 '2014-03-27',
 '2014-03-28',
 '2014-03-31',
 '2014-04-01',
 '2014-04-02',
 '2014-04-03',
 '2014-04-04',
 '2014-04-07',
 '2014-04-

In [56]:
xxx = futs[futs.symbol=='EC']
sorted(list(set(xxx[xxx.date=='2015-06-18'].name)))

['CME Euro FX Futures #1 (EC1) - Front Month - Backwards Panama Adjusted Prices, Roll on First of Month',
 'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Panama Adjusted Prices, Roll on Last Trading Day',
 'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Ratio Adjusted Prices, Roll on First of Month',
 'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Ratio Adjusted Prices, Roll on Last Trading Day',
 'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Ratio Adjusted Prices, Roll on Open Interest Switch',
 'CME Euro FX Futures #1 (EC1) - Front Month - Calendar-Weighted Adjusted Prices, Roll on First of Month',
 'CME Euro FX Futures #1 (EC1) - Front Month - Calendar-Weighted Adjusted Prices, Roll on Last Trading Day',
 'CME Euro FX Futures #1 (EC1) - Front Month - Forwards Panama Adjusted Prices, Roll on First of Month',
 'CME Euro FX Futures #1 (EC1)

In [1]:
instruments = [
'CBOT Wheat Futures #2 (W2) - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CBOT Soybeans Futures #2 (S2) - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CBOT Corn Futures #2 (C2) - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CBOT 30-year US Treasury Bond Futures #1 (US1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CBOT 10-year US Treasury Note Futures #1 (TY1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CME Japanese Yen JPY Futures #1 (JY1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CME Mexican Peso Futures #1 (MP1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'NYMEX Gold Futures #1 (GC1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'NYMEX WTI Crude Oil Futures #1 (CL1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CME S&P 500 Index E-Mini Futures #1 (ES1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CME NASDAQ 100 Index Mini Futures #1 (NQ1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'CME Canadian Dollar CAD Futures #1 (CD1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'EUREX Euro-Bund Futures #1 (FGBL1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
'EUREX DAX Futures #1 (FDAX1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch']

In [29]:
db = futs[futs.name.isin(instruments)]


In [30]:
db_final = db[['name', 'symbol', 'date', 'open', 'high', 'low', 'settle', 'volume']].sort_values(by=['name', 'date']).reset_index(drop=True)

In [31]:
list(set(db_final.name))

['CME S&P 500 Index E-Mini Futures #1 (ES1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'CBOT Corn Futures #2 (C2) - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'EUREX Euro-Bund Futures #1 (FGBL1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'NYMEX WTI Crude Oil Futures #1 (CL1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'CME Euro FX Futures #1 (EC1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'CME Japanese Yen JPY Futures #1 (JY1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'CBOT 30-year US Treasury Bond Futures #1 (US1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'CME Canadian Dollar CAD Futures #1 (CD1) - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch',
 'NYMEX Gold Futures #1 (GC1) - Front Month - Backwards

In [45]:
db_final_name = db_final.name.str.rstrip(' - Front Month - Backwards Panama Adjusted Prices, Roll on Open Interest Switch')

In [46]:
list(set(db_final_name))

['CBOT Wheat Futures #2 (W2)',
 'EUREX Euro-Bund Futures #1 (FGBL1)',
 'CBOT Corn Futures #2 (C2)',
 'CME Euro FX Futures #1 (EC1)',
 'CBOT 30-year US Treasury Bond Futures #1 (US1)',
 'CME NASDAQ 100 Index Mini Futures #1 (NQ1)',
 'CME Japanese Yen JPY Futures #1 (JY1)',
 'CME Mexican Peso Futures #1 (MP1)',
 'CME Canadian Dollar CAD Futures #1 (CD1)',
 'CBOT 10-year US Treasury Note Futures #1 (TY1)',
 'EUREX DAX Futures #1 (FDAX1)',
 'NYMEX Gold Futures #1 (GC1)',
 'NYMEX WTI Crude Oil Futures #1 (CL1)',
 'CME S&P 500 Index E-Mini Futures #1 (ES1)',
 'CBOT Soybeans Futures #2 (S2)']