In [1]:
## Creating two Markov chains: one for days with volume jumps and another for days with volume drops

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import io, base64, os, json, re 
import pandas as pd
import numpy as np
import datetime
from random import randint

In [3]:
# Earliest trading day to keep; rows dated before this are discarded.
cut_off_date = '2012-03-01'

# Read the raw quotes, parse the Date column into datetimes and keep only the
# rows on or after the cut-off date, all in a single chained expression.
NASDAQ_df = (
    pd.read_csv('NASDAQ.csv')
    .assign(Date=lambda quotes: pd.to_datetime(quotes['Date']))
    .loc[lambda quotes: quotes['Date'] >= cut_off_date]
)

NASDAQ_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-04-25,3013.649902,3031.409912,3010.570068,3029.629883,3029.629883,1721330000
1,2012-04-26,3029.620117,3056.77002,3027.790039,3050.610107,3050.610107,1763510000
2,2012-04-27,3060.340088,3076.439941,3043.300049,3069.199951,3069.199951,1777750000
3,2012-04-30,3060.060059,3063.659912,3043.25,3046.360107,3046.360107,1633170000
4,2012-05-01,3044.790039,3085.399902,3041.620117,3050.439941,3050.439941,1854230000


In [5]:
# Take random sets of sequential rows.
#
# Each iteration draws a window of consecutive rows from NASDAQ_df and derives
# per-day features from it. The first row of every window has NaN *_Gap values
# (pct_change has no previous row) and the last row has a NaN outcome
# (shift(-1) has no next row).
N_SEQUENCES = 100000              # number of random windows to draw
MIN_WINDOW, MAX_WINDOW = 10, 30   # inclusive bounds on the window length (rows)
PROGRESS_EVERY = 2000             # print a progress line every this many windows
RANDOM_SEED = 42                  # fixed seed so a fresh kernel reproduces the same windows

# Seeded generator: without it every run samples different windows and none of
# the downstream numbers can be reproduced.
rng = np.random.default_rng(RANDOM_SEED)

new_set = []
for row_set in range(N_SEQUENCES):
    # endpoint=True makes the upper bound inclusive, matching random.randint.
    row_quant = int(rng.integers(MIN_WINDOW, MAX_WINDOW, endpoint=True))
    # The largest valid start leaves exactly row_quant rows before the end.
    row_start = int(rng.integers(0, len(NASDAQ_df) - row_quant, endpoint=True))
    market_subset = NASDAQ_df.iloc[row_start:row_start + row_quant]

    # Latest date in the window; used to label the whole sequence.
    Close_Date = market_subset['Date'].max()
    if row_set % PROGRESS_EVERY == 0:
        print(row_set)
        print(Close_Date)

    # Day-over-day relative changes, i.e. (x[t] - x[t-1]) / x[t-1].
    Close_Gap = market_subset['Close'].pct_change()
    High_Gap = market_subset['High'].pct_change()
    Low_Gap = market_subset['Low'].pct_change()
    Volume_Gap = market_subset['Volume'].pct_change()
    # Intraday move relative to the open.
    Daily_Change = (market_subset['Close'] - market_subset['Open']) / market_subset['Open']
    # Next day's volume minus today's volume (positive means volume went up).
    Outcome_Next_Day_Direction = (market_subset['Volume'].shift(-1) - market_subset['Volume'])

    # Scalars are broadcast over the index supplied by the Series columns,
    # so there is no need to materialise repeated lists.
    new_set.append(pd.DataFrame({'Sequence_ID': row_set,
                                 'Close_Date': Close_Date,
                                 'Close_Gap': Close_Gap,
                                 'High_Gap': High_Gap,
                                 'Low_Gap': Low_Gap,
                                 'Volume_Gap': Volume_Gap,
                                 'Daily_Change': Daily_Change,
                                 'Outcome_Next_Day_Direction': Outcome_Next_Day_Direction}))


0
2014-04-01 00:00:00
2000
2016-08-30 00:00:00
4000
2017-09-22 00:00:00
6000
2017-07-17 00:00:00
8000
2019-05-14 00:00:00
10000
2016-02-10 00:00:00
12000
2017-03-27 00:00:00
14000
2015-10-15 00:00:00
16000
2015-04-20 00:00:00
18000
2019-08-08 00:00:00
20000
2013-09-13 00:00:00
22000
2012-08-14 00:00:00
24000
2019-08-16 00:00:00
26000
2020-02-03 00:00:00
28000
2021-03-01 00:00:00
30000
2020-05-19 00:00:00
32000
2016-12-02 00:00:00
34000
2019-11-20 00:00:00
36000
2022-01-19 00:00:00
38000
2016-11-11 00:00:00
40000
2021-02-24 00:00:00
42000
2019-06-07 00:00:00
44000
2013-05-16 00:00:00
46000
2014-09-04 00:00:00
48000
2019-07-12 00:00:00
50000
2012-11-19 00:00:00
52000
2019-06-28 00:00:00
54000
2012-07-23 00:00:00
56000
2013-07-12 00:00:00
58000
2013-07-12 00:00:00
60000
2020-10-27 00:00:00
62000
2014-05-01 00:00:00
64000
2014-08-25 00:00:00
66000
2016-05-19 00:00:00
68000
2019-08-20 00:00:00
70000
2014-08-06 00:00:00
72000
2015-01-30 00:00:00
74000
2021-05-24 00:00:00
76000
2012-09-26 00:

In [6]:
# Row count of the last window left over from the sampling loop.
market_subset.shape[0]

14

In [9]:
# Stack all sampled windows into one frame and report its size.
new_set_df = pd.concat(new_set)
shape_before_dropna = new_set_df.shape
print(shape_before_dropna)

# Discard every row that has a missing value in any column, then report the
# size again.
new_set_df = new_set_df.dropna(axis=0, how='any')
shape_after_dropna = new_set_df.shape
print(shape_after_dropna)

new_set_df.iloc[-20:]

(1998151, 8)
(1798151, 8)


Unnamed: 0,Sequence_ID,Close_Date,Close_Gap,High_Gap,Low_Gap,Volume_Gap,Daily_Change,Outcome_Next_Day_Direction
19,99998,2012-06-04,-0.002855,0.006497,0.017465,-0.004582,-0.005182,67790000.0
20,99998,2012-06-04,0.003889,-0.004182,-0.009819,0.03641,0.006338,-179350000.0
21,99998,2012-06-04,-0.003768,0.001012,0.007913,-0.092945,-0.006011,-467610000.0
22,99998,2012-06-04,-0.000651,-0.004174,0.004309,-0.267161,-0.000778,-1061250000.0
23,99998,2012-06-04,0.011792,0.012831,0.005979,-0.827369,0.00621,1449610000.0
24,99998,2012-06-04,-0.011714,-0.012335,-0.007391,6.546584,-0.00348,509090000.0
25,99998,2012-06-04,-0.003531,-0.0017,-0.00838,0.304655,-0.003535,-213760000.0
26,99998,2012-06-04,-0.028246,-0.011364,-0.019529,-0.098049,-0.022294,-210620000.0
261,99999,2013-05-29,0.00804,0.002351,0.002398,-0.074739,0.006366,-74220000.0
262,99999,2013-05-29,0.000643,0.003055,0.00442,-0.043924,0.0027,205010000.0


In [10]:
# Create the event sequences.
#
# Simplify the data by binning each continuous feature into three
# equal-frequency groups (pd.qcut with 3 quantiles) labelled L, M and H.
# Only the features that make up the event pattern are binned; High_Gap and
# Low_Gap are not used in this example and are dropped from new_set_df.
#
# NOTE: this cell rebinds new_set_df to a frame without the raw *_Gap columns,
# so it cannot be re-run on its own output.
BIN_LABELS = ["L", "M", "H"]

Close_Gap_LMH = pd.qcut(new_set_df['Close_Gap'], 3, labels=BIN_LABELS)
Volume_Gap_LMH = pd.qcut(new_set_df['Volume_Gap'], 3, labels=BIN_LABELS)
Daily_Change_LMH = pd.qcut(new_set_df['Daily_Change'], 3, labels=BIN_LABELS)

# Three-letter code per day: close gap, volume gap, daily change (e.g. "HLM").
Event_Pattern = (Close_Gap_LMH.astype(str)
                 + Volume_Gap_LMH.astype(str)
                 + Daily_Change_LMH.astype(str))

# Build the reduced frame in one step. .assign works on a fresh copy, which
# avoids setting a column on a column-subset of another frame (the pattern that
# triggers pandas' SettingWithCopyWarning). Keyword order fixes column order.
new_set_df = new_set_df[["Sequence_ID", "Close_Date"]].assign(
    Close_Gap_LMH=Close_Gap_LMH,
    Volume_Gap_LMH=Volume_Gap_LMH,
    Daily_Change_LMH=Daily_Change_LMH,
    Outcome_Next_Day_Direction=new_set_df["Outcome_Next_Day_Direction"],
    Event_Pattern=Event_Pattern,
)
 

In [11]:
# Peek at the final ten rows of the binned frame.
new_set_df.iloc[-10:]

Unnamed: 0,Sequence_ID,Close_Date,Close_Gap_LMH,Volume_Gap_LMH,Daily_Change_LMH,Outcome_Next_Day_Direction,Event_Pattern
263,99999,2013-05-29,H,H,H,23390000.0,HHH
264,99999,2013-05-29,M,M,H,101850000.0,MMH
265,99999,2013-05-29,M,H,L,-117150000.0,MHL
266,99999,2013-05-29,H,L,H,-83350000.0,HLH
267,99999,2013-05-29,M,M,M,31520000.0,MMM
268,99999,2013-05-29,M,M,M,402550000.0,MMM
269,99999,2013-05-29,L,H,L,-358660000.0,LHL
270,99999,2013-05-29,M,L,H,-371460000.0,MLH
271,99999,2013-05-29,M,L,H,298860000.0,MLH
272,99999,2013-05-29,H,H,L,46580000.0,HHL


In [12]:
# Summary statistics of the per-day outcome column.
new_set_df.loc[:, 'Outcome_Next_Day_Direction'].describe()

count    1.798151e+06
mean     1.092118e+06
std      4.984430e+08
min     -3.722700e+09
25%     -1.663200e+08
50%     -6.430000e+06
75%      1.666200e+08
max      4.320700e+09
Name: Outcome_Next_Day_Direction, dtype: float64

In [13]:
# Reduce the set: one row per (Sequence_ID, Close_Date) whose Event_Pattern is
# the brace-wrapped, comma-separated list of that sequence's daily patterns.
def _join_patterns(patterns):
    """Render an iterable of pattern strings as "{p1, p2, ...}"."""
    return "{%s}" % ', '.join(patterns)


patterns_by_sequence = new_set_df.groupby(['Sequence_ID', 'Close_Date'])['Event_Pattern']
compressed_set = patterns_by_sequence.apply(_join_patterns).reset_index()

print(compressed_set.shape)
compressed_set.head()

(100000, 3)


Unnamed: 0,Sequence_ID,Close_Date,Event_Pattern
0,0,2014-04-01,"{HHH, MLM, MML, LML, MMM, LHL, MLH, LHL, LLM, ..."
1,1,2014-12-10,"{LML, HMH, MHL, HLH, MHM, HLH, MLM, LHL, HMH, ..."
2,2,2013-04-25,"{MLM, LHH, HLH, HHM, HHH, MMM, MLM, LHL, HLH, ..."
3,3,2020-12-04,"{LHL, HHH, LML, MML, HHH, HLM, HLH, MHL, HLM, ..."
4,4,2016-12-06,"{HLH, LML, HLH, MLH, HMH, LLL, HLH, MHM, MLM, ..."


In [14]:
# Average the per-day outcome within each (Sequence_ID, Close_Date) group and
# turn the grouping keys back into ordinary columns.
outcome_by_sequence = new_set_df.groupby(['Sequence_ID', 'Close_Date'])['Outcome_Next_Day_Direction']
compressed_outcomes = outcome_by_sequence.mean().reset_index()

print(compressed_outcomes.shape)
compressed_outcomes.describe()

(100000, 3)


Unnamed: 0,Sequence_ID,Outcome_Next_Day_Direction
count,100000.0,100000.0
mean,49999.5,1090473.0
std,28867.657797,47412230.0
min,0.0,-525846200.0
25%,24999.75,-16084170.0
50%,49999.5,-83846.15
75%,74999.25,16837780.0
max,99999.0,553883800.0
