## Necessary Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob
import pickle

from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import normalize


from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

## Inputs

In [2]:
# ---- Data source and bar construction ----
data_folder_name = 'data/historical_price_data/ADABTC'   # the folder where data is stored
bar_type = 'time'     # type of bars; possible values: dollar, time, ticks, volume
threshold = 60        # threshold for the given type of bar

# ---- Normalization ----
before = True         # NOTE(review): not read anywhere in the visible notebook -- confirm usage

# ---- Labels (triple-barrier method) ----
volatility_threshold = 10   # threshold in bars for volatility (standard deviation of returns)
sampling = False            # forwarded to get_barrier_labels
v_bars_duration = 1         # threshold in bars for the vertical bars of the triple-barrier method
barrier_conf = [1, 1]       # [0] denotes stop loss, [1] denotes profit taking
min_return = 0              # minimum value for return in the triple-barrier method
risk = 0                    # forwarded to get_barrier_labels
sign_label = True           # forwarded to get_barrier_labels
visualize_plot = False      # flag for visualizing plots

# ---- Features: look-back periods per indicator ----
sma_period = [10, 20] # [10, 15, 20]
ema_period = [10, 20] # [10, 15, 20]
BB_period  = [15]
rsi_period = [15]
williamsr_period = [15]
roc_period = [15]
adl_period = [15]
vpt_period = [0] # 0:  period is not required
emv_period = [0] # 0:  period is not required

# feature_list[k] is computed once for every period in period_all[k];
# the two lists must stay in the same order and have the same length.
feature_list = ['sma',      'ema',    'BB',       'rsi',     'williamsr',        'roc',
                'adl',     'vpt',   'emv']
period_all = [sma_period, ema_period, BB_period, rsi_period, williamsr_period, roc_period,
              adl_period, vpt_period, emv_period]


# ---- Cache file names derived from the inputs above ----
# The symbol is the last component of data_folder_name. Taking it from the path,
# instead of slicing at a fixed character offset, keeps the cache names correct
# when the data directory is moved or renamed.
pair_name = data_folder_name.rstrip('/').rsplit('/', 1)[-1]

pkl_rawData = 'save_runs/MAIN_df_' + pair_name   # raw data pickle
pkl_bars = "save_runs/cstk_" + bar_type + "_" + str(threshold) + '_' + pair_name   # bars pickle
# NOTE(review): sampling, risk and sign_label are also passed to get_barrier_labels but are
# not part of this name, so changing them would reuse a previously cached label file -- confirm.
pkl_labels = (pkl_bars + '_' + str(volatility_threshold) + "_" + str(v_bars_duration)
              + "_" + str(barrier_conf) + "_" + str(min_return) + "_labels")



## Loading Necessary Classes

In [3]:
#loading and preprocessing
from load_data.loadData import LoadData
from preprocessing.preProcessData import PreProcessData
from csticks.createcandleStick import createCandleStick

#normalize 
from normalize.norm import Normalizer

#labeling
from labelling.labelgenerator import LabelGenerator


# Features
from feature.featureExtraction import FeatureExtraction
from feature.featureExtractionVisual import FVisual
from feature.featureVerifyVisual import VerifyFeature

#model selection
from mlmodel.split import Split
from mlmodel.performanceMetrics import Metrics
from mlmodel.mlclassfier import MLClassifier
from mlmodel.sequential_bootstrap import sequentialBootstrap
from mlmodel.analysis import Analyser
from mlmodel.validation import Validation

# ---- Object creation: one shared instance of each project helper ----

# Normalization
nl_data = Normalizer()

# Loading, preprocessing, bar construction and labelling
ld_data = LoadData()
pp_data = PreProcessData()
cstk_ob = createCandleStick()
lbl_ob = LabelGenerator()

# Feature extraction, visualisation and verification
fe_ob = FeatureExtraction()
fe_vis = FVisual()
fe_verify = VerifyFeature()

# Model selection and evaluation
split_ob = Split()
metrics_ob = Metrics()
model_ob = MLClassifier()
sb_ob = sequentialBootstrap()
an_ob = Analyser()
val_ob = Validation()

Using TensorFlow backend.


## Load Data and Labels

In [4]:
# Raw price data: reuse the pickled frame when it can be read, otherwise load the
# data folder, add a 'Price' column taken from 'Close', and write the pickle.
try:
    MAIN_df = pd.read_pickle(pkl_rawData)
except OSError as cache_err:
    # The cache could not be read: report why, then rebuild it.
    print(cache_err)
    MAIN_df = ld_data.load_data_dir(data_folder_name)
    MAIN_df['Price'] = MAIN_df.loc[:, ['Close']]
    MAIN_df.to_pickle(pkl_rawData)

In [5]:
# Preview the first rows of the raw price frame.
MAIN_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Price
0,2017-11-30 12:29:00.000000,9e-06,9e-06,9e-06,9e-06,1064.0,9e-06
1,2017-11-30 12:30:00.000000,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
2,2017-11-30 12:31:00.000000,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
3,2017-11-30 12:32:00.000000,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
4,2017-11-30 12:33:00.000000,8.9e-05,8.9e-05,8.9e-05,8.9e-05,1311.0,8.9e-05


In [6]:
# Bars: reuse the pickled bars when they can be read, otherwise build them from
# MAIN_df with the configured bar type and threshold, and write the pickle.
try:
    cstk_df = pd.read_pickle(pkl_bars)
except OSError as cache_err:
    # The cache could not be read: report why, then rebuild it.
    print(cache_err)
    cstk_df = cstk_ob.createBars(MAIN_df, bar_type, threshold, 0)
    cstk_df.to_pickle(pkl_bars)


In [7]:
# Preview the first bars.
cstk_df.head()

Unnamed: 0_level_0,DateStop,Open,High,Low,Close,Volume,Price
DateStart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-30 12:29:00,2017-11-30 12:29:00,9e-06,9e-06,9e-06,9e-06,1064.0,9e-06
2017-11-30 12:30:00,2017-11-30 12:30:00,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
2017-11-30 12:31:00,2017-11-30 12:31:00,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
2017-11-30 12:32:00,2017-11-30 12:32:00,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
2017-11-30 12:33:00,2017-11-30 12:33:00,8.9e-05,8.9e-05,8.9e-05,8.9e-05,1311.0,8.9e-05


In [8]:
# Labels: reuse the pickled label frame when it can be read, otherwise generate the
# barrier labels from the bars with the configured parameters and write the pickle.
try:
    full_df = pd.read_pickle(pkl_labels)
except OSError as cache_err:
    # The cache could not be read: report why, then rebuild it.
    print(cache_err)
    full_df = lbl_ob.get_barrier_labels(
        cstk_df, sampling, volatility_threshold, v_bars_duration,
        barrier_conf, min_return, risk, sign_label,
    )
    full_df.to_pickle(pkl_labels)
    

In [9]:
# Partition the labelled rows by label value.
label_values = full_df['label']
positive = full_df.loc[label_values.eq(1.0)]
negative = full_df.loc[label_values.eq(-1.0)]
neutral = full_df.loc[label_values.eq(0.0)]

# Every row whose label is not 0.0; its class counts are printed below.
non_zero_labels = full_df.loc[label_values.ne(0.0)]
label_counts = non_zero_labels['label'].value_counts()
print('Labels: ', label_counts)

Labels:  -1.0    110354
 1.0    109748
Name: label, dtype: int64


In [10]:
# Drop rows with missing values and show the last ten rows.
# Note: non_zero_labels was taken from full_df in the previous cell, before this
# dropna, so it is not filtered by it.
full_df=full_df.dropna()
full_df.tail(10)

Unnamed: 0_level_0,close,return,volatility,type,vbars,ret,label,sharp_ratio,profit
DateStart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-06-07 09:57:00,2.8e-05,0.0,0.00049,tp,2018-06-07 09:58:00,0.000723,1.0,0.0,0.0
2018-06-07 09:58:00,2.8e-05,0.000723,0.00048,sl,2018-06-07 09:59:00,-0.001444,-1.0,1.506721,4.928039e+80
2018-06-07 09:59:00,2.8e-05,-0.001444,0.00085,t1,2018-06-07 10:00:00,0.000362,1.0,-1.69893,0.0
2018-06-07 10:00:00,2.8e-05,0.000362,0.000781,t1,2018-06-07 10:01:00,0.000361,1.0,0.463048,0.0
2018-06-07 10:01:00,2.8e-05,0.000361,0.000715,tp,2018-06-07 10:02:00,0.000723,1.0,0.505503,0.0
2018-06-07 10:02:00,2.8e-05,0.000723,0.000689,t1,2018-06-07 10:03:00,0.0,0.0,1.048636,0.0
2018-06-07 10:03:00,2.8e-05,0.0,0.000631,sl,2018-06-07 10:04:00,-0.000722,-1.0,0.0,4.935166e+80
2018-06-07 10:04:00,2.8e-05,-0.000722,0.000683,sl,2018-06-07 10:05:00,-0.000723,-1.0,-1.057406,4.935166e+80
2018-06-07 10:05:00,2.8e-05,-0.000723,0.00069,tp,2018-06-07 10:06:00,0.001085,1.0,-1.047734,0.0
2018-06-07 10:06:00,2.8e-05,0.001085,0.000789,t1,2018-06-07 10:07:00,-0.000361,-1.0,1.374018,4.940518999999999e+80


# Feature Extraction

In [11]:
# Disabled step, kept for reference: it copied the index into a 'Date' column and
# reset the index of cstk_df.
# cstk_df['Date'] = cstk_df.index
# cstk_df = cstk_df.reset_index()
# Original author's note: the code only works when the first column name is
# DateStart and there is no column named Date.
cstk_df.head()

Unnamed: 0_level_0,DateStop,Open,High,Low,Close,Volume,Price
DateStart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-11-30 12:29:00,2017-11-30 12:29:00,9e-06,9e-06,9e-06,9e-06,1064.0,9e-06
2017-11-30 12:30:00,2017-11-30 12:30:00,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
2017-11-30 12:31:00,2017-11-30 12:31:00,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
2017-11-30 12:32:00,2017-11-30 12:32:00,9e-06,9e-06,9e-06,9e-06,0.0,9e-06
2017-11-30 12:33:00,2017-11-30 12:33:00,8.9e-05,8.9e-05,8.9e-05,8.9e-05,1311.0,8.9e-05


In [12]:
# Base frame: bar close prices, with the DateStart index moved into a 'Date' column.
full_fdf = pd.DataFrame({'Close': cstk_df['Close']})
full_fdf = full_fdf.reset_index()
full_fdf = full_fdf.rename(columns={"DateStart": "Date"})

# feature_list and period_all are paired by position; fail loudly if they drift apart.
if len(feature_list) != len(period_all):
    raise ValueError('feature_list and period_all must have the same length (%d != %d)'
                     % (len(feature_list), len(period_all)))

# Feature name -> extractor taking the look-back period. vpt and emv ignore the
# period (their configured period is the placeholder 0).
feature_extractors = {
    'sma': lambda period: fe_ob.simple_moving_avg(cstk_df, period, dropna=False),
    'ema': lambda period: fe_ob.exp_moving_avg(cstk_df, period, dropna=False),
    'BB': lambda period: fe_ob.bollinger_bands(cstk_df, period, dropna=False),
    'rsi': lambda period: fe_ob.rsi(cstk_df, col='Price', period=period, dropna=False),
    'williamsr': lambda period: fe_ob.willamsr(cstk_df, period=period),
    'roc': lambda period: fe_ob.roc(cstk_df, col_name='Close', period=period, dropna=False),
    'adl': lambda period: fe_ob.ad_oscillaor(cstk_df, period),  # original note: check divisions
    'vpt': lambda period: fe_ob.vpt(cstk_df, dropna=False),
    'emv': lambda period: fe_ob.emv(cstk_df, dropna=False),
}

# An unrecognised name used to fall through every branch and silently reuse the
# previous feature's result; reject it up front instead.
unknown_features = [name for name in feature_list if name not in feature_extractors]
if unknown_features:
    raise ValueError('No extractor defined for feature(s): %s' % unknown_features)

# Collect one single-column frame per (feature, period) and concatenate once at the end.
feature_frames = [full_fdf]
for feature_name, periods in zip(feature_list, period_all):
    extract = feature_extractors[feature_name]
    for period in periods:
        df_temp = extract(period)
        col_name_temp = feature_name + '_' + str(period)
        # For BB only element 0 of the result is used (the upper band, "BB up").
        source = df_temp[0] if feature_name == 'BB' else df_temp
        feature_frames.append(pd.DataFrame({col_name_temp: source['Close']}))

# NOTE(review): concat(axis=1) aligns rows on the index, and full_fdf has a fresh
# integer index after reset_index() -- confirm that the extractors return frames
# indexed the same way, otherwise the rows will not line up.
full_fdf = pd.concat(feature_frames, axis=1)
full_fdf = full_fdf.set_index('Date')

# Adding labels to features: pick the feature rows at the positions where the
# labelled timestamps fall in the feature index, then attach the label column.
label_positions = full_fdf.index.searchsorted(non_zero_labels.index)
# .copy() makes df an independent frame before a new column is written to it.
df = full_fdf.iloc[label_positions].dropna().copy()
df['label'] = non_zero_labels['label']
df.head()

# Saving Later


KeyboardInterrupt: 

In [None]:
# First rows of the labelled feature frame.
df.head()

In [None]:
# Last rows of the labelled feature frame.
df.tail()

In [None]:
# Persist the labelled, not-yet-normalised feature frame.
# NOTE(review): this writes under 'saved_runs/', while the cache paths in the Inputs
# cell use 'save_runs/' -- confirm which directory is intended.
df.to_pickle('saved_runs/unnormalised_features')

In [None]:
# FIXME: the target path is an empty string, so this call is expected to fail as
# written; supply the intended file name for the label frame.
full_df.to_pickle('')

# ML




In [None]:
# Data Preparation: the last column of df is the label, everything before it is a feature.
df = df.dropna()
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# normalize() returns a bare array, so re-attach the original index and column
# names; otherwise X gets a fresh integer index and no longer lines up with y by label.
# NOTE(review): sklearn's normalize() rescales each row (sample) to unit norm by
# default, not each feature column -- confirm that row-wise scaling is intended.
X = pd.DataFrame(normalize(X), index=X.index, columns=X.columns)
train_X, train_y, test_X, test_y = split_ob.train_test_split(X, y, 0.7) # split training-testing data

#change v_bars to events['time']
# indM = sb_ob.getIndMatrix(df.index, v_bars)
# avgU = sb_ob.getAvgUniqueness(indM)
# avgU = avgU.mean()

# clf_RF, accuracy_RF = model_ob.ml_classfr(X, y, 1.0, 'RF', saveModel=True)

# Fit Decision Tree

In [None]:
# Fit one decision tree per depth limit and report train/test accuracy for each.
max_depths = [5, 10, 15]
for i in max_depths:
    clf = tree.DecisionTreeClassifier(max_depth=i).fit(train_X, train_y)
    acc_train, acc_test = clf.score(train_X, train_y), clf.score(test_X, test_y)
    print(i, ' train: ',  acc_train, ' test: ',  acc_test)

# We can clearly see from the result below that the decision tree overfits beyond depth = 10.
# So our goal is to improve the test accuracy without overfitting and without adding more data (for now).
# Let's try an ensemble.

In [None]:
# `clf` is whatever the depth loop above assigned last, i.e. the max_depth = 15 tree.
y_pred=clf.predict(test_X)

In [None]:
# Keep the predictions in a separate copy. Writing the column into test_X itself
# would add an extra "feature" to it, and the later score(test_X, test_y) calls use
# models fitted on train_X, which does not have that column.
test_X_pred = test_X.copy()
test_X_pred['Predicted'] = y_pred
test_X_pred.head()

In [None]:
# Display the test feature frame.
test_X

# Bagging to reduce overfitting

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

# Bagged decision trees at a fixed depth, for an increasing number of estimators.
tree_depth = 10
num_estimator = [10, 20, 50, 100]
avgU = 1.   # used as max_samples: as a float it is the fraction of the training set drawn per estimator

for i in num_estimator:
    base_tree = tree.DecisionTreeClassifier(max_depth=tree_depth)
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2, and the positional form
    # works on both sides of that change.
    clf = BaggingClassifier(base_tree, n_estimators=i, max_samples=avgU, max_features=1.)
    clf_bag = clf.fit(train_X, train_y)
    acc_train_bag = clf_bag.score(train_X, train_y)
    acc_test_bag = clf_bag.score(test_X, test_y)
    print(i, ' train: ',  acc_train_bag, ' test: ',  acc_test_bag)

In [None]:
# We can see above that bagging improves the test accuracy a little and reduces overfitting.
# What should our next step be? Let's see if the learning curve gives us some indication.

In [None]:
# This is just a quick experiment to test what happens if tree_depth > 10. The results show
# that train accuracy is high, but the test accuracy of the previous run is better,
# so tree_depth = 10 is a good choice.
tree_depth = 15
num_estimator = [10, 20, 50, 100]
avgU = 1.   # used as max_samples: as a float it is the fraction of the training set drawn per estimator

for i in num_estimator:
    base_tree = tree.DecisionTreeClassifier(max_depth=tree_depth)
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2, and the positional form
    # works on both sides of that change.
    clf = BaggingClassifier(base_tree, n_estimators=i, max_samples=avgU, max_features=1.)
    clf_bag = clf.fit(train_X, train_y)
    acc_train_bag = clf_bag.score(train_X, train_y)
    acc_test_bag = clf_bag.score(test_X, test_y)
    print(i, ' train: ',  acc_train_bag, ' test: ',  acc_test_bag)

In [None]:
# Predict with the bagging model fitted last in the cell above, on the full X
# (the frame that was split into train_X and test_X), not only on the test part.
y_pred=clf_bag.predict(X)

# Learning Curve

In [None]:
tree_depth = 10
num_etimator = 100 # we found above that this is a good number

clf = tree.DecisionTreeClassifier(max_depth = tree_depth)
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the positional form
# works on both sides of that change.
# avgU is the value left by the bagging cells above.
clf_bag = BaggingClassifier(clf, n_estimators=num_etimator, max_samples=avgU, max_features=1.)
val_ob.learning_curve(clf_bag, X, y, scoring='accuracy', n_splits=5) # Using X and y, not train_X and train_y

In [None]:
# We can see that there is still a bit of a gap between the train and test scores above.
# Let's see if RF (which has more randomness than bagging) helps.

# RF

In [None]:
# Earlier configuration, kept for reference:
# clf = RandomForestClassifier(n_estimators=1,criterion='entropy',bootstrap=False,class_weight='balanced_subsample')

# Random forest at a fixed depth, for an increasing number of trees.
tree_depth = 10
num_estimator = [10, 20, 50, 100, 200, 300]
avgU = 1.

for i in num_estimator:
    # The stand-alone decision tree that used to be built here was overwritten on the
    # very next line and never used, so it has been removed.
    clf = RandomForestClassifier(n_estimators=i, criterion='entropy', max_depth=tree_depth, max_features=None)
    clf_RF = clf.fit(train_X, train_y)
    acc_train_RF = clf_RF.score(train_X, train_y)
    acc_test_RF = clf_RF.score(test_X, test_y)
    print(i, ' train: ',  acc_train_RF, ' test: ',  acc_test_RF)

# The result below shows that test accuracy still can't be improved.
# So bagging is the best so far. Can we use the max_features hyperparameter
# to improve RF further, or maybe use boosting?


In [None]:
## Next questions to consider
# 1) In our case, there is a very small gap between train and test accuracy. We can
# see that in our learning curves. This means that even if we increase the model complexity,
# we may not be able to improve test accuracy. Thus, there is no point in trying fancier
# classifiers.

# Can you prove me wrong? Try other fancier classifiers and see if my conclusion is right. Please show the results
# in our next meeting

## Next Question: Based on my conclusion above, we are only left with adding more data. Let's do that.

### Next question: I didn't use most of the things mentioned in the book. See what happens if we use that 

#### Next Question: Try the same exercise above with different metrics such as profit, Sharpe ratio, etc.

##### Next Question: This is specific to Siba: try the above exercise on the ADABTC data and you will see that
# it overfits very quickly. Can you guess the reason? Hint: it has to do with volatility and the
# profits defined by volatility.

###### I expect all of you to think in such depth and then do these different experiments. This will not only
# help the project but will also help you to learn and develop in-depth understanding about machine learning.


In [None]:
# Bundle the features, the labels and the most recent predictions into one list.
a=[X,y,y_pred]

In [None]:
# Display the features and the labels together.
[X,y]