## Necessary Imports

In [1]:
%matplotlib qt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob
import pickle

from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [2]:
from mlmodel.analysis import Analyser
from mlmodel.cross_validation import Purged_validation, HyperParameterTuning
from mlmodel.validation import Validation

## Object creation

In [3]:
from datagen import DataGen
dg=DataGen()

from visualize.visualize import Visualizer
vl=Visualizer()

from mlmodel.performanceMetrics import Metrics
met_ob = Metrics()

#model selection
from mlmodel.split import Split
from mlmodel.performanceMetrics import Metrics
from mlmodel.mlclassfier import MLClassifier
from mlmodel.sequential_bootstrap import sequentialBootstrap
from mlmodel.analysis import Analyser
from mlmodel.validation import Validation

# For ML
split_ob = Split()
metrics_ob = Metrics()
model_ob = MLClassifier()
sb_ob = sequentialBootstrap()
an_ob = Analyser()
val_ob = Validation()

val_ob = Validation()
hpt_ob = HyperParameterTuning()
pv_ob = Purged_validation()

Using TensorFlow backend.


## Inputs

In [4]:
folder_name='data/historical_price_data/BTCUSDT'
bar_type='time'             #type of bars possible_values: dollar,time,ticks,volume
threshold=300               #threshold for the given type of bar

#normalization
before=True                #flag that denotes normalizing before/after bars creation
normalize=True             #flag that specifies whether normalization should be done
norm_method='multiply'     #method for nomalization include 'multiply','min_max'
norm_val=100               #threshold for the above mentioned method


# Labels
volatility_threshold=20    #threshold in bars for volatility which is standard deviation of returns
sampling=False             #flag to control downsampling
v_bars_duration=20           #threshold in bars for vertical_bars which denotes a dataframe in triple-barrier method
barrier_conf=[2,4]          #stop loss and profit taking limits [0]denotes stop loss and [1]denotes profit taking
min_return=0                #minimum values for return in triple-barrier method
risk=0                      #risk for calculating sharp_ration
sign_label=True             #flag to determine labels of vertical bars t1b

# Features
sma_period = [10, 20] # [10, 15, 20]
ema_period = [10, 20] # [10, 15, 20]
BB_period  = [15]
rsi_period = [15]
williamsr_period = [15]
roc_period = [15]
adl_period = [15]
vpt_period = [0] # 0:  period is not required
emv_period = [0] # 0:  period is not required

feature_list = ['sma',      'ema',    'BB',       'rsi',     'williamsr',        'roc', 
                'adl',     'vpt',   'emv']   #feature list 
period_all =[sma_period, ema_period, BB_period, rsi_period, williamsr_period, roc_period, 
             adl_period, vpt_period, emv_period ]  # feature list period (change this if feature_list_changed)



## Data Generator

In [5]:
raw_data,labels,labels_features,train,test=dg.create_data(folder_name,feature_list,period_all,before,normalize,norm_val,norm_method,bar_type,threshold,sampling,volatility_threshold,v_bars_duration,
                            barrier_conf,min_return,risk,sign_label)

Labels:  -1.0    28143
 1.0    16019
 0.0    12291
Name: label, dtype: int64


In [None]:
#vl.marker_plot(labels)

In [6]:
train_X,train_y=dg.preprocess(train)
test_X,test_y=dg.preprocess(test)

In [7]:
_,sr,_,_=met_ob.sharpe_ratio(labels_features)
_,train_sr,_,_=met_ob.sharpe_ratio(train)
_,test_sr,_,_=met_ob.sharpe_ratio(test)

print("Train Sharpe Ratio :",train_sr)
print("Test Sharpe Ratio :",test_sr)

Train Sharpe Ratio : 0.24374659930821482
Test Sharpe Ratio : 0.2397809183742253


In [8]:
print("Total Sharpe Ratio :",sr)

Total Sharpe Ratio : 0.24196861695468128


In [62]:
def find_defective(temp_df,df):
    
    labels=temp_df.label
    
    pos_labs=labels[df.label==1.0]
    pos_labs[pos_labs==-1.0]=-2.0
    pos_labs[pos_labs==0.0]=10.0
    labels[pos_labs.index]=pos_labs
    
    neg_labs=labels[df.label==-1.0]
    neg_labs[neg_labs==1.0]=2.0
    neg_labs[neg_labs==0.0]=10.0
    labels[neg_labs.index]=neg_labs
    
    zero_labs=labels[df.label==0.0]
    zero_labs[zero_labs==1.0]=2.0
    zero_labs[zero_labs==-1.0]=-2.0
    labels[zero_labs.index]=zero_labs
    
    temp_df['label']=labels
    
    return temp_df

def calc_sharp_ratio(clf,df):
    preds=clf.predict(df)
    temp_df = df.copy(deep=True)
    temp_df['label']=preds
    _,sr,_,_=met_ob.sharpe_ratio(temp_df)
    
    return sr,temp_df

## ML

In [10]:
filename = 'svm_no_balance.sav'
try: 
    clf = joblib.load(filename)
except (OSError, IOError) as e:
    print(e)
    clf = svm.SVC(kernel='rbf')
    clf.fit(train_X, train_y,)
    joblib.dump(clf,filename)
    
acc_train = clf.score(train_X, train_y)
acc_test = clf.score(test_X, test_y)

sr,temp_df=calc_sharp_ratio(clf,test_X)

print( ' train: ',  acc_train, ' test: ',  acc_test, 's_r: ',sr)


 train:  1.0  test:  0.5344634103124446 s_r:  0.004933737960265892


In [11]:
sr,temp_df=calc_sharp_ratio(clf,test_X)
temp_df=find_defective(temp_df,test)
vl.compare_labels(test,temp_df,True)
print("Test Sharp Ratio:",sr)

Test Sharp Ratio: 0.004933737960265892


In [12]:
filename = 'svm_balanced_classes.sav'
try: 
    clf = joblib.load(filename)
except (OSError, IOError) as e:
    
    print(e)
    clf = svm.SVC(kernel='rbf',class_weight='balanced', C=1.0, random_state=0)
    clf.fit(train_X, train_y)
    joblib.dump(clf,filename)
    
acc_train = clf.score(train_X, train_y)
acc_test = clf.score(test_X, test_y)

sr,temp_df=calc_sharp_ratio(clf,test_X)

print( ' train: ',  acc_train, ' test: ',  acc_test, 's_r: ',sr)


 train:  1.0  test:  0.5344634103124446 s_r:  0.004933737960265892


In [13]:
sr,temp_df=calc_sharp_ratio(clf,test_X)
temp_df=find_defective(temp_df,test)
vl.compare_labels(test,temp_df,True)
print("Test Sharp Ratio:",sr)

Test Sharp Ratio: 0.004933737960265892


In [14]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(train_X),columns=train_X.columns)
X_test = pd.DataFrame(scaler.fit_transform(test_X),columns=test_X.columns)

X_train['label']=train.label
train_scaled=X_train.copy()
del X_train['label']

X_test['label']=test.label
test_scaled=X_test.copy()
del X_test['label']

In [15]:
filename = 'svm_no_balance_scaled.sav'
try: 
    clf = joblib.load(filename)
except (OSError, IOError) as e:
    print(e)
    clf = svm.SVC(kernel='rbf')
    clf.fit(X_train, train_y,)
    joblib.dump(clf,filename)
    
acc_train = clf.score(X_train, train_y)
acc_test = clf.score(X_test, test_y)

sr,temp_df=calc_sharp_ratio(clf,X_test)

print( ' train: ',  acc_train, ' test: ',  acc_test, 's_r: ',sr)


 train:  0.6395554880518428  test:  0.6741480125214104 s_r:  0.004816968433623166


In [16]:
sr,temp_df=calc_sharp_ratio(clf,X_test)
temp_df=find_defective(temp_df,test_scaled)
vl.compare_labels(test_scaled,temp_df,True)
print("Test Sharp Ratio:",sr)

Test Sharp Ratio: 0.004816968433623166


In [17]:
filename = 'svm_balanced_classes_scaled.sav'
try: 
    clf = joblib.load(filename)
except (OSError, IOError) as e:
    print(e)
    clf = svm.SVC(kernel='rbf',class_weight='balanced', C=1.0, random_state=0)
    clf.fit(X_train, train_y)
    joblib.dump(clf,filename)
    
acc_train = clf.score(X_train, train_y)
acc_test = clf.score(X_test, test_y)

sr,temp_df=calc_sharp_ratio(clf,X_test)

print( ' train: ',  acc_train, ' test: ',  acc_test, 's_r: ',sr)


 train:  0.6264935196435804  test:  0.5762801960900124 s_r:  0.0037417165950409613


In [18]:
sr,temp_df=calc_sharp_ratio(clf,X_test)
temp_df=find_defective(temp_df,test_scaled)
vl.compare_labels(test_scaled,temp_df,True)
print("Test Sharp Ratio:",sr)

Test Sharp Ratio: 0.0037417165950409613


In [19]:
temp_df.label.value_counts()

-1.0     6144
 10.0    3979
 1.0     2361
 2.0     2267
 0.0     1252
-2.0      928
Name: label, dtype: int64

In [20]:
test_scaled.label.value_counts()

-1.0    9049
 1.0    4755
 0.0    3127
Name: label, dtype: int64

In [72]:
X=pd.concat([X_train,X_test]).reset_index()
del X['index']
y=pd.concat([train_y,test_y]).reset_index()
del y['index']

In [38]:
metric = 'f1_micro'    
an_ob.check_overfitting(clf, X, y, scoring=metric)













The model is unable to overfit


In [39]:
an_frame=X.copy()
an_frame['label']=y
an_frame['label_pred']=clf.predict(X)

In [41]:
an_ob.check_dataset_correlation(an_frame)

In [52]:
clf = RandomForestClassifier(n_estimators=1, bootstrap=False,class_weight='balanced_subsample')
an_ob.check_overfitting(clf, X_train, train_y, scoring=metric)

The model is overfitting the training set


In [73]:
X

Unnamed: 0,close,sma_10,sma_20,ema_10,ema_20,BB_15,rsi_15,williamsr_15,roc_15,adl_15,vpt_0,emv_0
0,-0.803359,-0.798308,-0.802303,-0.799532,-0.801596,-0.799861,1.511174,-1.585620,0.390466,-0.400328,-0.012513,-0.032132
1,-0.803359,-0.798816,-0.801961,-0.800221,-0.801753,-0.801457,1.285101,-1.585620,-0.111914,-0.399700,-0.000730,-0.032885
2,-0.802062,-0.799086,-0.801554,-0.800548,-0.801772,-0.801463,0.225949,-0.353700,-0.084645,-0.398998,-0.000730,-0.032885
3,-0.797883,-0.798834,-0.800938,-0.800056,-0.801391,-0.801270,-0.199004,-1.047608,0.036893,-0.398290,-0.000672,2.217117
4,-0.797883,-0.799140,-0.800323,-0.799653,-0.801045,-0.801144,0.007462,0.306012,0.293585,-0.398985,-0.000730,2.217509
5,-0.794460,-0.798545,-0.799591,-0.798701,-0.800407,-0.800531,0.138540,-0.770012,0.198422,-0.399614,-0.000598,2.369777
6,-0.797181,-0.798582,-0.798939,-0.798417,-0.800088,-0.800412,0.051629,0.279789,0.097861,-0.400158,-0.000774,2.369790
7,-0.794526,-0.798354,-0.798557,-0.797702,-0.799547,-0.799755,0.259620,0.279789,0.326329,-0.399673,-0.000478,2.838289
8,-0.794526,-0.798125,-0.798219,-0.797117,-0.799058,-0.799213,0.441258,-0.366809,-0.018884,-0.399278,-0.000730,2.837570
9,-0.798738,-0.798317,-0.798201,-0.797404,-0.799016,-0.799687,-0.054065,-0.400598,0.065587,-0.398908,-0.000730,2.838759


In [74]:
tree_depth = 10
num_estimator = [10, 20, 50, 100, 200, 300]
avgU = 1.

for i in num_estimator:
    clf = tree.DecisionTreeClassifier(max_depth = tree_depth)
    clf = RandomForestClassifier(n_estimators=i, criterion='entropy', max_depth = tree_depth,  max_features=None) 
    clf_RF = clf.fit(X_train, train_y)
    acc_train_RF = clf_RF.score(X_train, train_y)
    acc_test_RF = clf_RF.score(X_test, test_y)
    
    sr,temp_df=calc_sharp_ratio(clf,X)
    
    print(i, ' train: ',  acc_train_RF, ' test: ',  acc_test_RF,' sharp_ratio:',sr)

      close    sma_10    sma_20    ema_10    ema_20     BB_15    rsi_15  \
0 -0.803359 -0.798308 -0.802303 -0.799532 -0.801596 -0.799861  1.511174   
1 -0.803359 -0.798816 -0.801961 -0.800221 -0.801753 -0.801457  1.285101   
2 -0.802062 -0.799086 -0.801554 -0.800548 -0.801772 -0.801463  0.225949   
3 -0.797883 -0.798834 -0.800938 -0.800056 -0.801391 -0.801270 -0.199004   
4 -0.797883 -0.799140 -0.800323 -0.799653 -0.801045 -0.801144  0.007462   

   williamsr_15    roc_15    adl_15     vpt_0     emv_0  label  
0     -1.585620  0.390466 -0.400328 -0.012513 -0.032132   -1.0  
1     -1.585620 -0.111914 -0.399700 -0.000730 -0.032885    1.0  
2     -0.353700 -0.084645 -0.398998 -0.000730 -0.032885    0.0  
3     -1.047608  0.036893 -0.398290 -0.000672  2.217117    0.0  
4      0.306012  0.293585 -0.398985 -0.000730  2.217509   -1.0  
10  train:  0.7126873228027542  test:  0.6537711889433583  sharp_ratio: 0.0042783956733773885


KeyboardInterrupt: 