## Necessary Imports

In [1]:
%matplotlib qt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob
import pickle

from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier


from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [2]:
from mlmodel.analysis import Analyser
from mlmodel.cross_validation import Purged_validation, HyperParameterTuning
from mlmodel.validation import Validation

## Object creation

In [3]:
# Project-local imports for this cell, each listed once.
# (HyperParameterTuning and Purged_validation are imported in the previous cell.)
from datagen import DataGen
from visualize.visualize import Visualizer
from mlmodel.split import Split
from mlmodel.performanceMetrics import Metrics
from mlmodel.mlclassfier import MLClassifier
from mlmodel.sequential_bootstrap import sequentialBootstrap
from mlmodel.analysis import Analyser
from mlmodel.validation import Validation

# Data generation and visualisation helpers
test = DataGen()
vl = Visualizer()

# Two separate Metrics instances are kept because both names are part of the
# notebook's namespace (met_ob is used by the Sharpe-ratio cells).
met_ob = Metrics()
metrics_ob = Metrics()

# For ML (model selection)
split_ob = Split()
model_ob = MLClassifier()
sb_ob = sequentialBootstrap()
an_ob = Analyser()
val_ob = Validation()   # instantiated once; the repeated assignment was redundant

# Cross-validation helpers
hpt_ob = HyperParameterTuning()
pv_ob = Purged_validation()

Using TensorFlow backend.


## Inputs

In [4]:
folder_name='data/historical_price_data/'
bar_type='time'             #type of bars, possible values: dollar,time,ticks,volume
threshold=300               #threshold for the given type of bar

#normalization
before=True                #flag that denotes normalizing before/after bars creation
# NOTE: assigning this flag rebinds the name `normalize`, hiding the
# sklearn.preprocessing.normalize function imported in the first cell. The name is
# kept because the create_data call passes it under this name.
normalize=True             #flag that specifies whether normalization should be done
norm_method='multiply'     #method for normalization, one of 'multiply','min_max'
norm_val=100               #threshold for the above mentioned method


# Labels
volatility_threshold=20    #threshold in bars for volatility which is standard deviation of returns
sampling=False             #flag to control downsampling
v_bars_duration=20           #threshold in bars for vertical_bars which denotes a dataframe in triple-barrier method
barrier_conf=[2,4]          #stop loss and profit taking limits [0]denotes stop loss and [1]denotes profit taking
min_return=0                #minimum values for return in triple-barrier method
risk=0                      #risk for calculating the Sharpe ratio
sign_label=True             #flag to determine labels of vertical bars t1b

# Features
sma_period = [10, 20] # [10, 15, 20]
ema_period = [10, 20] # [10, 15, 20]
BB_period  = [15]
rsi_period = [15]
williamsr_period = [15]
roc_period = [15]
adl_period = [15]
vpt_period = [0] # 0:  period is not required
emv_period = [0] # 0:  period is not required

# Each feature name is declared next to its periods, so feature_list and period_all
# are always the same length and in the same order. To add or remove a feature,
# edit this single list only.
feature_periods = [
    ('sma', sma_period),
    ('ema', ema_period),
    ('BB', BB_period),
    ('rsi', rsi_period),
    ('williamsr', williamsr_period),
    ('roc', roc_period),
    ('adl', adl_period),
    ('vpt', vpt_period),
    ('emv', emv_period),
]
feature_list = [name for name, periods in feature_periods]    #feature list
period_all = [periods for name, periods in feature_periods]   #periods, same order as feature_list



## Data Generator

In [None]:
# create_data is called with positional arguments only, so their order must not change.
# Its three return values are bound to raw_data, labels and labels_features.
raw_data, labels, labels_features = test.create_data(
    folder_name, feature_list, period_all,
    before, normalize, norm_val, norm_method,
    bar_type, threshold,
    sampling, volatility_threshold, v_bars_duration,
    barrier_conf, min_return, risk, sign_label
)

Labels:  -1.0    29387
 1.0    15168
 0.0     9389
Name: label, dtype: int64
[Errno 2] No such file or directory: 'saved_runs/MAIN_df_BNBBTC'
[Errno 2] No such file or directory: 'saved_runs/MAIN_df_BNBBTC_normalized_main_df_True_True_100_multiply'
[Errno 2] No such file or directory: 'saved_runs/cstk_df_BNBBTC_time_300_True_True_100_multiply'


In [None]:
# Preview the first two rows of raw_data (first object returned by create_data).
raw_data.head(2)

In [None]:
# Preview the first two rows of labels (second object returned by create_data).
labels.head(2)

In [None]:
# Preview the first two rows of labels_features (third object returned by create_data).
labels_features.head(2)

In [None]:
# Plot the labels with the project Visualizer instance (vl).
vl.marker_plot(labels)

In [None]:
# Keep only the rows of labels_features that have no missing values,
# then display the first rows of the cleaned frame.
df = labels_features.dropna()
df.head()

## ML

In [None]:
# The last column of df is taken as the target; every other column is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Split into training and testing data.
# NOTE(review): 0.7 is presumably the training fraction - confirm against Split.train_test_split.
train_X, train_y, test_X, test_y = split_ob.train_test_split(X, y, 0.7)

In [None]:
# RandomForestClassifier is already imported in the first cell.
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble whose base learner is a one-tree random forest with its own
# bootstrapping switched off and class weights balanced per subsample.
# The base learner is passed positionally because its keyword was `base_estimator`
# before scikit-learn 1.2 and `estimator` from 1.2 on; the positional form works
# with both.
rf = BaggingClassifier(
    RandomForestClassifier(
        n_estimators=1,
        bootstrap=False,
        class_weight='balanced_subsample',
    )
)

# Embargo value forwarded to pv_ob.learning_curve.
pctEmbargo = 0.01

In [None]:
# Build the feature matrix from `df`, the NaN-free frame `y` was taken from, without
# its last column (the label). Using `labels_features` directly would feed the target
# column to the model as a feature and keep rows that `y` no longer has.
# `.assign` works on a copy, so neither `df` nor `labels_features` gains a 'Date'
# column. Mutating `labels_features` in place would make a re-run of the X/y cell
# pick 'Date' as the label, because that cell takes the last column.
X = (
    df.iloc[:, :-1]
    .assign(Date=raw_data['DateStop'])   # aligned on the row index, as with X['Date'] = ...
    .set_index('Date')
)

# NOTE(review): t1 keeps raw_data's own index while X is indexed by DateStop, and y
# keeps df's original index - confirm this matches what learning_curve expects.
t1 = pd.Series(raw_data['DateStop'])

In [None]:
# Run the Purged_validation learning curve for the bagged classifier `rf`
# with cv=5 and the embargo value configured in pctEmbargo.
pv_ob.learning_curve(rf, X, y, t1=t1, cv=5, pctEmbargo=pctEmbargo)

# Get predictions of test_X

In [None]:
# Fit the classifier on the training split so that `clf` exists when the notebook is
# run top to bottom on a fresh kernel (scikit-learn's fit returns the fitted estimator).
# NOTE(review): `rf` is the only classifier this notebook defines - confirm it is the
# model these predictions are meant to come from.
clf = rf.fit(train_X, train_y)

# Predicted labels are attached to a deep copy so test_X itself stays unchanged.
y_pred = clf.predict(test_X)
pred_df = test_X.copy(deep=True)
pred_df['label'] = y_pred

In [None]:
# Calculate the Sharpe ratio for the predicted labels.
# pred_df (test features plus predicted 'label') is passed to met_ob.sharpe_ratio
# and then rebound to whatever that call returns.
pred_df=met_ob.sharpe_ratio(pred_df)
pred_df.head()

# Get the true (triple-barrier) labels and respective features of test_X

In [None]:
# New frame holding the test features plus a 'label' column filled from test_y;
# assign returns a copy, so test_X is not modified.
un_pred_df = test_X.assign(label=test_y)

In [None]:
# Calculate the Sharpe ratio for the un-predicted labels (the ones generated by the triple-barrier method).
# un_pred_df (test features plus the test_y 'label') is passed to met_ob.sharpe_ratio
# and then rebound to whatever that call returns.
un_pred_df=met_ob.sharpe_ratio(un_pred_df)
un_pred_df.head()

# Compare sharpe_ratio