## Necessary Imports

In [1]:
%matplotlib qt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob
import pickle

from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier



from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from plotly import tools
# Switch plotly into offline notebook mode for this kernel session.
# NOTE(review): connected=True presumably loads plotly.js from the network
# instead of embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)


## Object creation

In [2]:
# Project-local classes, imported together up front (same order as before).
from datagen import DataGen
from visualize.visualize import Visualizer
from mlmodel.performanceMetrics import Metrics
from mlmodel.split import Split
from mlmodel.mlclassfier import MLClassifier
from mlmodel.sequential_bootstrap import sequentialBootstrap
from mlmodel.analysis import Analyser
from mlmodel.validation import Validation

# Data generation, visualisation and metrics helpers.
dg = DataGen()
vl = Visualizer()
met_ob = Metrics()

# Helpers for the ML / model-selection part of the notebook.
split_ob = Split()
metrics_ob = Metrics()  # a second Metrics instance, separate from met_ob
model_ob = MLClassifier()
sb_ob = sequentialBootstrap()
an_ob = Analyser()
val_ob = Validation()

Using TensorFlow backend.


## Inputs

## Data Generator

In [3]:
## Necessary Imports

# %matplotlib qt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob
import pickle

from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier


from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Switch plotly into offline notebook mode for this kernel session.
# NOTE(review): connected=True presumably loads plotly.js from the network
# instead of embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)


## Object creation

# NOTE: this repeats the object creation already done in cell In [2]; every
# helper below is re-imported and re-instantiated here.
from datagen import DataGen
from visualize.visualize import Visualizer
from mlmodel.performanceMetrics import Metrics
from mlmodel.split import Split
from mlmodel.mlclassfier import MLClassifier
from mlmodel.sequential_bootstrap import sequentialBootstrap
from mlmodel.analysis import Analyser
from mlmodel.validation import Validation

# Data generation, visualisation and metrics helpers.
dg = DataGen()
vl = Visualizer()
met_ob = Metrics()

# Helpers for the ML / model-selection part of the notebook.
split_ob = Split()
metrics_ob = Metrics()  # a second Metrics instance, separate from met_ob
model_ob = MLClassifier()
sb_ob = sequentialBootstrap()
an_ob = Analyser()
val_ob = Validation()

## Inputs

# ---- Data source and bar construction ----
folder_name = 'data/historical_price_data/BTCUSDT'
bar_type = 'time'    # type of bars; possible values: dollar, time, ticks, volume
threshold = 300      # threshold for the chosen bar type

# ---- Normalization ----
before = True             # flag: normalize before/after bar creation
# NOTE(review): the next assignment rebinds `normalize`, hiding the
# sklearn.preprocessing.normalize function imported at the top of this cell.
normalize = True          # flag: whether normalization should be done at all
norm_method = 'multiply'  # normalization method: 'multiply' or 'min_max'
norm_val = 100            # threshold used by the method above

# ---- Split ----
split = 0.7  # split ratio (not among the arguments passed to dg.create_data below)

# ---- Labels ----
volatility_threshold = 20  # threshold in bars for volatility (standard deviation of returns)
sampling = False           # flag to control downsampling
v_bars_duration = 20       # threshold in bars for the vertical bars of the triple-barrier method
barrier_conf = [2, 4]      # [0] is the stop-loss limit, [1] is the profit-taking limit
min_return = 0             # minimum return value in the triple-barrier method
risk = 0                   # risk used when calculating the Sharpe ratio
sign_label = True          # flag that determines the labels of vertical bars (t1b)

# ---- Features ----
# An empty period list means no periods are configured for that indicator.
sma_period = [15, 20, 30]  # alternative: [10, 15, 20]
ema_period = [15, 20, 30]  # alternative: [10, 15, 20]
BB_period = []
rsi_period = []
williamsr_period = []
roc_period = []
adl_period = []
vpt_period = []  # 0: period is not required
emv_period = []  # 0: period is not required

# Each feature name is paired with its period list in one place, so the two
# parallel lists handed to the data generator cannot drift out of step.
feature_periods = [
    ('sma', sma_period),
    ('ema', ema_period),
    ('BB', BB_period),
    ('rsi', rsi_period),
    ('williamsr', williamsr_period),
    ('roc', roc_period),
    ('adl', adl_period),
    ('vpt', vpt_period),
    ('emv', emv_period),
]
feature_list = [feature_name for feature_name, _ in feature_periods]
period_all = [periods for _, periods in feature_periods]


## Data Generator

# All arguments are positional; their order must match dg.create_data's signature.
raw_data, labels, labels_features, train, test = dg.create_data(
    folder_name,
    feature_list,
    period_all,
    before,
    normalize,
    norm_val,
    norm_method,
    bar_type,
    threshold,
    sampling,
    volatility_threshold,
    v_bars_duration,
    barrier_conf,
    min_return,
    risk,
    sign_label,
)

labels_features.head()

Labels:  -1.0    28143
 1.0    16019
 0.0    12291
Name: label, dtype: int64


Unnamed: 0,close,sma_15,sma_20,sma_30,ema_15,ema_20,ema_30,label,date
0,50.212924,50.256718,50.248472,50.105607,50.239862,50.200654,50.123474,-1.0,2017-08-17 06:25:00
1,50.315539,50.256586,50.2521,50.128261,50.249321,50.211596,50.135865,-1.0,2017-08-17 06:30:00
2,50.61395,50.276349,50.273427,50.160863,50.2949,50.249915,50.166709,-1.0,2017-08-17 06:35:00
3,50.61395,50.296111,50.297386,50.193464,50.334781,50.284585,50.195564,-1.0,2017-08-17 06:40:00
4,50.435392,50.303969,50.298219,50.220114,50.347357,50.298948,50.211036,-1.0,2017-08-17 06:45:00


In [4]:
ml_normalize = True

# dg.preprocess returns a (features, labels) pair for each partition. It is
# called once here because both branches below need the same raw features.
train_X_unNom, train_y = dg.preprocess(train)
test_X_unNom, test_y = dg.preprocess(test)

if ml_normalize:
    # Standardise with statistics fitted on the TRAINING features only.
    # Taking the mean/std over the full labels_features frame would let
    # test-period rows influence the scaling applied to the training data.
    mean_df = train_X_unNom.mean()
    std_df = train_X_unNom.std()

    train_X = pd.DataFrame((train_X_unNom - mean_df) / std_df)
    test_X = pd.DataFrame((test_X_unNom - mean_df) / std_df)
else:
    # Un-normalized train_X, test_X
    train_X, test_X = train_X_unNom, test_X_unNom

# Defined after the branch so both names exist whichever way ml_normalize is
# set; they hold the normalized or raw 'close' column accordingly.
train_cl, test_cl = train_X['close'], test_X['close']

# ML

In [5]:
# Check Correlation of Labels and all features 
# Dropping Date column from labels_features
# an_ob.check_dataset_correlation(labels_features)

In [6]:
# Class balancing is off: neither estimator is given any class weighting.
# A fixed seed makes the fitted ensemble repeatable across kernel restarts.
SEED = 42
clf = tree.DecisionTreeClassifier(max_depth=10, random_state=SEED)

# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both.
bag = BaggingClassifier(clf, n_estimators=30, random_state=SEED)

# Prints the all-features score followed by per-feature scores.
# NOTE(review): the structure of the returned sfi_metric_df is defined in
# mlmodel.validation -- confirm there before relying on its columns.
sfi_metric_df = val_ob.SFI_custom(
    bag, train_X, train_y, test_X, test_y, drop_close=True, sfi_flag=True
)

Train Labels and Count
-1.0    19082
 1.0    11256
 0.0     9160
Name: label, dtype: int64
Test Labels and Count
-1.0    9046
 1.0    4755
 0.0    3127
Name: label, dtype: int64
*** All features Fit and predict score ***
std =  2.2928245528680717
Index(['sma_15', 'sma_20', 'sma_30', 'ema_15', 'ema_20', 'ema_30'], dtype='object')
train: 0.5827130487619626  test: 0.4844045368620038  sr: 0.010833470699440429
*** Single Feature Importance ***
std =  2.292877179229324
feature: sma_15  train: 0.5400526608942225  test: 0.4792060491493384  sr: 0.009193966157815705
-1.0    13133
 1.0     3448
 0.0      347
dtype: int64
std =  2.292855969398092
feature: sma_20  train: 0.5407362398096106  test: 0.4842273156899811  sr: 0.011324173217611568
-1.0    13964
 1.0     2665
 0.0      299
dtype: int64
std =  2.292467128915585
feature: sma_30  train: 0.5408628284976454  test: 0.48617674858223064  sr: 0.010873633925111766
-1.0    13826
 1.0     2902
 0.0      200
dtype: int64
std =  2.2927784690409223
featu