In [1]:
import fxcmpy
import pandas as pd
import numpy as np
import datetime as dt

# Allows for printing the whole data frame
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from pyti.accumulation_distribution import accumulation_distribution as ad
from pyti.aroon import aroon_up
from pyti.aroon import aroon_down
from pyti.average_true_range import average_true_range as atr
from pyti.chande_momentum_oscillator import chande_momentum_oscillator as cmo
from pyti.chaikin_money_flow import chaikin_money_flow as cmf 
from pyti.commodity_channel_index import commodity_channel_index as cci
from pyti.exponential_moving_average import exponential_moving_average as ema
from pyti.hull_moving_average import hull_moving_average as hma
from pyti.money_flow_index import money_flow_index as mfi
from pyti.on_balance_volume import on_balance_volume as obv
from pyti.simple_moving_average import simple_moving_average as sma
from pyti.stochastic import percent_k as percent_k
from pyti.stochastic import percent_d as percent_d
from pyti.smoothed_moving_average import smoothed_moving_average as smoothed_ma
from pyti.true_range import true_range as tr
from pyti.ultimate_oscillator import ultimate_oscillator as uo
from pyti.volatility import volatility as volat
from pyti.relative_strength_index import relative_strength_index as rsi
from pyti.williams_percent_r import williams_percent_r as wpr 

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import roc_auc_score 

In [2]:
#set connection
con = fxcmpy.fxcmpy(config_file='fxcm.cfg')

#get candle data 2016-01-01 to 2018-06-19
df = con.get_candles('GBP/JPY', period='D1',start= dt.datetime(2012, 1, 1),end = dt.datetime(2018, 7, 13))

#check connection
con.is_connected()

True

In [3]:
# Accumulation distribution
df['accum_dist'] = ad(df['askclose'], df['askhigh'], df['asklow'], df['tickqty'])

# Averagre true range
df['atr'] = atr(df['askclose'], 10)

# Chande momentum oscillator
df['cmo'] = cmo(df['askclose'], 10)

# chaikin_money_flow 
df['cmf'] = cmf(df['askclose'], df['askhigh'], df['asklow'], df['tickqty'], 10) 

# commodity channel index
df['cci'] = cci(df['askclose'], df['askhigh'], df['asklow'], 10)

# exponential moving average 
fast = 7
slow = 14

# EMA fast and slow calculation
df['ema_fast'] = ema(df['askclose'], fast)
df['ema_slow'] = ema(df['askclose'], slow)

# hull moving average 
df['hma_fast'] = hma(df['askclose'], fast)
df['hma_slow'] = hma(df['askclose'], slow)

# money flow index
df['mfi'] = mfi(df['askclose'], df['askhigh'], df['asklow'], df['tickqty'], 10)

#From Ben
df['obv'] = obv(df['askclose'], df['tickqty'])
df['sma'] = sma(df['askclose'], period = 10)
df['percent_k'] = percent_k(df['askclose'], period = 10)
df['percent_d'] = percent_d(df['askclose'], period = 10)
df['smoothed_ma'] = smoothed_ma(df['askclose'], period = 10)
df['true_range'] = tr(df['askclose'], period = 10)
df['ulti_osc'] = uo(df['askclose'], df['asklow'])
df['volatility'] = volat(df['askclose'], period = 10)
df['rsi'] = rsi(df['askclose'], period = 10)
df['williams'] = df['true_range'] = wpr(df['askclose']) 


In [4]:
#define trend = whether up or down during one day
df['trend'] = np.where(df['askclose'] > df['askopen'], 1, 0)

#create one lag 
df['trend_nextday'] = df['trend'].shift(-1)


###df is our first version of data###

In [5]:
df.describe()

#now df still has NA values, and random forest can't handle NA, so we need to remove them before running the model

Unnamed: 0,bidopen,bidclose,bidhigh,bidlow,askopen,askclose,askhigh,asklow,tickqty,accum_dist,atr,cmo,cmf,cci,ema_fast,ema_slow,hma_fast,hma_slow,mfi,obv,sma,percent_k,percent_d,smoothed_ma,true_range,ulti_osc,volatility,rsi,williams,trend,trend_nextday
count,1965.0,1965.0,1965.0,1965.0,1965.0,1965.0,1965.0,1965.0,1965.0,1965.0,1956.0,1956.0,1956.0,1956.0,1959.0,1952.0,1958.0,1950.0,1955.0,1965.0,1956.0,1956.0,1954.0,1965.0,1965.0,1911.0,1956.0,1955.0,1965.0,1965.0,1964.0
mean,155.331858,155.345825,156.038085,154.630928,155.448681,155.462672,156.092252,154.685821,228161.1,21043520.0,3.600191,3.892654,0.123472,0.248534,155.546331,155.645822,155.596051,155.739771,52.040159,855293.8,155.567137,0.534295,0.534292,155.34082,-51.532122,25.46096,1.13907,51.803897,-51.532122,0.509415,0.509674
std,19.86427,19.848747,19.865196,19.851422,19.871745,19.856199,19.867342,19.853428,171349.7,15210530.0,1.656653,44.917702,0.211728,7.055926,19.787216,19.70559,19.792252,19.703087,21.46954,3705510.0,19.772057,0.395038,0.360307,19.851893,25.514891,8.445319,0.717101,15.452982,25.514891,0.500039,0.500034
min,117.597,117.597,117.809,117.275,117.744,117.744,118.006,117.295,1.0,-20983.25,1.25606,-100.0,-0.481817,-51.023537,117.977806,118.582974,117.513889,119.269647,0.0,-9046245.0,118.4039,0.0,0.0,118.462088,-100.0,8.126196,0.110156,5.530313,-100.0,0.0,0.0
25%,141.054,141.063,141.811,140.359,141.196,141.206,141.889,140.41,80935.0,5789937.0,2.537838,-30.245978,-0.023687,-3.508715,141.257991,141.43906,141.526738,141.576148,36.076281,-1785387.0,141.379425,0.108165,0.168075,141.613936,-69.851713,20.196637,0.662592,40.450925,-69.851713,0.0,0.0
50%,151.577,151.577,152.383,150.94,151.731,151.731,152.425,151.006,223522.0,19238350.0,3.283759,5.47687,0.127678,0.410288,151.682712,151.62932,151.959494,152.039771,51.985056,1463822.0,151.5911,0.565251,0.575698,151.223842,-56.32726,24.576113,0.983286,51.79483,-56.32726,1.0,1.0
75%,172.171,172.171,172.662,171.71,172.288,172.288,172.724,171.749,351193.0,34194000.0,4.228486,37.708555,0.266898,4.388383,172.154686,172.187607,172.515363,172.52337,67.62437,3447399.0,172.19225,0.96399,0.892478,172.073388,-29.91185,29.50837,1.423083,62.903415,-29.91185,1.0,1.0
max,195.361,195.361,195.874,194.779,195.566,195.566,195.9,194.902,1056630.0,48364960.0,16.606854,100.0,0.708843,25.23167,195.082584,194.622301,196.074873,196.049887,100.0,10119080.0,194.9152,1.0,1.0,193.907515,-0.0,96.658905,7.768421,89.177244,-0.0,1.0,1.0


In [29]:
### Define Features to Train Model on ###
features = list(df.loc[:,['askopen','accum_dist','atr', 'cmo', 'cmf', 'cci', 'ema_fast', 'ema_slow',
                          'hma_fast', 'hma_slow', 'mfi', 'obv', 'sma', 'percent_k', 'percent_d',
                          'smoothed_ma', 'true_range', 'ulti_osc', 'volatility', 'rsi', 'williams']])
features

['askopen',
 'accum_dist',
 'atr',
 'cmo',
 'cmf',
 'cci',
 'ema_fast',
 'ema_slow',
 'hma_fast',
 'hma_slow',
 'mfi',
 'obv',
 'sma',
 'percent_k',
 'percent_d',
 'smoothed_ma',
 'true_range',
 'ulti_osc',
 'volatility',
 'rsi',
 'williams']

In [30]:
df.to_csv('MLmodel2.csv')

In [31]:
df = pd.read_csv('MLmodel2.csv')

In [32]:
df.shape

(1965, 33)

In [33]:
#Split into training and testing data

train = df[(pd.to_datetime(df.date) < pd.to_datetime('2017-01-01')) 
           & (pd.to_datetime(df.date) >= pd.to_datetime('2012-01-01'))]

test = df[pd.to_datetime(df.date) >= pd.to_datetime('2017-01-01')]

X_train = train.loc[:,features]
y_train = train.loc[:,'trend_nextday']
X_test = test.loc[:,features]
y_test = test.loc[:,'trend_nextday']


In [34]:
y_test = y_test.fillna(0)

In [35]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [36]:
from datetime import datetime as dt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, log_loss, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [37]:
### Remove null values and replace with the median of each column ###
transform = Imputer(missing_values='NaN',strategy='median')

### Find median of each column in training set and replace null values ###
X_train_features = transform.fit_transform(X_train)

X_train = pd.DataFrame(X_train_features, index =X_train.index, columns = X_train.columns)

### Apply median from training set to null values of test set ###
X_test_features = transform.transform(X_test)

X_test = pd.DataFrame(X_test_features, index = X_test.index, columns = X_test.columns)

In [38]:
X_train.shape

(1498, 21)

## Random Forest

In [39]:
### Train and fit classifier to Training Data ###
scores = {}

for ne in range(100, 600, 100):
    for md in range(3, 11):
        for mln in range(4, 100, 4):
            clf = RandomForestClassifier(n_estimators=ne, max_depth=md, max_leaf_nodes=mln, n_jobs=-1, verbose=0)

            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            y_pred.sum()

            metric = roc_auc_score(y_test, y_pred)
                
            
            scores[str(ne)+'_'+str(md)+'_'+str(mln)] = metric
            print(metric)

            ## binary classification 
            ## market data is really difficult to make these predictions, almost never get better than 75% accuracy. In Dr.Trevor Trinkino's prediction, he got accuracy around 50% as well.
            ## Curve: look at percetage of true positives and false negatives
            
            ## another way is to look at correct proportion of prediction 
            
            # there are other ways to search for hyperparameter more efficiently using a genetic algorithm or other similar techniques 
            


0.4902057884890503
0.48601481970580684
0.4796229045156084
0.49468104618319214
0.47534940024210415
0.47751366420894314
0.475395253292249
0.46895748505190565
0.4817504860423315
0.4667290268148637
0.4838780675690546
0.4799622170866806
0.4711767726789186
0.4734694251861634
0.4901415942188475
0.48180550970250546
0.4648949048090679
0.4817963390924765
0.4796870987858113
0.481979751293056
0.5009537434430139
0.46912255603242725
0.48208062800337476
0.48382304390888076
0.47734859322842155
0.47549613000256774
0.49521294156487294
0.4797237812259271
0.4881974248927038
0.4755511536627416
0.47094750742819413
0.45827372436814495
0.4711767726789186
0.47115843145886055
0.46486739297898094
0.48196141007299803
0.47964124573566636
0.48389640878911266
0.4776970764095228
0.47124096694912143
0.46473900443857524
0.47341440152598946
0.47318513627526504
0.4886284435640658
0.47566120098308945
0.49282858295733833
0.48817908367264584
0.4817871684824474
0.47294670041451153
0.485950625435604
0.4778071237298705
0.47569

0.48401562671948944
0.48182385092256336
0.479778804886101
0.4798888522064488
0.46271229962217086
0.46262976413191004
0.48407982098969227
0.4605205238252448
0.481970580683027
0.47344191335607644
0.48832581343310955
0.46661897949451603
0.4775411760390301
0.48415318586992406
0.4799622170866806
0.47125013755915046
0.4733777190858736
0.47559700671288646
0.4755328124426837
0.4711767726789186
0.4733502072557867
0.47125013755915046
0.47358864311654014
0.47767873518946474
0.47997138769670955
0.4732493305454679
0.4669674626756172
0.4755328124426837
0.48191555702285316
0.4584296247386377
0.4670408275558491
0.4778988298301603
0.4649315872491839
0.4690308499321375
0.48622574373647337
0.4666464913246029
0.4668023916950956
0.48610652580609665
0.47557866549282857
0.4755878361028576
0.45840211290855065
0.48630827922673414
0.4755878361028576
0.46477568687869114
0.4884817138036022
0.481979751293056
0.4605938887054767
0.48193389824291116
0.4691408972524852
0.47344191335607644
0.48843586075345735
0.4755328

0.4691867503026301
0.4842724038003008
0.4690400205421665
0.46486739297898094
0.4648949048090679
0.4733593778658156
0.47560617732291555
0.4669858038956751
0.47338688969590254
0.4819247276328822
0.4798613403763619
0.469241773962804
0.47343274274604746
0.47554198305271267
0.47356113128645316
0.4776970764095228
0.4948552877737427
0.4841348446498661
0.47338688969590254
0.4776237115292909
0.46661897949451603
0.477559517259088
0.4668940977953853
0.47549613000256774
0.4799071934265068
0.4691133854223983
0.47346025457613433
0.46498661090935767
0.48212648105351963
0.4671141924360808
0.47135101426946924
0.4799438758666227
0.4886926378342687
0.47343274274604746
0.4820072631231429
0.4884175195333993
0.4799438758666227
0.48411650342980816
0.4905542716701515
0.4798154873262169
0.47343274274604746
0.47997138769670955
0.4798338285462749
0.4627581526723158
0.47099336047833906
0.4839147500091706
0.4690767029822823
0.4756153479329445
0.4712226257290635
0.47551447122262575
0.47351527823630835
0.47992553464

In [None]:
clf.feature_importances_

In [None]:
### Feature Importances ###
importances = sorted(list(zip(clf.feature_importances_, new_features)),reverse=True)

# while len(importances) > 50:
importances = importances[:int(len(importances)*0.9)]
new_features = []
for i, f in importances:
    new_features.append(f)

In [None]:
trades = test.assign(pred = y_pred)
trades = trades[trades.pred > 0.5]

In [None]:
trades

## Logistic

In [40]:
#step 1 import model you want to use
from sklearn.linear_model import LogisticRegression

In [41]:
logisticRegr = LogisticRegression()

In [45]:

logisticRegr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
predictions = logisticRegr.predict(X_test)

In [48]:
# Use score method to get accuracy of model# Use sc 
score = logisticRegr.score(X_test, y_test)
print(score)

0.4989293361884368
