## Machine learning Model of Price Trend Prediction in Forex Market

In [1]:
import fxcmpy
import pandas as pd
import numpy as np
import datetime as dt

# Allows for printing the whole data frame
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from pyti.accumulation_distribution import accumulation_distribution as ad
from pyti.aroon import aroon_up
from pyti.aroon import aroon_down
from pyti.average_true_range import average_true_range as atr
from pyti.chande_momentum_oscillator import chande_momentum_oscillator as cmo
from pyti.chaikin_money_flow import chaikin_money_flow as cmf 
from pyti.commodity_channel_index import commodity_channel_index as cci
from pyti.exponential_moving_average import exponential_moving_average as ema
from pyti.hull_moving_average import hull_moving_average as hma
from pyti.money_flow_index import money_flow_index as mfi
from pyti.on_balance_volume import on_balance_volume as obv
from pyti.simple_moving_average import simple_moving_average as sma
from pyti.stochastic import percent_k as percent_k
from pyti.stochastic import percent_d as percent_d
from pyti.smoothed_moving_average import smoothed_moving_average as smoothed_ma
from pyti.true_range import true_range as tr
from pyti.ultimate_oscillator import ultimate_oscillator as uo
from pyti.volatility import volatility as volat
from pyti.relative_strength_index import relative_strength_index as rsi
from pyti.williams_percent_r import williams_percent_r as wpr 

## Data Retrieval

GBP/JPY Price data from 2012 to 2018 is retrieved through FXCM's Restful API.

In [2]:
#set connection
con = fxcmpy.fxcmpy(config_file='fxcm.cfg')

#get candle data 2016-01-01 to 2018-06-19
df = con.get_candles('GBP/JPY', period='D1',start= dt.datetime(2012, 1, 1),end = dt.datetime(2018, 7, 23))

#check connection
con.is_connected()

True

## Feature Engineering

Gathering more data and feature engineering usually have the greatest payoff for improving model performance. In our model, we only gathered price data and generated features of technical indicators based on price. However, in more complexed models, trades data, economic indicators, sentiment data and cross market data, such as S&P 500, could all be incorporated. 

In [3]:
# Accumulation distribution
df['accum_dist'] = ad(df['askclose'], df['askhigh'], df['asklow'], df['tickqty'])

# Averagre true range
df['atr'] = atr(df['askclose'], 10)

# Chande momentum oscillator
df['cmo'] = cmo(df['askclose'], 10)

# chaikin_money_flow 
df['cmf'] = cmf(df['askclose'], df['askhigh'], df['asklow'], df['tickqty'], 10) 

# commodity channel index
df['cci'] = cci(df['askclose'], df['askhigh'], df['asklow'], 10)

# exponential moving average 
fast = 7
slow = 14
df['ema_fast'] = ema(df['askclose'], fast)
df['ema_slow'] = ema(df['askclose'], slow)

# hull moving average 
df['hma_fast'] = hma(df['askclose'], fast)
df['hma_slow'] = hma(df['askclose'], slow)

# money flow index
df['mfi'] = mfi(df['askclose'], df['askhigh'], df['asklow'], df['tickqty'], 10)

# on balance volume
df['obv'] = obv(df['askclose'], df['tickqty'])

# simple moving average
df['sma'] = sma(df['askclose'], period = 10)

# percent k
df['percent_k'] = percent_k(df['askclose'], period = 10)

# percent d
df['percent_d'] = percent_d(df['askclose'], period = 10)

# smoothed moving average
df['smoothed_ma'] = smoothed_ma(df['askclose'], period = 10)

# true range
df['true_range'] = tr(df['askclose'], period = 10)

# ultimate oscillator
df['ulti_osc'] = uo(df['askclose'], df['asklow'])

# volatility
df['volatility'] = volat(df['askclose'], period = 10)

# relative strength index
df['rsi'] = rsi(df['askclose'], period = 10)

# williams percent
df['williams'] = df['true_range'] = wpr(df['askclose']) 


In [4]:
##############################LOGIC PROBLEM: predict whether price close after 10 days > price close today? 

#define trend = whether up or down in ten days from now, if price(t) < price(t+10) , go up in 10 days, = 1, else = 0
df['trend'] = np.where(df['askclose'] < df['askclose'].shift(-10), 1, 0)

###df is our first version of data###

In [5]:
df.head(15)
#now df still has NA values, and random forest can't handle NA, so we need to remove them before running the model

Unnamed: 0_level_0,bidopen,bidclose,bidhigh,bidlow,askopen,askclose,askhigh,asklow,tickqty,accum_dist,atr,cmo,cmf,cci,ema_fast,ema_slow,hma_fast,hma_slow,mfi,obv,sma,percent_k,percent_d,smoothed_ma,true_range,ulti_osc,volatility,rsi,williams,trend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
2012-01-02 22:00:00,119.526,119.242,119.527,118.955,119.618,119.318,119.622,119.025,12277,0.0,,,,,,,,,,1.0,,,,119.318,-97.977436,,,,-97.977436,0
2012-01-03 22:00:00,119.242,120.033,120.161,119.154,119.318,120.091,120.194,119.233,52708,41409.510926,,,,,,,,,,52709.0,,,,119.724842,-96.984143,,,,-96.984143,0
2012-01-04 22:00:00,120.033,119.799,120.196,119.599,120.091,119.849,120.251,119.622,52090,26917.062595,,,,,,,,,,619.0,,,,119.770657,-97.295109,,,,-97.295109,0
2012-01-05 22:00:00,119.799,119.451,119.949,119.169,119.849,119.505,119.977,119.196,63691,13624.318677,,,,,,,,,,-63072.0,,,,119.693409,-97.737144,,,,-97.737144,0
2012-01-06 22:00:00,119.451,118.714,119.734,118.54,119.505,118.78,119.763,118.566,53869,-20983.25108,,,,,,,,,,-116941.0,,,,119.470359,-98.668757,,,,-98.668757,1
2012-01-09 22:00:00,118.714,118.702,118.866,118.254,118.78,118.903,118.903,118.289,48787,27803.74892,,,,,,,,,,-68154.0,,,,119.349273,-98.510704,,,,-98.510704,1
2012-01-10 22:00:00,118.702,118.961,119.063,118.697,118.903,119.023,119.088,118.744,53127,60853.684966,,,,,119.179937,,,,,-15027.0,,,,119.286733,-98.356506,,,,-98.356506,1
2012-01-11 22:00:00,118.961,117.789,119.109,117.711,119.023,117.841,119.133,117.739,56479,12639.904478,,,,,118.788321,,118.231548,,,-71506.0,,,,119.032888,-99.875357,,,,-99.875357,1
2012-01-12 22:00:00,117.789,117.662,118.108,117.512,117.841,117.75,118.131,117.535,62647,-4808.756595,,,,,118.438585,,117.676476,,,-134153.0,,,,118.823464,-99.99229,,,,-99.99229,1
2012-01-13 22:00:00,117.662,117.809,118.186,117.275,117.75,118.006,118.211,117.295,68173,32850.127684,2.341,-34.024896,0.062277,-4.298907,118.259462,,117.513889,,,-65980.0,118.9066,0.109355,,118.697955,-99.663334,,1.211476,,-99.663334,1


In [6]:
# Define Features to Train Model on 
features = list(df.loc[:,'askopen':'williams'])
features

['askopen',
 'askclose',
 'askhigh',
 'asklow',
 'tickqty',
 'accum_dist',
 'atr',
 'cmo',
 'cmf',
 'cci',
 'ema_fast',
 'ema_slow',
 'hma_fast',
 'hma_slow',
 'mfi',
 'obv',
 'sma',
 'percent_k',
 'percent_d',
 'smoothed_ma',
 'true_range',
 'ulti_osc',
 'volatility',
 'rsi',
 'williams']

In [7]:
#save to csv for convenience in future use
df.to_csv('MLmodel2.csv')

In [8]:
df = pd.read_csv('MLmodel2.csv')

In [9]:
df.shape

(1978, 31)

## Train and Test data Preparation

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, log_loss, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

In [11]:
#Split into training and testing data

train = df[(pd.to_datetime(df.date) < pd.to_datetime('2017-01-01')) 
           & (pd.to_datetime(df.date) >= pd.to_datetime('2012-01-01'))]

test = df[pd.to_datetime(df.date) >= pd.to_datetime('2017-01-01')]

X_train = train.loc[:,features]
y_train = train.loc[:,'trend']
X_test = test.loc[:,features]
y_test = test.loc[:,'trend']

In [12]:
#fill NA in y with 0
y_test = y_test.fillna(0)

In [13]:
#change float number to integer for classification purpose  
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [14]:
### Remove null values and replace with the median of each column ###
transform = Imputer(missing_values='NaN',strategy='median')

### Find median of each column in training set and replace null values ###
X_train_features = transform.fit_transform(X_train)

X_train = pd.DataFrame(X_train_features, index =X_train.index, columns = X_train.columns)

### Apply median from training set to null values of test set ###
X_test_features = transform.transform(X_test)

X_test = pd.DataFrame(X_test_features, index = X_test.index, columns = X_test.columns)

In [15]:
X_train.shape

(1498, 25)

In [16]:
X_test.shape

(480, 25)

## Logistic
The simplest example of logistic model used for machine learning

In [17]:
#step 1 import model you want to use
from sklearn.linear_model import LogisticRegression

In [18]:
#Step 2 Make an instance of the Model
logisticRegr = LogisticRegression()

In [19]:
# Step 3. Training the model on the data, storing the information learned from the data
logisticRegr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Step 4. Predict labels for new data
predictions = logisticRegr.predict(X_test)

In [21]:
# Use score method to get accuracy of model 
score = logisticRegr.score(X_test, y_test)
print(score)

0.522916666667


## Random Forest

Another commonly used machine learning model with hyperparameter tuning for price trend prediction

Hyperparameter tuning process is to optimize the random forest model by trying many different combinations evaluate the performance of each model using Scikit-Learn tools

In [22]:
# Train and fit classifier to Training Data
scores = {}
#Initialize parameter tracking
highest_score = 0
best_ne  = 0
best_md = 0
best_mln = 0
for ne in range(100, 600, 100):
    for md in range(3, 11):
        for mln in range(4, 100, 4):
            clf = RandomForestClassifier(n_estimators=ne, max_depth=md, max_leaf_nodes=mln, n_jobs=-1, verbose=0)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_pred.sum()
            metric = roc_auc_score(y_test, y_pred)
            
            #Set values for best parameters if new high auc_score
            if(metric > highest_score):
                highest_score = metric
                best_ne = ne
                best_md = md
                best_mln = mln
            
            scores[str(ne)+'_'+str(md)+'_'+str(mln)] = metric
            print(metric)

            ## binary classification 
            ## market data is really difficult to make these predictions, almost never get better than 75% accuracy. In Dr.Trevor Trinkino's prediction, he got accuracy around 50% as well.
            ## Curve: look at percetage of true positives and false negatives            
            ## another way is to look at correct proportion of prediction             
            ## there are other ways to search for hyperparameter more efficiently using a genetic algorithm or other similar techniques 


0.502948903078
0.491492545103
0.49091842238
0.494980775588
0.499538962056
0.491188086084
0.491605629882
0.497355555942
0.500956871205
0.503444736338
0.497164181701
0.497164181701
0.493371492197
0.49727726648
0.495172149829
0.501530993928
0.495746272552
0.494980775588
0.497546930183
0.488926390508
0.507507089546
0.498808260408
0.473564258251
0.496737939073
0.494980775588
0.494980775588
0.491379460325
0.490344299657
0.511717322848
0.517197585205
0.504210233303
0.489387428452
0.50191374241
0.491379460325
0.522982306582
0.525739835418
0.48900467997
0.507046051601
0.493180117956
0.48900467997
0.494980775588
0.48900467997
0.504862645488
0.493180117956
0.500495833261
0.496894517998
0.510264618382
0.490996711843
0.495172149829
0.499347587815
0.50191374241
0.489500513231
0.511030115346
0.501835452948
0.487204022339
0.494980775588
0.499156213574
0.498312427147
0.526313958141
0.495554898311
0.505323683432
0.521103359488
0.490996711843
0.502679239374
0.49536352407
0.514631430609
0.512448024496
0.4

0.517006210964
0.500113084779
0.494980775588
0.48900467997
0.509690495659
0.495554898311
0.500495833261
0.504671271247
0.497929678665
0.511412863829
0.508463960751
0.497738304424
0.510455992623
0.495172149829
0.515205553333
0.512639398737
0.515014179091
0.534664834113
0.525548461177
0.525165712695
0.497355555942
0.516623462482
0.521373023191
0.530489396127
0.528114615773
0.492988743715
0.499538962056
0.497546930183
0.500495833261
0.501722368169
0.499730336297
0.499269298352
0.495554898311
0.50191374241
0.50191374241
0.493371492197
0.498503801388
0.494328363402
0.489578802693
0.497738304424
0.497738304424
0.499538962056
0.489387428452
0.507237425842
0.500417543799
0.503905774283
0.497851389203
0.506280554637
0.491379460325
0.49536352407
0.494980775588
0.495172149829
0.494980775588
0.498121052906
0.515205553333
0.491379460325
0.511221489588
0.503714400042
0.505054019729
0.507046051601
0.509229457715
0.502296490892
0.487778145062
0.496129021034
0.50030445902
0.491570834566
0.497546930183


In [30]:
#Show the best parameters 
print ("highest_score:" , highest_score, " best_ne:", best_ne, "best_md:", best_md, " best_mln:", best_mln)

highest_score: 0.541597800936  best_ne: 100 best_md: 8  best_mln: 84


In [31]:

# A close look at feature importances for our previous feature engineering 
clf.feature_importances_

array([ 0.02709706,  0.03530728,  0.02912581,  0.0334325 ,  0.0248134 ,
        0.10931424,  0.05964086,  0.02818211,  0.03540075,  0.02764748,
        0.04401503,  0.04449535,  0.03838879,  0.03498374,  0.03167232,
        0.07401014,  0.04591856,  0.01454543,  0.02436758,  0.05240123,
        0.03787222,  0.03696109,  0.037079  ,  0.03937481,  0.03395323])

In [32]:
#Plot of feature importances 
feature_importances = pd.Series(clf.feature_importances_, index = X_train.columns)
fi = feature_importances.sort_values()
fi.plot(kind = "barh", figsize = (7,6))

<matplotlib.axes._subplots.AxesSubplot at 0x293cfc3f2b0>