#### This notebook is about analysing how models perform using different sets of features and machine learning techniques after obtaining the optimal hyperparameters settings for the machine learning techniques. The target is smoothed. (Currently set to 5 days average)

In [410]:
%matplotlib inline
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import warnings
from collections import Counter
from sklearn import neighbors, svm, model_selection, metrics, linear_model
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_validate, train_test_split
from sklearn.utils import resample
from sklearn import preprocessing
from tqdm import tqdm
import datetime as dt
import mpld3
from collections import Counter

mpld3.enable_notebook()
from statistical_metrics import *
from technical_indicators import *

In [411]:
pd.set_option("display.max_columns", 15)
pd.set_option("display.width", 500)

warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [412]:
def classification(df, n):
    df['Next_change'] = df['Change'].shift(-n)
    df.dropna(inplace=True)
    df['Target'] = np.where(df['Next_change'] > 0.0, 1, -1)
    df.drop(columns='Next_change', inplace=True)

    return df

def SMA_classification(df, n):
    df['Target ma'] = df['Price'].rolling(window=n, min_periods=n).mean()
    df['Target ma'] = df['Target ma'].round(4)
    df.dropna(inplace=True)
    df['Target'] = np.where(df['Target ma'.format(n)].shift(-n) > df['Target ma'], 1, -1)
    df.drop(columns=['Target ma'], inplace=True)
    df.dropna(inplace=True)

    return df

df = pd.read_csv('Data/EUR_USD_1418.csv', index_col=0)

look_ahead = 10
# Create labels for training
df = SMA_classification(df, look_ahead)

df.dropna(inplace=True)

df.drop(columns=['Change'], inplace=True)

df = moving_average(df, 5)
df = moving_average(df, 8)
df = moving_average(df, 10)
df = moving_average(df, 13)
df = moving_average(df, 15)
df = moving_average(df, 20)
df = moving_average(df, 21)
df = EMA(df, 5)
df = EMA(df, 10)
df = EMA(df, 15)
df = EMA(df, 20)
df = MACD(df, 12, 26, 9)
df = MACD(df, 21, 55, 13)
df = stochastic_oscillator(df, 5)
df = stochastic_oscillator(df, 8)
df = stochastic_oscillator(df, 10)
df = stochastic_oscillator(df, 13)
df = stochastic_oscillator(df, 15)
df = stochastic_oscillator(df, 20)
df = stochastic_oscillator(df, 21)
df = williams_R(df, 5)
df = williams_R(df, 7)
df = williams_R(df, 8)
df = williams_R(df, 10)
df = williams_R(df, 13)
df = williams_R(df, 14)
df = williams_R(df, 15)
df = williams_R(df, 20)
df = williams_R(df, 21)
df = relative_strength_index(df, 5)
df = relative_strength_index(df, 7)
df = relative_strength_index(df, 8)
df = relative_strength_index(df, 10)
df = relative_strength_index(df, 13)
df = relative_strength_index(df, 14)
df = relative_strength_index(df, 15)
df = relative_strength_index(df, 20)
df = relative_strength_index(df, 21)
df = momentum(df, 5)
df = momentum(df, 7)
df = momentum(df, 8)
df = momentum(df, 10)
df = momentum(df, 13)
df = momentum(df, 14)
df = momentum(df, 15)
df = momentum(df, 20)
df = momentum(df, 21)
df = change_rate(df, 1)
df = change_rate(df, 2)
df = change_rate(df, 3)
df = change_rate(df, 4)
df = change_rate(df, 5)
df = average_true_range(df, 7)
df = average_true_range(df, 14)
df = average_true_range(df, 21)
df = average_directional_index_and_DI(df, 5)
df = average_directional_index_and_DI(df, 7)
df = average_directional_index_and_DI(df, 8)
df = average_directional_index_and_DI(df, 10)
df = average_directional_index_and_DI(df, 13)
df = average_directional_index_and_DI(df, 14)
df = average_directional_index_and_DI(df, 15)
df = average_directional_index_and_DI(df, 20)
df = average_directional_index_and_DI(df, 21)
df.dropna(inplace=True)

print('Classification number:', Counter(df['Target']))
target_counts = dict(Counter(df['Target']).items())
print('Downtrend:', round(target_counts.get(-1)/df['Target'].shape[0], 3))
print('Uptrend:', round(target_counts.get(1)/df['Target'].shape[0], 3))

Classification number: Counter({-1: 680, 1: 544})
Downtrend: 0.556
Uptrend: 0.444


In [413]:
# Feature sets

def feature_set1(df):
    new_df = df[['%R W10', '%R W8', '%K_W5', '%K_W8', '%R W15', '%R W5', '%R W7', '+DI 5', '%K_W10', 
                 '%R W13', 'change_rate 4', '%K_W15', 'Target']].copy()
        
    return new_df

def feature_set2(df):
    new_df = df[['change_rate 3', 'MACD_12,26,9', 'momentum 13', 'change_rate 4', '%D_W8', 'RSI 21', '%K_W5', '-DI 14', 
                 '-DI 13', 'RSI 20', '-DI 15', 'change_rate 2', 'ADX 15', 'ATR_scaled 21', 'ATR 21', 'Target']].copy()
    
    return new_df

def feature_set3(df):
    new_df = df[['21ma', 'ADX 21', 'ATR 21', 'ADX 20', 'High', '10ma', 'MACD_21,55,13', '%K_W5', '%R W5', 'change_rate 2', 
                 'change_rate 3', '-DI 15', 'Target']].copy()
    
    return new_df

# def feature_set1(df):
#     new_df = df[['%K_W5', '%R W5', '%D_W5', '%R W8', 'momentum 5', 'change_rate 3', 'RSI 13', '%K_W8', 'change_rate 4', 
#                  '%R W10', 'Target']].copy()
        
#     return new_df

# def feature_set2(df):
#     new_df = df[['change_rate 3', 'change_rate 4', 'ATR_scaled 21', 'ATR 21', 'change_rate 1', 'Low', '+DI 21', 
#                  'MACD_21,55,13', '%D_W20', 'change_rate 5', 'Target']].copy()
        
#     return new_df

# def feature_set3(df):
#     new_df = df[['change_rate 3', 'ATR 21', 'ADX 10', 'MACD_21,55,13', 'change_rate 4', '5ma', 'change_rate 1', 
#                  'ATR 7', 'ATR_scaled 21', 'ADX 5', 'Target']].copy()
        
#     return new_df

def feature_set4(df):
    new_df = df[['Open', 'High', 'Low', 'Price', '5ma', '8ma', '10ma', '13ma', '15ma', '20ma', '21ma', 
                 'Target']].copy()
    
    return new_df

def feature_set5(df):
    new_df = df[['Open', 'High', 'Low', 'Price', 'ema5', 'ema10', 'ema15', 'ema20', 'Target']].copy()
    
    return new_df

def feature_set6(df):
    new_df = df[['%R W5', '%K_W5', '%R W7', '%R W8', '%K_W8', '%R W10', '%K_W10', 'Target']].copy()
        
    return new_df


def feature_set7(df):
    new_df = df[['Open', 'High', 'Low', 'Price', 'MACD_12,26,9', 'Target']].copy()
        
    return new_df


def feature_set8(df):
    new_df = df[['%K_W5', '%D_W5', '%D_W13', '%D_W15', '%R W5', '%R W21', 'momentum 5', 'momentum 13', 
                 'momentum 20', 'momentum 21', 'change_rate 1', 'change_rate 2', 'change_rate 3', 
                 'change_rate 4', 'ATR_scaled 7', '-DI 5', '-DI 7', '+DI 10', '-DI 20', 'ADX 21', 'Target']].copy()
    
    return new_df


switch = {
    'feature_set1': feature_set1,
    'feature_set2': feature_set2,
    'feature_set3': feature_set3,
    'test_set1': feature_set4,
    'test_set2': feature_set5,
    'test_set3': feature_set6,
    'test_set4': feature_set7,
    'test_set5': feature_set8
}

In [414]:
def feature_selector(df, feature_set):
    
    df = switch[feature_set](df)
#     df.dropna(inplace=True)
    return df

In [415]:
def data_processing(df, feature_set):
    df = feature_selector(df, feature_set)
    
    df = df.replace([np.inf, -np.inf], np.nan)
    # Drop na, nan and nat rows
    df.dropna(inplace=True)
    
    tmp_df = df.copy()
    tmp_df.drop(columns=['Target'], inplace=True)
    
    X = tmp_df.values
    y = df['Target'].values

    return X, y, df

In [416]:
def build_model(clf, X_train, y_train, X_test, y_test):
    
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    prediction = clf.predict(X_test)
    
    return accuracy, prediction


In [417]:
def feature_comparison(original_df, split_pct, feature_list):
    
    clf1 = neighbors.KNeighborsClassifier()
    clf2 = svm.SVC(kernel='rbf', C=10)
    clf3 = svm.SVC(kernel='linear', C=10)
    clf4 = RandomForestClassifier(random_state=0)
    clf5 = VotingClassifier([('lsvc', svm.SVC(kernel='linear', C=10)),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf6 = VotingClassifier([('rsvc', svm.SVC(kernel='rbf', C=10)),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf7 = VotingClassifier([('lsvc', svm.SVC(kernel='linear', C=10)),
                            ('rfor', RandomForestClassifier())])
    clf8 = VotingClassifier([('rsvc', svm.SVC(kernel='rbf', C=10)),
                            ('rfor', RandomForestClassifier())])
    
#     clf1 = VotingClassifier([('lsvc', svm.SVC(kernel='linear', C=10)),
#                             ('knn', neighbors.KNeighborsClassifier()),
#                             ('rfor', RandomForestClassifier())])
#     clf2 = svm.SVC(kernel='rbf', C=10)
#     clf3 = RandomForestClassifier(random_state=0)
    
    
    clf_list = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8]
#     clf_list = [clf1, clf2, clf3]
    clf_name_list = ['KNN', 'SVM.SVC.RBF', 'SVM.SVC.Linear', 'RandomForestClassifier', 'VotingClassifier(knn,lsvc,rfor)', 
                     'VotingClassifier(knn,rsvc,rfor)', 'VotingClassifier(lsvc,rfor)', 'VotingClassifier(rsvc,rfor)']
    
#     clf_name_list = ['VotingClassifier(knn,lsvc,rfor)', 'SVM.SVC.RBF', 'RandomForestClassifier']
    
    accuracy_list = []
    
    for n in range(len(feature_list)):
        for i in range(len(clf_list)):

            df = original_df.copy()
            X, y, df = data_processing(df, feature_list[n])

            split_pct = split_pct
            train_size = int(X.shape[0] * split_pct)

            X_train, X_test = X[0:train_size], X[train_size:]
            y_train, y_test = y[0:train_size], y[train_size:]

            accuracy, prediction = build_model(clf_list[i], X_train, y_train, X_test, y_test)

            print('clf={} feature_set={}'.format(clf_name_list[i], feature_list[n]))
            print('Accuracy:', round(accuracy, 4))
            print("Confusion matrix: ")
            print('{}\n'.format(metrics.confusion_matrix(y_test, prediction)))

            accuracy_list.append([round(accuracy, 4), 
                                  'clf={}, feature_set={}'.format(clf_name_list[i], feature_list[n])])

    accuracy_list = list(reversed(sorted(accuracy_list)))
    print('Highest accuracy: {}, Lowest accuracy: {}'.format(accuracy_list[0][0], accuracy_list[-1][0]))
    selected_model = round(0.2 * len(accuracy_list))
    for i in range(selected_model):
        print('Name: {}, Accuracy: {}'.format(accuracy_list[i][1], accuracy_list[i][0]))

In [418]:
# feature_list = ['feature_set1', 'feature_set2', 'feature_set3', 'test_set1', 'test_set2', 'test_set3', 
#                 'test_set4', 'test_set5']
feature_list = ['feature_set1', 'feature_set2', 'feature_set3']
# feature_list = ['feature_set1']
feature_comparison(df, 0.8, feature_list)


clf=KNN feature_set=feature_set1
Accuracy: 0.6163
Confusion matrix: 
[[96 64]
 [30 55]]

clf=SVM.SVC.RBF feature_set=feature_set1
Accuracy: 0.6531
Confusion matrix: 
[[98 62]
 [23 62]]

clf=SVM.SVC.Linear feature_set=feature_set1
Accuracy: 0.6653
Confusion matrix: 
[[105  55]
 [ 27  58]]

clf=RandomForestClassifier feature_set=feature_set1
Accuracy: 0.6327
Confusion matrix: 
[[114  46]
 [ 44  41]]

clf=VotingClassifier(knn,lsvc,rfor) feature_set=feature_set1
Accuracy: 0.6612
Confusion matrix: 
[[104  56]
 [ 27  58]]

clf=VotingClassifier(knn,rsvc,rfor) feature_set=feature_set1
Accuracy: 0.6776
Confusion matrix: 
[[106  54]
 [ 25  60]]

clf=VotingClassifier(lsvc,rfor) feature_set=feature_set1
Accuracy: 0.6735
Confusion matrix: 
[[122  38]
 [ 42  43]]

clf=VotingClassifier(rsvc,rfor) feature_set=feature_set1
Accuracy: 0.6816
Confusion matrix: 
[[116  44]
 [ 34  51]]

clf=KNN feature_set=feature_set2
Accuracy: 0.4939
Confusion matrix: 
[[77 83]
 [41 44]]

clf=SVM.SVC.RBF feature_set=featu