In [4]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn import neighbors, svm, model_selection, metrics
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_validate, train_test_split
from sklearn.utils import resample
from tqdm import tqdm
import datetime as dt
from technical_indicators import *


pd.set_option("display.max_columns", 15)
pd.set_option("display.width", 500)

# Variables
moving_window = 1000
test_len = 10


def SMA_classification(df, n):
    df['Target ma'] = df['Price'].rolling(window=n, min_periods=n).mean()
    df['Target ma'] = df['Target ma'].round(4)
    df.dropna(inplace=True)
    df['Target'] = np.where(df['Target ma'.format(n)].shift(-n) > df['Target ma'], 1, -1)
    df.drop(columns=['Target ma'], inplace=True)
    df.dropna(inplace=True)

    return df

def data_processing(df):
    
    df = df.replace([np.inf, -np.inf], np.nan)
    # Drop na, nan and nat rows
    df.dropna(inplace=True)
    
    tmp_df = df.copy()
    tmp_df.drop(columns=['Target'], inplace=True)
    
    X = tmp_df.values
    y = df['Target'].values

    return X, y, df


def feature_comparison(original_df, moving_window, test_len):
    
    clf1 = neighbors.KNeighborsClassifier()
    clf2 = svm.SVC(kernel='rbf', C=10)
    clf3 = svm.SVC(kernel='linear', C=10)
    clf4 = RandomForestClassifier(random_state=0)
    clf5 = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier(random_state=0))])
    
    clf_list = [clf1, clf2, clf3, clf4, clf5]
    clf_name_list = ['KNN', 'SVM.SVC.RBF', 'SVM.SVC.Linear', 'RandomForestClassifier', 'VotingClassifier']
    
    accuracy_list = []
    
    for i in range(len(clf_list)):

        df = original_df.copy()
        X, y, df = data_processing(df)
        
        correct = wrong = 0
        
        for j in tqdm(range(X.shape[0]-moving_window-test_len)):

            X_train, X_test = X[np.arange(j,j+moving_window)], X[np.arange(j+moving_window,j+moving_window+test_len)]
            y_train, y_test = y[np.arange(j,j+moving_window)], y[np.arange(j+moving_window,j+moving_window+test_len)]
            
            if len(set(y_train)) == 1:
                for e in y_test:
                    if e == y_train[0]:
                        correct += 1
                    else:
                        wrong += 1
            else:
                clf_list[i].fit(X_train, y_train)
                # Gives deprecation warning here
                accuracy = clf_list[i].score(X_test, y_test)
                prediction = clf_list[i].predict(X_test)

                if accuracy == 1.0:
                    correct += 1
                elif accuracy == 0.0:
                    wrong += 1

            accuracy_list.append(round(correct/(correct+wrong), 3))
        
        print('Mean accuracy of {}: {}'.format(clf_list[i], round(np.mean(accuracy_list), 3)))

        
def build_model(clf, X_train, y_train, X_test, y_test):
    
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    prediction = clf.predict(X_test)
    
    return accuracy, prediction


In [5]:
df = pd.read_csv('Data/EUR_USD_Daily.csv', index_col=0)

# Create labels for training
df = SMA_classification(df, 5)

df.drop(columns=['Change'], inplace=True)

df = moving_average(df, 5)
df = moving_average(df, 8)
df = moving_average(df, 10)
df = moving_average(df, 13)
df = moving_average(df, 15)
df = moving_average(df, 20)
df = moving_average(df, 21)
df.dropna(inplace=True)

In [6]:
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
feature_comparison(df, moving_window, test_len)

100%|█████████████████████████████████████████████████| 3918/3918 [00:08<00:00, 470.44it/s]


Mean accuracy of KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'): 0.762


100%|██████████████████████████████████████████████████| 3918/3918 [02:04<00:00, 31.08it/s]


Mean accuracy of SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False): 0.699


100%|██████████████████████████████████████████████████| 3918/3918 [01:10<00:00, 55.29it/s]


Mean accuracy of SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False): 0.774


100%|██████████████████████████████████████████████████| 3918/3918 [01:54<00:00, 34.36it/s]


Mean accuracy of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False): 0.782


100%|██████████████████████████████████████████████████| 3918/3918 [04:20<00:00, 18.16it/s]


Mean accuracy of VotingClassifier(estimators=[('lsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, m...estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None): 0.796
