In [221]:
### Core Packages
import pandas as pd
import numpy as np

### Visualization Packages
import matplotlib.pyplot as plt

### Machine Learning Packages
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

### Setting env
import os, sys

folder = os.getcwd()
if(not(folder.endswith('tcc-machine-learning'))):
        os.chdir('..')
folder = os.getcwd()

### Functions
from dags import config
from dags.utils import generate_label

### Others
import warnings

In [2]:
def Average(lst):
    return sum(lst) / len(lst)

In [3]:
ticker = 'petr4.sa'
df_ticker = pd.read_csv(f'data/processed/{ticker}_processed.csv', encoding='utf8', delimiter=',')
df_ticker['date'] = pd.to_datetime(df_ticker['date'])

In [4]:
days = 3
df_ticker_target = df_ticker.loc[df_ticker['date'] >= '2015-01-01']
df_train = generate_label(days, df_ticker_target)

In [5]:
df_train = df_train.set_index('date')

transfor = 'normal'
dir_func = np.log if transfor == 'log' else lambda x:x
inf_func = np.exp if transfor == 'log' else lambda x:x

cols_to_transform = [col for col in df_train.columns if not 'target' in col]
df_train[cols_to_transform] = dir_func(df_train[cols_to_transform])


X = df_train.drop(columns = 'target')
y = df_train['target']
feature_names = X.columns

st_feat = MinMaxScaler()
X = X.sort_index(axis = 1)
X = st_feat.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [6]:
### Feature Selection
extra_tree_forest = ExtraTreesClassifier(n_estimators = 400,
                                         criterion ='entropy',
                                         bootstrap=True,
                                         oob_score=True,
                                         min_samples_leaf=2,
                                         n_jobs=4)
  
extra_tree_forest.fit(X, y)
  
rfe = RFECV(extra_tree_forest, cv=5, scoring="neg_mean_squared_error")
rfe.fit(X_train,y_train)
selected_features = list(np.array(feature_names)[rfe.get_support()])
df_selected = df_train[selected_features + ['target']]


In [204]:
names = [
    "Nearest Neighbors",
    "RBF SVM",
    "Gaussian Process",
    "Random Forest",
    "Extra Trees",
    "Neural Net",
    "AdaBoost",
]

classifiers = [
    KNeighborsClassifier(n_neighbors = 5, weights = "distance", p = 1),
    SVC(kernel= "rbf", gamma = 3.5, C = 1000),
    GaussianProcessClassifier(1*RBF(2.0)),
    RandomForestClassifier(n_estimators=400, min_samples_leaf=2, oob_score=True, bootstrap=True, n_jobs=4),
    ExtraTreesClassifier(n_estimators = 400, criterion ='entropy', min_samples_leaf=2, n_jobs=4),
    MLPClassifier(max_iter = 80000, activation = 'tanh', solver = 'lbfgs', alpha = 0.0001, learning_rate = 'constant'),
    AdaBoostClassifier(n_estimators = 1000, base_estimator = DecisionTreeClassifier(max_depth=9, min_samples_leaf = 2))
]

In [211]:
X = df_selected.drop(columns = 'target')
y = df_selected['target']
feature_names = X.columns

st_feat = MinMaxScaler()
X = X.sort_index(axis = 1)
X = st_feat.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

acc_list = []

kf = KFold(n_splits=10, shuffle=True, random_state=42)

for alg in classifiers:
    
    alg_accuracy = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        
        alg.fit(X_train, y_train)
        
        acc = accuracy_score(y_test, alg.predict(X_test))
    
        alg_accuracy.append(acc)
    
    mean_acc = Average(alg_accuracy)
    acc_list.append(mean_acc)

acc_list

[0.6094736842105263]

In [215]:
classifiers = [
    MLPClassifier(max_iter = 10000, activation = 'tanh', solver = 'lbfgs', alpha = 0.0001, learning_rate = 'constant')
]

X = df_selected.drop(columns = 'target')
y = df_selected['target']
feature_names = X.columns

st_feat = MinMaxScaler()
X = X.sort_index(axis = 1)
X = st_feat.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

acc_list = []

kf = KFold(n_splits=10, shuffle=True, random_state=42)

for alg in classifiers:
    
    alg.fit(X_train, y_train)
    
    acc = accuracy_score(y_test, alg.predict(X_test))

    acc_list.append(acc)

acc_list

[0.5768421052631579]

In [220]:
classifiers = [
    ("KNC", KNeighborsClassifier(n_neighbors = 5, weights = "distance", p = 1)),
    ("SVC", SVC(kernel= "rbf", gamma = 3.5, C = 1000)),
    ("GC", GaussianProcessClassifier(1*RBF(2.0))),
    ("RF", RandomForestClassifier(n_estimators=400, min_samples_leaf=2, oob_score=True, bootstrap=True, n_jobs=4)),
    ("MLP", MLPClassifier(max_iter = 8000, activation = 'tanh', solver = 'lbfgs', alpha = 0.0001, learning_rate = 'constant')),
    ("ADA", AdaBoostClassifier(n_estimators = 1000, base_estimator = DecisionTreeClassifier(max_depth=9, min_samples_leaf = 2)))
]

X = df_selected.drop(columns = 'target')
y = df_selected['target']
feature_names = X.columns

st_feat = MinMaxScaler()
X = X.sort_index(axis = 1)
X = st_feat.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

clf = StackingClassifier(estimators=classifiers, final_estimator = ExtraTreesClassifier(n_estimators = 400, criterion ='entropy', min_samples_leaf=2, n_jobs=4))

clf.fit(X_train, y_train).score(X_test, y_test)

0.6863157894736842

In [257]:
accuracy_score(y_test, clf.predict(X_test))

0.9242105263157895

In [223]:
classifiers = [
    ("KNC", KNeighborsClassifier(n_neighbors = 5, weights = "distance", p = 1)),
    ("SVC", SVC(kernel= "rbf", gamma = 3.5, C = 1000)),
    ("GC", GaussianProcessClassifier(1*RBF(2.0))),
    ("EXTRA", ExtraTreesClassifier(n_estimators = 400, criterion ='entropy', min_samples_leaf=2, n_jobs=4)),
    ("RF", RandomForestClassifier(n_estimators=400, min_samples_leaf=2, oob_score=True, bootstrap=True, n_jobs=4)),
    ("MLP", MLPClassifier(max_iter = 8000, activation = 'tanh', solver = 'lbfgs', alpha = 0.0001, learning_rate = 'constant')),
    ("ADA", AdaBoostClassifier(n_estimators = 1000, base_estimator = DecisionTreeClassifier(max_depth=9, min_samples_leaf = 2)))
]

X = df_selected.drop(columns = 'target')
y = df_selected['target']
feature_names = X.columns

st_feat = MinMaxScaler()
X = X.sort_index(axis = 1)
X = st_feat.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

eclf1 = VotingClassifier(estimators = classifiers, voting='hard')
eclf1 = eclf1.fit(X, y)
hard = accuracy_score(y_test, eclf1.predict(X_test))

In [254]:
hard

1.0

In [251]:

eclf3 = VotingClassifier(estimators = classifiers, voting='hard', weights=[1, 2, 1, 3, 3, 2, 1],  flatten_transform=True)
eclf3 = eclf3.fit(X, y)
weighted2 = accuracy_score(y_test, eclf3.predict(X_test))

In [253]:
weighted2

1.0

In [263]:
classifiers = [
    ("KNC", KNeighborsClassifier(n_neighbors = 5, weights = "distance", p = 1)),
    ("GC", GaussianProcessClassifier(1*RBF(2.0))),
    ("EXTRA", ExtraTreesClassifier(n_estimators = 400, criterion ='entropy', min_samples_leaf=2, n_jobs=4)),
    ("RF", RandomForestClassifier(n_estimators=400, min_samples_leaf=2, oob_score=True, bootstrap=True, n_jobs=4)),
    ("MLP", MLPClassifier(max_iter = 8000, activation = 'tanh', solver = 'lbfgs', alpha = 0.0001, learning_rate = 'constant')),
    ("ADA", AdaBoostClassifier(n_estimators = 1000, base_estimator = DecisionTreeClassifier(max_depth=9, min_samples_leaf = 2)))
]

X = df_selected.drop(columns = 'target')
y = df_selected['target']
feature_names = X.columns

st_feat = MinMaxScaler()
X = X.sort_index(axis = 1)
X = st_feat.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

eclf2 = VotingClassifier(estimators = classifiers, voting='soft')
eclf2 = eclf2.fit(X, y)
soft = accuracy_score(y_test, eclf2.predict(X_test))

In [265]:
soft

1.0

In [267]:
eclf4 = VotingClassifier(estimators = classifiers, voting='hard', weights=[1, 2, 1, 3, 1, 1],  flatten_transform=True)
eclf4 = eclf4.fit(X, y)
weighted = accuracy_score(y_test, eclf4.predict(X_test))

In [None]:
weighted2

In [None]:
print(hard)
print(soft)
print(weighted)