In [4]:
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [5]:
with open("synthetic_data.pkl" , 'rb') as f:
    dataset = pickle.load(f)

In [6]:
features, labels = dataset

In [7]:
np.unique(labels, return_counts=True)

(array(['Advance Block', 'Engulfing Bearish', 'Engulfing Bullish',
        'Evening Star', 'Hammer', 'Hanging Man', 'Morning Star'],
       dtype='<U17'),
 array([  47, 2110, 1880,    5,  282,  658,    7]))

In [8]:
X = features.reshape(features.shape[0], -1)

In [9]:
y = (labels == 'Engulfing Bearish').astype(int)

In [10]:
scaler = MinMaxScaler()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3741, 44), (1248, 44), (3741,), (1248,))

In [12]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
with open("scaler.pkl", 'wb') as f:
    pickle.dump(scaler, f)

In [13]:
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)

pred = clf.predict(X_test)
pred_train = clf.predict(X_train)

accuracy_score(y_test, pred), accuracy_score(y_train, pred_train)

(0.8461538461538461, 0.9906442127773323)

In [14]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.677 total time=   1.1s
[CV 4/5] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.655 total time=   1.0s
[CV 1/5] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.677 total time=   2.2s
[CV 3/5] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.735 total time=   2.3s
[CV 5/5] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=200;, score=0.664 total time=   2.3s
[CV 2/5] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=300;, score=0.721 total time=   3.2s
[CV 4/5] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=300;, score=0.659 total time=   3.3s
[CV 1/5] END max_depth=5, max_features=sqrt, min_samples_split=5, n_estimators=100;, score=0.672 total time=   1.1s
[CV 2/5] E

  _data = np.array(data, dtype=dtype, copy=copy,


{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}


In [15]:
print(grid_search.best_params_)

{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}


In [16]:
best_model = grid_search.best_estimator_

In [17]:
pred = best_model.predict(X_test)
accuracy_score(y_test, pred)

0.8189102564102564

In [18]:
train_pred = best_model.predict(X_train)
accuracy_score(y_train, train_pred)

0.967121090617482

In [19]:
with open("engulfing_bearish_rf.pickle", 'wb') as f:
    pickle.dump(best_model, f)

[CV 1/5] END max_depth=15, max_features=sqrt, min_samples_split=5, n_estimators=200;, score=0.789 total time=   4.0s
[CV 3/5] END max_depth=15, max_features=sqrt, min_samples_split=5, n_estimators=200;, score=0.813 total time=   4.3s
[CV 5/5] END max_depth=15, max_features=sqrt, min_samples_split=5, n_estimators=200;, score=0.810 total time=   4.2s
[CV 2/5] END max_depth=15, max_features=sqrt, min_samples_split=5, n_estimators=300;, score=0.810 total time=   6.4s
[CV 4/5] END max_depth=15, max_features=sqrt, min_samples_split=5, n_estimators=300;, score=0.766 total time=   6.5s
[CV 1/5] END max_depth=15, max_features=log2, min_samples_split=2, n_estimators=100;, score=0.798 total time=   1.9s
[CV 2/5] END max_depth=15, max_features=log2, min_samples_split=2, n_estimators=100;, score=0.813 total time=   1.9s
[CV 4/5] END max_depth=15, max_features=log2, min_samples_split=2, n_estimators=100;, score=0.762 total time=   1.8s
[CV 1/5] END max_depth=15, max_features=log2, min_samples_split=