Обучаем модель на нескольких классификаторах простых алгоритмов. Фичи взяты из системы обнаружения свечных паттернов.

In [34]:
# Импортируем модули
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas_ta as ta
import mplfinance as mpf

In [35]:
# Загружаем данные
# Volatility 10 Index_H1_201901010500_202109010000.csv
df = pd.read_csv('~/Python/projects/mlfin/data/Volatility 10 Index_H1_201901010500_202109010000.csv',
                 encoding='utf-8',
                 index_col=False,
                 skiprows=1,
                 header= None,
                 names= ['date','time','open', 'high', 'low', 'close', 'tv', 'spread'],
                 sep='\t')

df['datetime'] = df['date'] + ' ' + df['time']

df.drop(['date', 'time', 'tv', 'spread'], axis= 1, inplace=True)

df.reset_index(drop=True, inplace=True)
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.set_index('datetime')

  return func(*args, **kwargs)


In [36]:
df.head()

Unnamed: 0_level_0,open,high,low,close
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01 05:00:00,7989.803,7990.783,7981.851,7985.697
2019-01-01 06:00:00,7985.807,7994.125,7984.129,7985.746
2019-01-01 07:00:00,7985.621,7991.298,7979.697,7985.078
2019-01-01 08:00:00,7984.991,7999.997,7983.987,7989.836
2019-01-01 09:00:00,7989.806,7992.654,7983.357,7985.492


In [43]:
def add_features(data, dround, barrier, expiration):
    dfd = data.copy()
    
    # Вычисляем преобладающий тренд (по индикатору SuperTrend(12, 1.3))
    dfd.ta.supertrend(high=dfd['high'], low=dfd['low'], close=dfd['close'], length=12, multiplier=1.3, append=True)
    dfd = dfd.drop(['SUPERT_12_1.3', 'SUPERTl_12_1.3', 'SUPERTs_12_1.3'], axis=1)
    dfd = dfd.rename({'SUPERTd_12_1.3':'f_trend'}, axis=1)

    # Вычисляем цвет 0, 1 и 2-й свечи (нулевая свеча - текущая, первая свеча - предыдущая нулевой и тп)
    dfd['f_can_col'] = np.where(dfd['close'] - dfd['open'] >= 0, 1, 0)
    dfd['f_can_col_1'] = np.where(dfd['close'].shift(1) - dfd['open'].shift(1) >= 0, 1, 0)
    dfd['f_can_col_2'] = np.where(dfd['close'].shift(2) - dfd['open'].shift(2) >= 0, 1, 0)

    # Вычисляем размеры 1, 2 и 3-ей свечи (high-low), нормализованные на размеры самой большой свечи из трех
    dfd['f_can_size'] = round(dfd['high'] - dfd['low'], dround)
    dfd['f_can_size_norm_max'] = round(dfd['f_can_size'] / dfd['f_can_size'].rolling(3).max(), dround)
    dfd['f_can_size_norm_max_1'] = round(dfd['f_can_size'].shift(1) / dfd['f_can_size'].shift(1).rolling(3).max(), dround)
    dfd['f_can_size_norm_max_2'] = round(dfd['f_can_size'].shift(2) / dfd['f_can_size'].shift(2).rolling(3).max(), dround)

    # Вычисляем соотношения размеров свечей
    # Соотношение размеров 2-й и 1-й свечи (Соотношение - здесь деление)
    dfd['f_size_ratio_10'] = round(dfd['f_can_size'].shift(1)/dfd['f_can_size'], dround)
    # Соотношение размеров 3-й и 1-й свечи
    dfd['f_size_ratio_20'] = round(dfd['f_can_size'].shift(2)/dfd['f_can_size'], dround)
    # Соотношение размеров 3-й и 2-й свечи
    dfd['f_size_ratio_21'] = round(dfd['f_can_size'].shift(2)/dfd['f_can_size'].shift(1), dround)

    # Вычисляем разрывы между свечами
    # Разрыв между 1-й и 2-й свечами (между low текущей свечи и high предыдущей на растущем тренде
    # и между high текущей свечи и low предыдущей при убывающем тренде)
    dfd['f_gap_01'] = np.where(
        dfd['f_trend'] ==1, round(dfd['low'] - dfd['high'].shift(1), dround),
        round(dfd['high']-dfd['low'].shift(1), dround))

    # Разрыв между 1-й и 3-ей свечами
    dfd['f_gap_02'] = np.where(
        dfd['f_trend'] == 1, round(dfd['low'] - dfd['high'].shift(2), dround),
        round(dfd['high'] - dfd['low'].shift(2), dround))
    # Разрыв между 2-й и 3-й свечами
    dfd['f_gap_12'] = np.where(
        dfd['f_trend'] == 1, round(dfd['low'].shift(1) - dfd['high'].shift(2), dround),
        round(dfd['high'].shift(1) - dfd['low'].shift(2), dround))

    # Вычисляем разрывы между телами свечей
    # Разрыв реального тела между 1-й и 2-й свечами (между open текущей свечи и close предыдущей свечи)
    dfd['f_gap_body_01'] = round(dfd['open'] - dfd['close'].shift(1), dround)
    # Разрыв реального тела между 1-й и 3-ей свечами
    dfd['f_gap_body_02'] = round(dfd['open'] - dfd['close'].shift(2), dround)
    # Разрыв реального тела между 2-й и 3-ей свечами
    dfd['f_gap_body_12'] = round(dfd['open'].shift(1) - dfd['close'].shift(2), dround)

    # Вычисляем наклоны между свечами
    # Наклон между минимальной ценой между 1-й и 2-й свечами
    dfd['f_slope_low_01'] = round(dfd['low'] - dfd['low'].shift(1), dround)
    # Наклон между максимальной ценой между 1-й и 2-й свечами
    dfd['f_slope_high_01'] = round(dfd['high'] - dfd['high'].shift(1), dround)
    # Наклон между минимальной ценой между 1-й и 3-ей свечами
    dfd['f_slope_low_02'] = round((dfd['low'] - dfd['low'].shift(2))/2, dround)
    # Наклон между максимальной ценой между 1-й и 3-ей свечами
    dfd['f_slope_high_02'] = round((dfd['high'] - dfd['high'].shift(2))/2, dround)

    # Реальный размер свечи (тело свечи). Возьму по модулю и добавлю в фичи
    dfd['f_body_size'] = round(abs(dfd['close'] - dfd['open']), dround)
    #dfd['f_body_size'] = round(dfd['close'] - dfd['open'], dround)
    
    # Вычисление нормированных по наибольшему размеру реальных размеров свечи (тел свечи)
    # Реальный размер тела 1-й свечи, нормированный на наибольший реальный размер тела
    dfd['f_normmax_body_size_0'] = round(dfd['f_body_size']/dfd['f_body_size'].rolling(3).max(), dround)
    # Реальный размер тела 2-й свечи, нормированный на наибольший реальный размер тела
    dfd['f_normmax_body_size_1'] = round(dfd['f_body_size'].shift(1)/dfd['f_body_size'].rolling(3).max(), dround)
    # Реальный размер тела 3-й свечи, нормированный на наибольший реальный размер тела
    dfd['f_normmax_body_size_2'] = round(dfd['f_body_size'].shift(2)/dfd['f_body_size'].rolling(3).max(), dround)

    # Вычисление реальных размеров свечей, нормированных общим размером
    # Реальный размер тела 1-й свечи, нормированный на ее общий размер.
    dfd['f_normtotal_body_size_0'] = round(dfd['f_body_size'] / dfd['f_can_size'], dround)
    # Реальный размер тела 2-й свечи, нормированный на ее общий размер.
    dfd['f_normtotal_body_size_1'] = round(dfd['f_body_size'].shift(1) / dfd['f_can_size'].shift(1), dround)
    # Реальный размер тела 3-й свечи, нормированный на ее общий размер.
    dfd['f_normtotal_body_size_2'] = round(dfd['f_body_size'].shift(2) / dfd['f_can_size'].shift(2), dround)

    # Добавляем целевую функцию
    dfd['target'] = np.where(dfd['f_body_size'].shift(-expiration) > barrier, 1, 0)
    
    # Удаляем строки с NaN
    dfd.dropna(inplace=True)

    return dfd


In [44]:
# Создаем датафрейм с признаками
dfm = add_features(df, dround=10, barrier=4.9, expiration=1)

In [45]:
dfm.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23367 entries, 2019-01-01 09:00:00 to 2021-09-01 00:00:00
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   open                     23367 non-null  float64
 1   high                     23367 non-null  float64
 2   low                      23367 non-null  float64
 3   close                    23367 non-null  float64
 4   f_trend                  23367 non-null  int64  
 5   f_can_col                23367 non-null  int64  
 6   f_can_col_1              23367 non-null  int64  
 7   f_can_col_2              23367 non-null  int64  
 8   f_can_size               23367 non-null  float64
 9   f_can_size_norm_max      23367 non-null  float64
 10  f_can_size_norm_max_1    23367 non-null  float64
 11  f_can_size_norm_max_2    23367 non-null  float64
 12  f_size_ratio_10          23367 non-null  float64
 13  f_size_ratio_20          23367 non-null  

In [46]:
dfm.describe()

Unnamed: 0,open,high,low,close,f_trend,f_can_col,f_can_col_1,f_can_col_2,f_can_size,f_can_size_norm_max,...,f_slope_low_02,f_slope_high_02,f_body_size,f_normmax_body_size_0,f_normmax_body_size_1,f_normmax_body_size_2,f_normtotal_body_size_0,f_normtotal_body_size_1,f_normtotal_body_size_2,target
count,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,...,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0,23367.0
mean,7173.97769,7179.96233,7167.891976,7173.928775,-0.006633,0.500321,0.500364,0.500321,12.070354,0.805576,...,-0.051939,-0.051877,6.174945,0.619519,0.619715,0.617881,0.473054,0.473029,0.473021,0.526555
std,645.651958,646.163434,645.05236,645.633592,0.999999,0.500011,0.500011,0.500011,3.842064,0.19451,...,5.068482,5.063043,4.695749,0.351369,0.351782,0.350387,0.263362,0.26335,0.263361,0.499305
min,6063.624,6072.768,6062.719,6063.765,-1.0,0.0,0.0,0.0,3.374,0.20268,...,-23.3645,-25.885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6560.5245,6565.5275,6554.516,6560.544,-1.0,0.0,0.0,0.0,9.297,0.652083,...,-3.383,-3.3925,2.4745,0.292675,0.291932,0.292791,0.253206,0.253206,0.253206,0.0
50%,7239.841,7246.52,7233.41,7239.774,-1.0,1.0,1.0,1.0,11.425,0.845891,...,-0.0225,-0.0635,5.204,0.65114,0.649035,0.63994,0.478671,0.478661,0.478661,1.0
75%,7786.2335,7792.3175,7779.012,7786.2365,1.0,1.0,1.0,1.0,14.166,1.0,...,3.28475,3.277,8.845,1.0,1.0,1.0,0.68791,0.687822,0.687822,1.0
max,8433.942,8434.386,8420.597,8434.212,1.0,1.0,1.0,1.0,40.221,1.0,...,22.2115,21.8215,36.108,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [41]:
#dfm.to_csv('/home/sash/Python/projects/mlfin/data/data_in/candles_pattern_features.csv')

In [47]:
dfm['target'].value_counts(normalize=True)

1    0.526555
0    0.473445
Name: target, dtype: float64

In [10]:
# Проверяем наличие столбцов с пропущенными данными
# dfm.isnull().sum()

In [48]:
# Создаем список столбцов с признаками (начинаются с 'f_')
f_columns = [i for i in dfm.columns.to_list() if i.startswith('f_')]

In [49]:
# Разбиваем выборку на две части. На первой части попробуем отдельно
# комплект простых алгоритмов ML  и Catboost
# На второй части будем пробовать CatBoost с прогнозными значениями алгоритмов,
# обученных на первой части 

X1 = dfm[f_columns][0:11600]
y1 = dfm['target'][0:11600]

X2 = dfm[f_columns][11600:]
y2 = dfm['target'][11600:]

In [50]:
from sklearn import model_selection, metrics

In [51]:
X1_train, X1_test, y1_train, y1_test = model_selection.train_test_split(
    X1, y1, test_size=0.3, random_state=42
)

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool, cv

In [53]:
# Прикидка разных моделей
for model in [
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    RandomForestClassifier,
    CatBoostClassifier
]:
    cls = model()
    kfold = model_selection.KFold(
        n_splits = 10, random_state=42, shuffle=True,
    )
    s = model_selection.cross_val_score(
        cls, X1, y1, scoring='roc_auc', cv=kfold
    )
    print(
        f"{model.__name__:22} AUC: "
        f"{s.mean():.3f} STD: {s.std():.2f}"
    )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression     AUC: 0.497 STD: 0.01
DecisionTreeClassifier AUC: 0.504 STD: 0.01
KNeighborsClassifier   AUC: 0.498 STD: 0.02
GaussianNB             AUC: 0.499 STD: 0.02
RandomForestClassifier AUC: 0.501 STD: 0.01
Learning rate set to 0.028049
0:	learn: 0.6924611	total: 9.2ms	remaining: 9.19s
1:	learn: 0.6918896	total: 19.7ms	remaining: 9.81s
2:	learn: 0.6914054	total: 29.5ms	remaining: 9.79s
3:	learn: 0.6908510	total: 39.7ms	remaining: 9.9s
4:	learn: 0.6903674	total: 51.4ms	remaining: 10.2s
5:	learn: 0.6899348	total: 65.9ms	remaining: 10.9s
6:	learn: 0.6895109	total: 78.4ms	remaining: 11.1s
7:	learn: 0.6890648	total: 90ms	remaining: 11.2s
8:	learn: 0.6886397	total: 100ms	remaining: 11s
9:	learn: 0.6882323	total: 111ms	remaining: 10.9s
10:	learn: 0.6877818	total: 122ms	remaining: 10.9s
11:	learn: 0.6873973	total: 133ms	remaining: 10.9s
12:	learn: 0.6869869	total: 143ms	remaining: 10.8s
13:	learn: 0.6865519	total: 154ms	remaining: 10.8s
14:	learn: 0.6862006	total: 164ms	remaining:

In [85]:
# Результаты не фонтах, хотя особо многого я от них и не ждал в таком виде. 
# Попробуем отдельно catboost для этого набора данных

In [54]:
#learning with train and validation subsets
model = CatBoostClassifier(iterations=1000,
                        depth=6,
                        learning_rate=0.001,
                        custom_loss=['Accuracy'],
                        eval_metric='Accuracy',       
                        verbose=True,
                        random_strength=0.15,
                        use_best_model=True,
                        l2_leaf_reg=30,
                        random_seed=43,
                        task_type='CPU')
model.fit(
    X1_train, y1_train,
    eval_set = (X1_test, y1_test),
    plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.5674877	test: 0.5597701	best: 0.5597701 (0)	total: 17.3ms	remaining: 17.3s
1:	learn: 0.5692118	test: 0.5706897	best: 0.5706897 (1)	total: 31.5ms	remaining: 15.7s
2:	learn: 0.5683498	test: 0.5712644	best: 0.5712644 (2)	total: 42.9ms	remaining: 14.3s
3:	learn: 0.5671182	test: 0.5732759	best: 0.5732759 (3)	total: 54.2ms	remaining: 13.5s
4:	learn: 0.5671182	test: 0.5706897	best: 0.5732759 (3)	total: 64.1ms	remaining: 12.8s
5:	learn: 0.5635468	test: 0.5709770	best: 0.5732759 (3)	total: 81.7ms	remaining: 13.5s
6:	learn: 0.5633005	test: 0.5724138	best: 0.5732759 (3)	total: 97.4ms	remaining: 13.8s
7:	learn: 0.5642857	test: 0.5718391	best: 0.5732759 (3)	total: 109ms	remaining: 13.5s
8:	learn: 0.5645320	test: 0.5715517	best: 0.5732759 (3)	total: 120ms	remaining: 13.2s
9:	learn: 0.5642857	test: 0.5724138	best: 0.5732759 (3)	total: 132ms	remaining: 13.1s
10:	learn: 0.5637931	test: 0.5712644	best: 0.5732759 (3)	total: 151ms	remaining: 13.6s
11:	learn: 0.5634236	test: 0.5712644	best: 0.5

<catboost.core.CatBoostClassifier at 0x7f45c67bb100>

In [45]:
categorial_features = np.where(X1_train.dtypes != np.float)[0]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorial_features = np.where(X1_train.dtypes != np.float)[0]


In [46]:
categorial_features

array([0, 1, 2, 3])

In [55]:
from catboost.utils import get_roc_curve
from sklearn.metrics import auc

In [56]:
validate_pool = Pool(X1_test, y1_test)

In [57]:
curve = get_roc_curve(model, validate_pool)
(fpr, tpr, thresholds) = curve
roc_auc = auc(fpr, tpr)
roc_auc

0.4841096577168891

In [55]:
X1_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3480 entries, 2020-03-17 01:00:00 to 2019-07-16 00:00:00
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   f_trend                  3480 non-null   int64  
 1   f_can_col                3480 non-null   int64  
 2   f_can_col_1              3480 non-null   int64  
 3   f_can_col_2              3480 non-null   int64  
 4   f_can_size               3480 non-null   float64
 5   f_can_size_norm_max      3480 non-null   float64
 6   f_can_size_norm_max_1    3480 non-null   float64
 7   f_can_size_norm_max_2    3480 non-null   float64
 8   f_size_ratio_10          3480 non-null   float64
 9   f_size_ratio_20          3480 non-null   float64
 10  f_size_ratio_21          3480 non-null   float64
 11  f_gap_01                 3480 non-null   float64
 12  f_gap_02                 3480 non-null   float64
 13  f_gap_12                 3480 non-null   f

In [58]:
prediction = model.predict(X1_test)
prediction_proba = model.predict_proba(X1_test)
print(prediction[:10])
print(prediction_proba[:10])

[1 1 1 1 1 1 1 1 1 1]
[[0.4997847  0.5002153 ]
 [0.49994719 0.50005281]
 [0.49977979 0.50022021]
 [0.49960564 0.50039436]
 [0.4997663  0.5002337 ]
 [0.49968404 0.50031596]
 [0.49961792 0.50038208]
 [0.4997898  0.5002102 ]
 [0.49978868 0.50021132]
 [0.49978063 0.50021937]]


In [59]:
print(y1_test[:10])

datetime
2020-03-17 01:00:00    1
2019-06-02 07:00:00    1
2019-05-07 16:00:00    1
2020-02-15 17:00:00    1
2019-04-18 23:00:00    0
2019-10-28 00:00:00    1
2019-03-25 01:00:00    0
2019-02-06 01:00:00    1
2019-07-30 13:00:00    1
2019-01-23 14:00:00    0
Name: target, dtype: int64


In [60]:
from sklearn.metrics import confusion_matrix

In [61]:

cfm = confusion_matrix(y1_test, prediction)

In [62]:
cfm

array([[  36, 1452],
       [  33, 1959]])