In [None]:
import os
import pandas as pd
import logging

# Suppress cmdstanpy log records below WARNING level.
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

# os.path.join keeps the path portable: a literal backslash separator
# only resolves to the data directory on Windows.
data_path = os.path.join("data", "stocks_with_indicators_months_sp490.csv")
df = pd.read_csv(data_path)
df["Date"] = pd.to_datetime(df["Date"])

# Columns removed before modelling (raw OHLCV prices and alternative targets).
drop_list = ["Open", "High", "Low", "Close",
             "Volume", 'future_price', "month_return",
             'annual_return']

# Chain drop/rename so the frame is rebuilt rather than mutated in place;
# "y" / "ds" are the target / timestamp column names used by the cells below.
df = (
    df
    .drop(columns=drop_list)
    .rename(columns={"future_month_price": "y", "Date": "ds"})
)
df.head()

Unnamed: 0,ds,symbol,y,EMA_9,EMA_20,MACD_,MACD_hist_,MACD_signal_,RSI_7,RSI_14,STOCH_k_,STOCH_d_,CCI_14,ATR_10
0,2015-07-02,A,37.819344,36.422359,36.850444,-0.612101,0.079867,-0.691969,50.249128,44.761838,33.973509,24.610816,-13.321123,0.647886
1,2015-07-06,A,37.865509,36.408069,36.802869,-0.579963,0.089605,-0.669568,46.13447,42.707166,47.451993,32.162365,-36.571277,0.636342
2,2015-07-07,A,37.514584,36.476061,36.797646,-0.516495,0.122458,-0.638953,54.61019,47.755268,56.564186,45.996563,16.623668,0.632646
3,2015-07-08,A,37.606934,36.338358,36.701446,-0.537503,0.08116,-0.618663,37.818862,38.84131,42.294775,48.770318,-110.128065,0.669965
4,2015-07-09,A,37.052799,36.259595,36.62936,-0.535313,0.06668,-0.601993,41.262932,40.786909,34.730089,44.529684,-78.728661,0.656257


In [11]:
def select_top_features(df, target_col='y', top_n=None):
    """Rank feature columns by absolute Pearson correlation with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and candidate feature columns.
        Non-numeric columns (e.g. strings, datetimes) are ignored.
    target_col : str, default 'y'
        Name of the (numeric) target column.
    top_n : int or None, default None
        If given, only the `top_n` strongest features are returned;
        ``None`` returns the full ranking.

    Returns
    -------
    pandas.Series
        Absolute correlations indexed by feature name, sorted descending.
        The target itself and the 'ds' / 'symbol' columns are excluded.

    Raises
    ------
    KeyError
        If `target_col` is missing from `df` or is not numeric.
    """
    # numeric_only=True keeps string/datetime columns out of the correlation
    # matrix; without it pandas warns (1.5) or raises ValueError (>= 2.0).
    corr = (
        df.corr(numeric_only=True)[target_col]
        .abs()
        .sort_values(ascending=False)
    )

    # Drop the target and the identifier/time columns if they are present.
    features = corr.drop([target_col, 'ds', 'symbol'], errors='ignore')

    if top_n is not None:
        features = features.head(top_n)
    return features

# Example: rank the features over the whole frame
# (no per-symbol filter is applied before this call).
top_features = select_top_features(df)
print("Top features:", top_features)

Top features: EMA_9           0.991545
EMA_20          0.990562
ATR_10          0.820886
MACD_signal_    0.277524
MACD_           0.259215
RSI_7           0.017829
CCI_14          0.016178
STOCH_d_        0.014086
STOCH_k_        0.013531
MACD_hist_      0.008424
Name: y, dtype: float64


  corr = df.corr()[target_col].abs().sort_values(ascending=False)


In [26]:
import logging
import optuna

# Suppress Optuna log records below WARNING level.
logging.getLogger("optuna").setLevel(logging.WARNING)

In [85]:
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
import itertools

import optuna
from prophet import Prophet
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np
from tqdm import tqdm
from catboost import CatBoostRegressor

# Fixed Prophet constructor settings (passed as Prophet(**best_params)).
best_params = dict(
    changepoint_prior_scale=0.1,
    seasonality_prior_scale=0.1,
    seasonality_mode='additive',
    yearly_seasonality=True,
)

def objective(trial, df):
    """Optuna objective for Prophet: mean R2 over TimeSeriesSplit(n_splits=5) folds.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the Prophet hyperparameters.
    df : pandas.DataFrame
        Frame passed to Prophet's fit/predict; must contain a 'y' column.

    Returns
    -------
    float
        Mean of the per-fold R2 scores.
    """
    # Sample in a fixed order so the trial records the parameters consistently.
    cp_scale = trial.suggest_categorical('changepoint_prior_scale', [0.001, 0.01, 0.1, 0.5])
    season_scale = trial.suggest_categorical('seasonality_prior_scale', [0.01, 0.1, 1.0, 10.0])
    season_mode = trial.suggest_categorical('seasonality_mode', ['additive', 'multiplicative'])
    yearly = trial.suggest_categorical('yearly_seasonality', [True, False])

    prophet_kwargs = dict(
        changepoint_prior_scale=cp_scale,
        seasonality_prior_scale=season_scale,
        seasonality_mode=season_mode,
        yearly_seasonality=yearly,
        weekly_seasonality=False,
        daily_seasonality=False,
    )

    def _fold_r2(fit_idx, holdout_idx):
        # Fit on one fold's training rows and score on its held-out rows.
        fit_part = df.iloc[fit_idx]
        holdout_part = df.iloc[holdout_idx]

        fitted = Prophet(**prophet_kwargs)
        fitted.fit(fit_part)

        predicted = fitted.predict(holdout_part)['yhat']
        return r2_score(holdout_part['y'], predicted)

    splitter = TimeSeriesSplit(n_splits=5)
    fold_scores = [_fold_r2(fit_idx, holdout_idx)
                   for fit_idx, holdout_idx in splitter.split(df)]
    return np.mean(fold_scores)

def _slice_period(frame, period):
    """Return the rows of `frame` whose 'ds' lies within `period` (start, end), inclusive."""
    start, end = period
    return frame.loc[(frame.ds >= start) & (frame.ds <= end)]


def _regression_metrics(y_true, y_pred, suffix):
    """Return MAE / RMSE / MAPE / R2 for one split, with keys suffixed by `suffix`."""
    return {
        f'MAE_{suffix}': mean_absolute_error(y_true, y_pred),
        # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and removed
        # in recent scikit-learn releases, while this form works on every version.
        f'RMSE_{suffix}': float(np.sqrt(mean_squared_error(y_true, y_pred))),
        f'MAPE_{suffix}': mean_absolute_percentage_error(y_true, y_pred),
        f'R2_{suffix}': r2_score(y_true, y_pred),
    }


def walk_forward_validation_by_symbol(df, model_type='prophet', train_years=('2015-01-01', '2017-12-31'),
                                     val_years=('2018-01-01', '2018-12-31'), test_years=('2019-01-01', '2019-12-31'),
                                     params_prophet=None, params_catboost=None):
    """Fit one model per symbol on a train window and score it on train/val/test windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'symbol', 'ds' and 'y'; every other column is used as a
        feature (extra regressor for Prophet, input column for CatBoost).
    model_type : {'prophet', 'catboost'}
        Which model family to fit.
    train_years, val_years, test_years : tuple of (start, end)
        Inclusive date bounds compared against the 'ds' column.
    params_prophet : dict or None
        Keyword arguments for ``Prophet``. When empty/None the module-level
        ``best_params`` is used.
    params_catboost : dict or None
        Keyword arguments for ``CatBoostRegressor``. When None,
        ``{"n_estimators": 500, "verbose": 0}`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per symbol with MAE/RMSE/MAPE/R2 columns for each split
        (suffixes ``_train``, ``_val``, ``_test``). Symbols lacking data in
        any of the three windows are reported via print and skipped.

    Raises
    ------
    ValueError
        If `model_type` is not 'prophet' or 'catboost'.

    Side effects
    ------------
    For 'catboost', each fitted model is written to
    ``models/catboost_<symbol>.cbm`` (the directory is created if missing).
    """
    import os  # local import keeps this cell runnable on its own

    if model_type not in ('prophet', 'catboost'):
        # Fail early instead of hitting a NameError on the prediction variables.
        raise ValueError(f"Unknown model_type: {model_type!r} (expected 'prophet' or 'catboost')")

    # Resolve defaults here rather than in the signature to avoid shared mutable defaults.
    prophet_kwargs = params_prophet if params_prophet else best_params
    if params_catboost is None:
        # 'n_estimators' is the CatBoost alias of 'iterations'.
        catboost_kwargs = {"n_estimators": 500, "verbose": 0}
    else:
        catboost_kwargs = params_catboost

    feature_cols = [col for col in df.columns if col not in ['symbol', 'y', 'ds']]

    if model_type == 'catboost':
        # save_model does not create the target directory itself.
        os.makedirs("models", exist_ok=True)

    symbols = df['symbol'].unique()
    results = {}

    for symbol in tqdm(symbols):
        symbol_df = df[df['symbol'] == symbol]

        # Split into train, validation and test windows.
        train = _slice_period(symbol_df, train_years)
        val = _slice_period(symbol_df, val_years)
        test = _slice_period(symbol_df, test_years)

        if len(train) == 0 or len(val) == 0 or len(test) == 0:
            print(f"Недостаточно данных для символа {symbol}")
            continue

        if model_type == 'prophet':
            model = Prophet(**prophet_kwargs)
            # Every feature column becomes an extra regressor.
            for reg in feature_cols:
                model.add_regressor(reg)

            model.fit(train)

            train_pred = model.predict(train)['yhat']
            val_pred = model.predict(val)['yhat']
            test_pred = model.predict(test)['yhat']

        else:  # 'catboost'
            model = CatBoostRegressor(**catboost_kwargs)
            model.fit(train[feature_cols], train["y"])

            model_path = f"models/catboost_{symbol}.cbm"
            model.save_model(str(model_path))

            train_pred = model.predict(train[feature_cols])
            val_pred = model.predict(val[feature_cols])
            test_pred = model.predict(test[feature_cols])

        results[symbol] = {
            **_regression_metrics(train['y'], train_pred, 'train'),
            **_regression_metrics(val['y'], val_pred, 'val'),
            **_regression_metrics(test['y'], test_pred, 'test'),
        }

    return pd.DataFrame(results).T

# Пример использования
#features = top_features.index[:10].to_list() + ["y", "ds", "symbol"]

# result_pr = walk_forward_validation_by_symbol(
#     df,
#     model_type='catboost',
#     train_years=('2015-01-01', '2020-12-31'),  # 6 лет обучения
#     val_years=('2021-01-01', '2022-12-31'),    # 2 года валидации
#     test_years=('2023-01-01', '2023-12-31'),   # 1 года тестирования
# )

In [66]:
# Symbols whose R2 exceeds 0.1 on both validation and test, best test R2 first.
(
    result_pr
    .loc[lambda m: (m["R2_test"] > 0.1) & (m["R2_val"] > 0.1)]
    .sort_values(["R2_test", "R2_val"], ascending=False)
    .loc[:, ["R2_val", "R2_test"]]
)

Unnamed: 0,R2_val,R2_test
FMC,0.130282,0.741993
AKAM,0.579447,0.729639
INTC,0.727137,0.661575
ILMN,0.496405,0.658594
CPB,0.52959,0.64628
TTWO,0.64408,0.607808
DLR,0.250167,0.50613
RCL,0.330059,0.426828
LVS,0.659088,0.409219
TPR,0.216685,0.401429


Теперь подберём гиперпараметры CatBoost на котировках INTC — тикера с наиболее стабильным качеством прогноза — и сохраним модели, обученные с найденными параметрами.

In [69]:
# Inclusive date windows for fitting and for validation.
train_years = ('2015-01-01', '2020-12-31')
val_years = ('2021-01-01', '2022-12-31')

# Restrict to a single ticker, then cut it into the two windows by 'ds'.
symbol_df = df.loc[df['symbol'] == "INTC"]
train = symbol_df.loc[
    lambda part: (part['ds'] >= train_years[0]) & (part['ds'] <= train_years[1])
]
val = symbol_df.loc[
    lambda part: (part['ds'] >= val_years[0]) & (part['ds'] <= val_years[1])
]

# Feature matrices: everything except the timestamp, ticker and target.
X_train, X_val = (
    part.drop(columns=['ds', 'symbol', 'y']) for part in (train, val)
)
y_train = train['y']
y_val = val['y']

In [70]:
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.datasets import make_regression

# Optuna objective for CatBoost. It reads X_train / y_train / X_val / y_val
# from the notebook namespace. NOTE: this rebinds the name `objective`,
# replacing the earlier two-argument Prophet objective defined in this notebook.
def objective(trial):
    """Fit a CatBoostRegressor with sampled hyperparameters and return its validation R2.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters.

    Returns
    -------
    float
        R2 of the predictions on (X_val, y_val); the study maximizes it.
    """
    # Keyword order fixes the order in which the trial samples the parameters.
    search_space = dict(
        iterations=trial.suggest_int('iterations', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True),
        random_strength=trial.suggest_float('random_strength', 1e-5, 10, log=True),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.0),
        border_count=trial.suggest_int('border_count', 32, 255),
        verbose=False,  # no training output
    )

    # Train on the fitting window only.
    regressor = CatBoostRegressor(**search_space)
    regressor.fit(X_train, y_train)

    # Score on the held-out validation window.
    return r2_score(y_val, regressor.predict(X_val))

# Run the hyperparameter search, maximizing the validation R2.
# The sampler is seeded so the sequence of suggested trials is reproducible
# after a kernel restart; TPESampler is Optuna's default sampling algorithm,
# so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)  # number of trials

# Report the best trial.
print("Лучшие гиперпараметры:", study.best_params)
print("Лучший R2-score:", study.best_value)

Лучшие гиперпараметры: {'iterations': 341, 'learning_rate': 0.013018333801783908, 'depth': 4, 'l2_leaf_reg': 0.38037483754702794, 'random_strength': 0.834055227554984, 'bagging_temperature': 0.18298898966937419, 'border_count': 125}
Лучший R2-score: 0.8194532254803761


In [79]:
# CatBoost settings fixed to the values reported by the Optuna search output,
# plus verbose=0 to silence training output.
params = dict(
    iterations=341,
    learning_rate=0.013018333801783908,
    depth=4,
    l2_leaf_reg=0.38037483754702794,
    random_strength=0.834055227554984,
    bagging_temperature=0.18298898966937419,
    border_count=125,
    verbose=0,
)
params

{'iterations': 341,
 'learning_rate': 0.013018333801783908,
 'depth': 4,
 'l2_leaf_reg': 0.38037483754702794,
 'random_strength': 0.834055227554984,
 'bagging_temperature': 0.18298898966937419,
 'border_count': 125,
 'verbose': 0}

In [86]:
# Symbols whose R2 exceeds 0.1 on both validation and test in the previous run.
top_symbols = (
    result_pr
    .loc[lambda m: (m["R2_test"] > 0.1) & (m["R2_val"] > 0.1)]
    .index
    .tolist()
)

df_s = df.loc[df["symbol"].isin(top_symbols)]

# NOTE: result_pr is overwritten by this run, so re-executing the cell
# selects top_symbols from the new results rather than the previous ones.
result_pr = walk_forward_validation_by_symbol(
    df_s,
    model_type='catboost',
    params_catboost=params,
    train_years=('2015-01-01', '2020-12-31'),  # 6 years of training
    val_years=('2021-01-01', '2022-12-31'),    # 2 years of validation
    test_years=('2023-01-01', '2023-12-31'),   # 1 year of testing
)

100%|██████████| 21/21 [00:06<00:00,  3.13it/s]


In [None]:
# Symbols whose R2 exceeds 0.1 on both validation and test, best test R2 first.
(
    result_pr
    .loc[lambda m: (m["R2_test"] > 0.1) & (m["R2_val"] > 0.1)]
    .sort_values(["R2_test", "R2_val"], ascending=False)
    .loc[:, ["R2_val", "R2_test"]]
)

Unnamed: 0,R2_val,R2_test
ILMN,0.510634,0.700136
INTC,0.819453,0.694785
CPB,0.614011,0.63752
AKAM,0.511781,0.598737
TTWO,0.706642,0.589187
DLR,0.205978,0.586605
TPR,0.364495,0.569568
RCL,0.330834,0.518104
AMZN,0.46319,0.447142
CMCSA,0.504245,0.444278


In [88]:
# All symbols present in the latest result table.
list(result_pr.index)

['ADBE',
 'AKAM',
 'AMZN',
 'BWA',
 'CMCSA',
 'CPB',
 'CRM',
 'CTSH',
 'DLR',
 'ECL',
 'GPN',
 'HAL',
 'HII',
 'ILMN',
 'INTC',
 'LVS',
 'META',
 'NFLX',
 'RCL',
 'TPR',
 'TTWO']