In [49]:
import os
import pandas as pd
import logging

# Silence cmdstanpy log records below WARNING level.
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

# Build the path with os.path.join: a literal backslash is a path separator
# only on Windows, so the raw-string form breaks on Linux/macOS.
df = pd.read_csv(os.path.join("data", "stocks_months_data.csv"))
df["Date"] = pd.to_datetime(df["Date"])

# Columns removed before modelling.
# NOTE(review): `month_return` becomes the target `y`, while `target_return`
# and `next_month_close` (presumably the forward-looking values) are dropped.
# Confirm that the remaining indicator columns are not computed from the same
# month's close that defines `month_return`.
drop_list = ["Open", "High", "Low", "Close",
             "Volume", "next_month_close", "target_return"]

# Chained, non-inplace transformation; `ds`/`y` are the column names Prophet expects.
df = (
    df.drop(columns=drop_list)
      .rename(columns={"month_return": "y", "Date": "ds"})
)
df.head()

Unnamed: 0,ds,symbol,y,ema_3,ema_6,ema_12,ema_24,rsi_3,rsi_7,rsi_14,macd_fast,macd_slow,atr_7,atr_14,obv_short,cmf_short
0,2017-07-01,A,0.082455,55.489123,53.160177,46.536876,43.520215,85.571138,81.950379,76.38866,3.944818,7.004007,3.603024,3.728618,323835200.0,0.348466
1,2017-08-01,A,-0.008035,58.271774,55.415676,48.065039,44.922952,93.828991,87.100763,80.413313,4.356514,7.416387,3.986071,3.938209,367257800.0,0.383057
2,2017-09-01,A,0.059657,59.417822,56.886589,49.380705,46.174225,86.038235,84.14606,78.88591,4.467989,7.615828,3.781354,3.825643,339128000.0,0.240049
3,2017-10-01,A,0.019883,61.797389,58.969551,50.938205,47.614444,92.718114,87.72434,81.650471,4.771016,7.973518,3.823919,3.846039,370600500.0,0.339357
4,2017-11-01,A,-0.032784,63.625192,60.821963,52.466078,49.041528,94.19056,88.768809,82.520919,4.968774,8.264686,3.910642,3.892773,412584200.0,0.284746


In [41]:
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
import itertools

import optuna
from prophet import Prophet
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np
from tqdm import tqdm
from catboost import CatBoostRegressor

# Prophet settings used by walk_forward_validation_by_symbol when
# model_type='prophet'.
best_params = dict(
    changepoint_prior_scale=0.1,
    seasonality_prior_scale=0.1,
    seasonality_mode='additive',
    yearly_seasonality=True,
)

def objective(trial, df):
    """Optuna objective: mean R2 of a Prophet model over 5 time-series folds.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the four categorical Prophet hyperparameters.
    df : pandas.DataFrame
        Frame passed to ``Prophet.fit`` / ``Prophet.predict``; must contain a
        ``y`` column. Folds are taken by row position, so this
        assumes rows are already in chronological order — TODO confirm at call site.

    Returns
    -------
    float
        Mean of the per-fold R2 scores (higher is better).
    """
    # Tuned hyperparameters, sampled in a fixed order.
    search_space = (
        ('changepoint_prior_scale', [0.001, 0.01, 0.1, 0.5]),
        ('seasonality_prior_scale', [0.01, 0.1, 1.0, 10.0]),
        ('seasonality_mode', ['additive', 'multiplicative']),
        ('yearly_seasonality', [True, False]),
    )
    params = {
        name: trial.suggest_categorical(name, choices)
        for name, choices in search_space
    }
    # Weekly and daily seasonality are always switched off.
    params.update(weekly_seasonality=False, daily_seasonality=False)

    fold_scores = []
    for fit_rows, holdout_rows in TimeSeriesSplit(n_splits=5).split(df):
        holdout = df.iloc[holdout_rows]

        model = Prophet(**params)
        model.fit(df.iloc[fit_rows])

        predicted = model.predict(holdout)['yhat']
        fold_scores.append(r2_score(holdout['y'], predicted))

    return np.mean(fold_scores)

def _regression_metrics(y_true, y_pred, suffix):
    """Return MAE, RMSE, MAPE and R2 for one split, with keys suffixed by ``suffix``."""
    return {
        f'MAE_{suffix}': mean_absolute_error(y_true, y_pred),
        # Take the square root explicitly: the ``squared=False`` keyword of
        # mean_squared_error is deprecated/removed in recent scikit-learn.
        f'RMSE_{suffix}': np.sqrt(mean_squared_error(y_true, y_pred)),
        f'MAPE_{suffix}': mean_absolute_percentage_error(y_true, y_pred),
        f'R2_{suffix}': r2_score(y_true, y_pred),
    }


def _slice_by_date(frame, bounds):
    """Return the rows of ``frame`` whose ``ds`` lies in ``[bounds[0], bounds[1]]``."""
    return frame[(frame.ds >= bounds[0]) & (frame.ds <= bounds[1])]


def walk_forward_validation_by_symbol(df, model_type='prophet', train_years=('2015-01-01', '2017-12-31'),
                                     val_years=('2018-01-01', '2018-12-31'), test_years=('2019-01-01', '2019-12-31'),
                                     params_prophet=None, params_catboost=None,
                                     save_model=0):
    """Fit one model per symbol on a fixed train window and score train/val/test.

    Despite the name, a single fixed split per symbol is used (no rolling
    re-fitting): the model is fitted on the train window only and evaluated on
    all three windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``symbol``, ``ds`` and ``y``; every other column is used
        as a feature (CatBoost) or as an extra regressor (Prophet).
    model_type : {'prophet', 'catboost'}
        Which model to fit.
    train_years, val_years, test_years : tuple
        Inclusive ``(start, end)`` bounds compared against ``ds``.
    params_prophet : dict, optional
        Keyword arguments for ``Prophet``. When omitted or empty, the
        module-level ``best_params`` is used.
    params_catboost : dict, optional
        Keyword arguments for ``CatBoostRegressor``. When omitted,
        ``{"n_estimators": 500, "verbose": 0}`` is used.
    save_model : int
        When equal to 1 (CatBoost only), each fitted model is written to
        ``models_m/catboost_<symbol>.cbm``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated symbol; columns are MAE/RMSE/MAPE/R2 for the
        train, val and test windows. Symbols with an empty window are skipped.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    if model_type not in ('prophet', 'catboost'):
        # Fail early instead of hitting an undefined `train_pred` further down.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'prophet' or 'catboost'"
        )

    # Resolve defaults here rather than in the signature (no shared mutable defaults).
    prophet_kwargs = params_prophet if params_prophet else best_params
    catboost_kwargs = (
        params_catboost if params_catboost is not None
        else {"n_estimators": 500, "verbose": 0}
    )

    saving = model_type == 'catboost' and save_model == 1
    if saving:
        # Make sure the output folder exists before the first save_model call.
        os.makedirs("models_m", exist_ok=True)

    results = {}

    for symbol in tqdm(df['symbol'].unique()):
        symbol_df = df[df['symbol'] == symbol]

        # Split into train, val and test windows.
        train = _slice_by_date(symbol_df, train_years)
        val = _slice_by_date(symbol_df, val_years)
        test = _slice_by_date(symbol_df, test_years)

        if len(train) == 0 or len(val) == 0 or len(test) == 0:
            print(f"Недостаточно данных для символа {symbol}")
            continue

        # Everything except the identifier, timestamp and target is a feature.
        feature_cols = [col for col in symbol_df.columns if col not in ['symbol', 'y', 'ds']]

        if model_type == 'prophet':
            model = Prophet(**prophet_kwargs)
            # Register every feature column as an extra regressor.
            for reg in feature_cols:
                model.add_regressor(reg)

            model.fit(train)

            train_pred = model.predict(train)['yhat']
            val_pred = model.predict(val)['yhat']
            test_pred = model.predict(test)['yhat']

        else:  # 'catboost'
            model = CatBoostRegressor(**catboost_kwargs)
            model.fit(train[feature_cols], train["y"])

            if saving:
                model.save_model(f"models_m/catboost_{symbol}.cbm")

            train_pred = model.predict(train[feature_cols])
            val_pred = model.predict(val[feature_cols])
            test_pred = model.predict(test[feature_cols])

        results[symbol] = {
            **_regression_metrics(train['y'], train_pred, 'train'),
            **_regression_metrics(val['y'], val_pred, 'val'),
            **_regression_metrics(test['y'], test_pred, 'test'),
        }

    return pd.DataFrame(results).T


In [17]:
# NOTE(review): in the visible notebook `result_pr` is only assigned in later
# cells, so this line relies on kernel state from an out-of-order run; on a
# fresh Restart & Run All it would raise NameError.
result_before_optuna = result_pr

In [50]:
# CatBoost hyperparameters passed as `params_catboost` in the runs below.
params = dict(
    iterations=341,
    learning_rate=0.013018333801783908,
    depth=4,
    l2_leaf_reg=0.38037483754702794,
    random_strength=0.834055227554984,
    bagging_temperature=0.18298898966937419,
    border_count=125,
    verbose=0,
)
params

{'iterations': 341,
 'learning_rate': 0.013018333801783908,
 'depth': 4,
 'l2_leaf_reg': 0.38037483754702794,
 'random_strength': 0.834055227554984,
 'bagging_temperature': 0.18298898966937419,
 'border_count': 125,
 'verbose': 0}

In [51]:
# Per-symbol CatBoost evaluation over every symbol in `df` (models are not saved).
result_pr = walk_forward_validation_by_symbol(
    df,
    model_type='catboost',
    params_catboost=params,
    save_model=0,
    train_years=('2015-01-01', '2020-12-31'),  # 6 years of training
    val_years=('2021-01-01', '2022-12-31'),    # 2 years of validation
    test_years=('2023-01-01', '2023-12-31'),   # 1 year of testing
)

 69%|██████▉   | 331/479 [01:15<00:23,  6.24it/s]

Недостаточно данных для символа OGN


100%|██████████| 479/479 [01:46<00:00,  4.50it/s]


In [53]:
# Symbols whose validation R2 is strictly positive.
predictable_symbols = result_pr.loc[result_pr["R2_val"].gt(0)].index.to_list()
len(predictable_symbols)

63

In [54]:
# Restrict the data to the selected symbols and refit, this time saving each model.
df_symbol = df.loc[df["symbol"].isin(predictable_symbols)]
result_pr = walk_forward_validation_by_symbol(
    df_symbol,
    model_type='catboost',
    params_catboost=params,
    save_model=1,
    train_years=('2015-01-01', '2020-12-31'),  # 6 years of training
    val_years=('2021-01-01', '2022-12-31'),    # 2 years of validation
    test_years=('2023-01-01', '2023-12-31'),   # 1 year of testing
)

100%|██████████| 63/63 [00:15<00:00,  3.94it/s]


In [30]:
# Mean validation/test R2 over the symbols with a positive test R2.
# Rows are selected on R2_test itself, so the R2_test mean is positive by construction.
result_before_optuna.loc[
    result_before_optuna["R2_test"] > 0, ["R2_val", "R2_test"]
].mean()

R2_val    -0.455738
R2_test    0.125933
dtype: float64

In [33]:
# Only WARNING and above from cmdstanpy are let through.
logging.getLogger(name='cmdstanpy').setLevel(level=logging.WARNING)

In [34]:
# Per-symbol Prophet evaluation on the same calendar windows as the CatBoost runs.
result_profet = walk_forward_validation_by_symbol(
    df,
    test_years=('2023-01-01', '2023-12-31'),   # 1 year of testing
    val_years=('2021-01-01', '2022-12-31'),    # 2 years of validation
    train_years=('2015-01-01', '2020-12-31'),  # 6 years of training
    model_type='prophet',
)

 69%|██████▊   | 329/479 [03:12<01:32,  1.62it/s]

Недостаточно данных для символа OGN


100%|██████████| 479/479 [04:41<00:00,  1.70it/s]


In [35]:
# Display the per-symbol Prophet metrics (one row per symbol).
result_profet

Unnamed: 0,MAE_train,RMSE_train,MAPE_train,R2_train,MAE_val,RMSE_val,MAPE_val,R2_val,MAE_test,RMSE_test,MAPE_test,R2_test
A,0.035513,0.042662,2.419599,0.596977,0.290409,0.335883,36.548879,-18.138155,0.599876,0.609753,21.898229,-40.441190
AAL,0.073564,0.092449,3.773578,0.416133,0.477926,0.507657,9.086900,-16.604870,0.943628,0.950333,17.011898,-75.102722
AAPL,0.051265,0.063934,2.190723,0.435320,0.407220,0.462642,7.992525,-29.058810,1.291838,1.334248,83.563216,-473.841798
ABBV,0.050144,0.061689,2.485574,0.466903,0.096402,0.124884,5.822791,-2.462041,0.122127,0.143465,5.371121,-5.484919
ABT,0.036785,0.048846,2.479695,0.369507,0.070666,0.088612,6.671813,-1.734714,0.056654,0.070102,2.580290,-0.206229
...,...,...,...,...,...,...,...,...,...,...,...,...
YUM,0.033816,0.045241,1.762635,0.553695,0.086543,0.104100,17.270115,-1.432636,0.058154,0.065872,1.618518,-0.750703
ZBH,0.041912,0.051398,3.001539,0.569494,0.412793,0.471819,47.130026,-52.409748,0.451137,0.464583,8.351410,-31.152353
ZBRA,0.065590,0.080498,1.590196,0.500474,0.177989,0.215570,18.607609,-2.035593,1.028948,1.092780,10.587503,-103.472049
ZION,0.050813,0.064273,2.358875,0.542807,0.551151,0.595817,43.946730,-65.899811,0.483511,0.514039,27.774250,-5.958299


In [37]:
# Symbols for which the Prophet model has a positive test R2.
result_profet.loc[result_profet["R2_test"].gt(0)]

Unnamed: 0,MAE_train,RMSE_train,MAPE_train,R2_train,MAE_val,RMSE_val,MAPE_val,R2_val,MAE_test,RMSE_test,MAPE_test,R2_test
BXP,0.044126,0.059647,1.579116,0.419969,0.090469,0.102883,1.978189,-0.850382,0.093548,0.121167,4.177563,0.101162
CMS,0.031606,0.039664,2.564669,0.237903,0.061291,0.070975,2.823049,-0.400139,0.037529,0.044378,1.200171,0.09156
ISRG,0.043676,0.052063,1.414443,0.421483,0.120018,0.153499,1.697861,-0.659576,0.075964,0.091476,0.925641,0.140218
MNST,0.042117,0.05372,1.676427,0.538914,0.093696,0.111307,2.517385,-2.138999,0.034232,0.040839,6.789218,0.211855
PAYX,0.028595,0.03679,2.295703,0.599423,0.085834,0.112339,2.458861,-1.35731,0.047386,0.054468,1.076389,0.161285
VFC,0.046048,0.061912,2.662053,0.404484,0.117825,0.13196,3.236853,-0.817635,0.085262,0.102109,4.899457,0.367558


In [None]:
## Scoring

In [55]:
import os

# Folder with the saved CatBoost models. A relative path is used instead of a
# machine-specific absolute one so it matches the 'models_m/...' paths used
# when the models are saved and loaded elsewhere in this notebook.
directory = 'models_m'
files = os.listdir(directory)

# Keep regular files only (sub-directories are excluded).
files_only = [f for f in files if os.path.isfile(os.path.join(directory, f))]

print(files_only)

['catboost_AMT.cbm', 'catboost_AOS.cbm', 'catboost_APTV.cbm', 'catboost_BBWI.cbm', 'catboost_BEN.cbm', 'catboost_BIIB.cbm', 'catboost_CAT.cbm', 'catboost_CCI.cbm', 'catboost_CHRW.cbm', 'catboost_CL.cbm', 'catboost_CMI.cbm', 'catboost_CPB.cbm', 'catboost_D.cbm', 'catboost_DG.cbm', 'catboost_DHI.cbm', 'catboost_DLR.cbm', 'catboost_DXC.cbm', 'catboost_DXCM.cbm', 'catboost_EQIX.cbm', 'catboost_ES.cbm', 'catboost_FFIV.cbm', 'catboost_FMC.cbm', 'catboost_HRL.cbm', 'catboost_HUM.cbm', 'catboost_IP.cbm', 'catboost_IQV.cbm', 'catboost_ISRG.cbm', 'catboost_JNJ.cbm', 'catboost_KDP.cbm', 'catboost_KEYS.cbm', 'catboost_KMB.cbm', 'catboost_LDOS.cbm', 'catboost_LEN.cbm', 'catboost_LIN.cbm', 'catboost_MDLZ.cbm', 'catboost_MHK.cbm', 'catboost_MTD.cbm', 'catboost_NEM.cbm', 'catboost_NFLX.cbm', 'catboost_NRG.cbm', 'catboost_NWS.cbm', 'catboost_NWSA.cbm', 'catboost_NXPI.cbm', 'catboost_ODFL.cbm', 'catboost_PANW.cbm', 'catboost_PG.cbm', 'catboost_PKG.cbm', 'catboost_PNR.cbm', 'catboost_PNW.cbm', 'catboost_

In [59]:
# All columns except the symbol identifier; `ds` and `y` are removed per group below.
features = df_symbol.columns.drop(['symbol']).tolist()

# Per-symbol prediction frames, concatenated once after the loop.
predictions = []

# Every row of df_symbol is scored (no date filter is applied here).
for symbol, group in df_symbol.groupby('symbol'):
    # Load the model saved for this symbol.
    model = CatBoostRegressor()
    model.load_model(f'models_m/catboost_{symbol}.cbm')

    # Feature matrix: drop the timestamp and the target.
    X = group[features].drop(columns=["ds", "y"])

    # Predict and attach the result as a new column on a fresh frame.
    y_pred = model.predict(X)
    group = group.assign(prediction=y_pred)

    predictions.append(group.loc[:, ['symbol', 'ds', 'prediction', 'y']])

# Stack all symbols into a single DataFrame.
predictions_df = pd.concat(predictions, axis=0)

In [61]:
# Write the scores to CSV without the DataFrame index.
predictions_df.to_csv('data/scores_for_top.csv', index=False)

In [62]:
# Reload the scores file written by the previous cell and display it.
pd.read_csv('data/scores_for_top.csv')

Unnamed: 0,symbol,ds,prediction,y
0,AMT,2015-06-01,0.012709,0.024345
1,AMT,2015-07-01,-0.016317,-0.030701
2,AMT,2015-08-01,-0.020526,-0.045667
3,AMT,2015-09-01,0.129743,0.167906
4,AMT,2015-10-01,-0.016903,-0.027878
...,...,...,...,...
7492,XEL,2024-12-01,0.001963,-0.004739
7493,XEL,2025-01-01,-0.006069,0.081806
7494,XEL,2025-02-01,-0.020242,-0.018169
7495,XEL,2025-03-01,-0.010216,0.007088
