### Я взял код с пары.



Метрики оставил те же, а именно:

MAE (Mean Absolute Error) — средняя абсолютная ошибка модели, вычисляемая как среднее разниц между фактическими и предсказанными значениями.

MAE_baseline — MAE базовой модели, которая всегда предсказывает среднее значение целевой переменной на обучающих данных.

MAE_shift — MAE наивной модели, использующей правило "следующее значение равно предыдущему" (прогноз последнего известного значения).

same_direction_ratio — доля случаев, когда модель правильно предсказывает направление изменения (рост/падение) по сравнению с предыдущим значением


### Самые лучшие метрики дал KFold с RandomForest
'mae': 1.1070635220125786 с 'mae_baseline': 1.1080811676753288


In [19]:
from datetime import datetime

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor


In [20]:
df = pd.read_csv('./data/processed/mart.csv')

Возьмем линейную регрессию

In [21]:
df_1 = df.dropna()

# Определение целевой переменной и признаков
df_1 = df_1.drop(columns=['Unnamed: 0'])
target = 'raw_mix.lab.measure.sito_009'
features = df_1.columns[df_1.columns != target]

# Разделение данных
X = df_1[features]
y = df_1[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Скалирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Построение модели
model = LinearRegression()

model.fit(X_train_scaled, y_train)

# Оценка модели
y_pred = model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mae_baseline = mean_absolute_error(y_test, np.full_like(y_test, y_test.mean()))
mae_baseline_shift = mean_absolute_error(y_test[:-1], y_test[1:])

# Вычисление разностей для реальных и предсказанных значений
y_test_diff = y_test[1:].values - y_test[:-1].values
y_pred_diff = y_pred[1:] - y_pred[:-1]
# Подсчет доли сонаправленных изменений
same_direction = np.sum((y_test_diff * y_pred_diff) > 0) / len(y_test_diff)

print({
    # 'features': list(features),
    'metrics': {
        'mae': float(mae),
        'mae_baseline': float(mae_baseline),
        'mae_baseline_shift': float(mae_baseline_shift),
        'same_direction_ratio': float(same_direction),
        'mse': float(mse)
    }
})

{'metrics': {'mae': 1.0623615668067694, 'mae_baseline': 0.9656721536351165, 'mae_baseline_shift': 1.1934579439252333, 'same_direction_ratio': 0.4766355140186916, 'mse': 1.926608927658215}}


Теперь возьмем TimeSeriesSplit с различными моделями

In [22]:
def evaluate_models(df, target_col, models_dict):
    df_clean = df.dropna()
    features = df_clean.columns[df_clean.columns != target_col].tolist()

    tscv = TimeSeriesSplit(n_splits=10)
    results = {}

    for model_name, model in models_dict.items():
        mae_scores, mse_scores, direction_ratios = [], [], []

        for train_index, test_index in tscv.split(df_clean):
            X_train, X_test = df_clean[features].iloc[train_index], df_clean[features].iloc[test_index]
            y_train, y_test = df_clean[target_col].iloc[train_index], df_clean[target_col].iloc[test_index]

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)

            mae_scores.append(mean_absolute_error(y_test, y_pred))
            mse_scores.append(mean_squared_error(y_test, y_pred))

            # роверка направления изменений
            if len(y_test) > 1 and len(y_pred) > 1:
                y_test_diff = y_test[1:].values - y_test[:-1].values
                y_pred_diff = y_pred[1:] - y_pred[:-1]
                direction_ratios.append(np.mean((y_test_diff * y_pred_diff) > 0))

        avg_mae = np.mean(mae_scores)
        avg_mse = np.mean(mse_scores)
        avg_direction = np.mean(direction_ratios) if direction_ratios else 0.0


        last_baseline = mean_absolute_error(y_test, np.full_like(y_test, y_test.mean()))
        last_baseline_shift = mean_absolute_error(y_test[:-1], y_test[1:]) if len(y_test) > 1 else 0.0

        results[model_name] = {
            'metrics': {
                'mae': float(avg_mae),
                'mse': float(avg_mse),
                'same_direction_ratio': float(avg_direction),
                'mae_baseline': float(last_baseline),
                'mae_baseline_shift': float(last_baseline_shift)
            }
        }

    return results

In [23]:
df_2 = df.dropna()
target = 'raw_mix.lab.measure.sito_009'
features = df_2.columns[df_2.columns != target].tolist()

models_to_test = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42)
}

results_2 = evaluate_models(df, target_col="raw_mix.lab.measure.sito_009", models_dict=models_to_test)
print(results_2)

{'LinearRegression': {'metrics': {'mae': 1.3857594171483325, 'mse': 3.201154344739918, 'same_direction_ratio': 0.4770833333333334, 'mae_baseline': 0.9729279466888794, 'mae_baseline_shift': 1.1208333333333331}}, 'RandomForest': {'metrics': {'mae': 1.2131122448979592, 'mse': 2.387568679591836, 'same_direction_ratio': 0.5229166666666667, 'mae_baseline': 0.9729279466888794, 'mae_baseline_shift': 1.1208333333333331}}}


Теперь возьмем KFold с различными моделями

In [26]:
def evaluate_models_kfold(df, target_col, models_dict, n_splits=10, shuffle=True, random_state=42):
    df_clean = df.dropna()
    features = df_clean.columns[df_clean.columns != target_col].tolist()

    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    results = {}

    for model_name, model in models_dict.items():
        mae_scores, mse_scores, direction_ratios = [], [], []

        for train_index, test_index in kf.split(df_clean):
            X_train, X_test = df_clean[features].iloc[train_index], df_clean[features].iloc[test_index]
            y_train, y_test = df_clean[target_col].iloc[train_index], df_clean[target_col].iloc[test_index]

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)

            mae_scores.append(mean_absolute_error(y_test, y_pred))
            mse_scores.append(mean_squared_error(y_test, y_pred))

            # роверка направления изменений
            if len(y_test) > 1 and len(y_pred) > 1:
                y_test_diff = y_test[1:].values - y_test[:-1].values
                y_pred_diff = y_pred[1:] - y_pred[:-1]
                direction_ratios.append(np.mean((y_test_diff * y_pred_diff) > 0))

        avg_mae = np.mean(mae_scores)
        avg_mse = np.mean(mse_scores)
        avg_direction = np.mean(direction_ratios) if direction_ratios else 0.0

        last_baseline = mean_absolute_error(y_test, np.full_like(y_test, y_test.mean()))
        last_baseline_shift = mean_absolute_error(y_test[:-1], y_test[1:]) if len(y_test) > 1 else 0.0

        results[model_name] = {
            'metrics': {
                'mae': float(avg_mae),
                'mse': float(avg_mse),
                'same_direction_ratio': float(avg_direction),
                'mae_baseline': float(last_baseline),
                'mae_baseline_shift': float(last_baseline_shift)
            },
            'timestamp': datetime.now().strftime('%Y%m%d_%H%M%S')
        }
    return results


In [29]:
df_3 = df.dropna()

models_to_test = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42)
}

# Запуск оценки с KFold
results_3 = evaluate_models_kfold(
    df_3,
    target_col="raw_mix.lab.measure.sito_009",
    models_dict=models_to_test,
    n_splits=10,
    shuffle=True,
    random_state=42
)
print(results_3)

{'LinearRegression': {'metrics': {'mae': 1.1754031762502613, 'mse': 2.09444400638355, 'same_direction_ratio': 0.572677793904209, 'mae_baseline': 1.1080811676753288, 'mae_baseline_shift': 1.4288461538461539}, 'timestamp': '20250510_103153'}, 'RandomForest': {'metrics': {'mae': 1.1070635220125786, 'mse': 1.905746439552761, 'same_direction_ratio': 0.5654208998548621, 'mae_baseline': 1.1080811676753288, 'mae_baseline_shift': 1.4288461538461539}, 'timestamp': '20250510_103158'}}
