# Подключение зависимостей

In [262]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import ccxt
from IPython.display import clear_output, display
import dill

#sns.set_theme()
%matplotlib inline

In [3]:
import mlflow
from mlflow.tracking import MlflowClient
import mlflavors
import mlforecast.flavor

from statsforecast import StatsForecast
from statsforecast.models import ARCH, GARCH
from mlforecast import MLForecast
from mlforecast.utils import PredictionIntervals
from datasetsforecast import losses
from utilsforecast.plotting import plot_series

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import lightgbm as lgb

# Подключение к MLFlow

In [4]:
mlflow.set_tracking_uri("http://mlflow:5000")

In [5]:
mlflow.set_experiment("btc-usdt_volatility_experiment")

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1727343300546, experiment_id='2', last_update_time=1727343300546, lifecycle_stage='active', name='btc-usdt_volatility_experiment', tags={}>

# Проведение экспериментов 

## Подготовка данных

In [211]:
# Биржа из которой будут браться данные с помощью CCXT
EXCHANGE = ccxt.okx()
# Инструмент в формате символа для обработки
SYMBOL = "BTC/USDT"
# Таймфрейм свеч
TIMEFRAME = "1h"

In [212]:
from_ts = EXCHANGE.parse8601('2024-01-10 00:00:00')

ohlcv_list = []
ohlcv = EXCHANGE.fetch_ohlcv(symbol=SYMBOL, timeframe=TIMEFRAME, since=from_ts, limit=100)
ohlcv_list.append(ohlcv)

while True:
    from_ts = ohlcv[-1][0]
    new_ohlcv = EXCHANGE.fetch_ohlcv(symbol=SYMBOL, timeframe=TIMEFRAME, since=from_ts, limit=100)
    ohlcv.extend(new_ohlcv)

    print(f"\r{EXCHANGE.iso8601(from_ts)}", end="")
    
    if len(new_ohlcv) <= 1:
    	break

2024-09-27T11:00:00.000Z

In [213]:
ohlcv = pd.DataFrame(ohlcv, columns=["date", "open", "high", "low", "close", "volume"])
ohlcv["date"] = ohlcv["date"].apply(lambda x: datetime.datetime.fromtimestamp(x / 1000))
ohlcv["close_pct_change"] = ohlcv["close"].pct_change()
ohlcv['close_pct_change'] = ohlcv['close_pct_change'].fillna(0)
ohlcv.reset_index(drop=True)

Unnamed: 0,date,open,high,low,close,volume,close_pct_change
0,2024-01-10 00:00:00,46109.4,46235.5,45768.3,45854.1,1198.592272,0.000000
1,2024-01-10 01:00:00,45853.9,45955.6,45617.5,45925.8,1463.003867,0.001564
2,2024-01-10 02:00:00,45925.8,46083.0,45878.2,45970.8,640.245350,0.000980
3,2024-01-10 03:00:00,45978.0,46209.9,45923.3,46119.9,359.281616,0.003243
4,2024-01-10 04:00:00,46119.9,46180.9,45941.2,45961.9,380.550457,-0.003426
...,...,...,...,...,...,...,...
6335,2024-09-27 08:00:00,65464.7,65798.9,65417.0,65731.9,340.863833,0.004082
6336,2024-09-27 09:00:00,65732.0,65997.2,65662.8,65746.8,474.897545,0.000227
6337,2024-09-27 10:00:00,65746.3,65781.1,65488.0,65488.0,249.032124,-0.003936
6338,2024-09-27 11:00:00,65488.1,65616.3,65342.0,65389.5,142.224558,-0.001504


In [214]:
ohlcv

Unnamed: 0,date,open,high,low,close,volume,close_pct_change
0,2024-01-10 00:00:00,46109.4,46235.5,45768.3,45854.1,1198.592272,0.000000
1,2024-01-10 01:00:00,45853.9,45955.6,45617.5,45925.8,1463.003867,0.001564
2,2024-01-10 02:00:00,45925.8,46083.0,45878.2,45970.8,640.245350,0.000980
3,2024-01-10 03:00:00,45978.0,46209.9,45923.3,46119.9,359.281616,0.003243
4,2024-01-10 04:00:00,46119.9,46180.9,45941.2,45961.9,380.550457,-0.003426
...,...,...,...,...,...,...,...
6335,2024-09-27 08:00:00,65464.7,65798.9,65417.0,65731.9,340.863833,0.004082
6336,2024-09-27 09:00:00,65732.0,65997.2,65662.8,65746.8,474.897545,0.000227
6337,2024-09-27 10:00:00,65746.3,65781.1,65488.0,65488.0,249.032124,-0.003936
6338,2024-09-27 11:00:00,65488.1,65616.3,65342.0,65389.5,142.224558,-0.001504


In [215]:
ohlcv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6340 entries, 0 to 6339
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              6340 non-null   datetime64[ns]
 1   open              6340 non-null   float64       
 2   high              6340 non-null   float64       
 3   low               6340 non-null   float64       
 4   close             6340 non-null   float64       
 5   volume            6340 non-null   float64       
 6   close_pct_change  6340 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 346.8 KB


In [216]:
ohlcv.shape

(6340, 7)

In [217]:
ohlcv.nunique()

date                6276
open                6132
high                6023
low                 5997
close               6120
volume              6277
close_pct_change    6275
dtype: int64

In [221]:
ohlcv["date"].astype(int)

0       1704844800000000000
1       1704848400000000000
2       1704852000000000000
3       1704855600000000000
4       1704859200000000000
               ...         
6335    1727424000000000000
6336    1727427600000000000
6337    1727431200000000000
6338    1727434800000000000
6339    1727434800000000000
Name: date, Length: 6340, dtype: int64

В данных могут появится дубликаты, от них надо избавлятся

In [224]:
ohlcv = ohlcv.drop_duplicates(subset=['date'], keep='last')

In [225]:
ohlcv

Unnamed: 0,date,open,high,low,close,volume,close_pct_change
0,2024-01-10 00:00:00,46109.4,46235.5,45768.3,45854.1,1198.592272,0.000000
1,2024-01-10 01:00:00,45853.9,45955.6,45617.5,45925.8,1463.003867,0.001564
2,2024-01-10 02:00:00,45925.8,46083.0,45878.2,45970.8,640.245350,0.000980
3,2024-01-10 03:00:00,45978.0,46209.9,45923.3,46119.9,359.281616,0.003243
4,2024-01-10 04:00:00,46119.9,46180.9,45941.2,45961.9,380.550457,-0.003426
...,...,...,...,...,...,...,...
6334,2024-09-27 07:00:00,65344.0,65546.5,65260.0,65464.7,592.119167,0.001849
6335,2024-09-27 08:00:00,65464.7,65798.9,65417.0,65731.9,340.863833,0.004082
6336,2024-09-27 09:00:00,65732.0,65997.2,65662.8,65746.8,474.897545,0.000227
6337,2024-09-27 10:00:00,65746.3,65781.1,65488.0,65488.0,249.032124,-0.003936


In [226]:
train_df = pd.DataFrame(
    columns = ["ds", "y", "unique_id"]   
)

train_df["ds"] = ohlcv["date"].iloc[-324:-24]
train_df["y"] = ohlcv["close_pct_change"].iloc[-324:-24]
train_df["unique_id"] = 1
train_df = train_df.reset_index(drop=True)

In [227]:
train_df

Unnamed: 0,ds,y,unique_id
0,2024-09-14 00:00:00,-0.001550,1
1,2024-09-14 01:00:00,-0.000061,1
2,2024-09-14 02:00:00,-0.001869,1
3,2024-09-14 03:00:00,0.000650,1
4,2024-09-14 04:00:00,-0.001765,1
...,...,...,...
295,2024-09-26 07:00:00,0.002040,1
296,2024-09-26 08:00:00,-0.002106,1
297,2024-09-26 09:00:00,0.001256,1
298,2024-09-26 10:00:00,0.009356,1


In [228]:
test_df = pd.DataFrame(
    columns = ["ds", "y", "unique_id"]   
)

test_df["ds"] = ohlcv["date"].iloc[-24:]
test_df["y"] = ohlcv["close_pct_change"].iloc[-24:]
test_df["unique_id"] = 1
test_df = test_df.reset_index(drop=True)

In [229]:
test_df

Unnamed: 0,ds,y,unique_id
0,2024-09-26 12:00:00,0.001298,1
1,2024-09-26 13:00:00,-0.000771,1
2,2024-09-26 14:00:00,-4.5e-05,1
3,2024-09-26 15:00:00,0.01244,1
4,2024-09-26 16:00:00,0.001646,1
5,2024-09-26 17:00:00,-0.003295,1
6,2024-09-26 18:00:00,0.002441,1
7,2024-09-26 19:00:00,-0.008975,1
8,2024-09-26 20:00:00,-0.000745,1
9,2024-09-26 21:00:00,0.006093,1


## MLFlow run

### ARCH

In [251]:
def make_arch_run(**kwargs):    
    with mlflow.start_run(run_name=f'ARCH_{str(datetime.datetime.now())}') as run:
        # Лучшие параметры полученные в исследовании
        params = {
            "p": kwargs["p"], 
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "ARCH")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = StatsForecast(
            models=[ARCH(**params)],
            freq='h',
            n_jobs=-1
        )

        # Обучение моедли
        model.fit(train_df)

        # Прогнозирование для test датасета
        forecasts = model.forecast(48, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values[:24])

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlflavors.statsforecast.log_model(
            statsforecast_model=model,
            artifact_path="arch",
            serialization_format="pickle",
        )
        # Сохранение визуализации
        fig = plot_series(
            pd.concat([train_df, test_df]), 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [252]:
make_arch_run(
    p=84
)

2024/09/27 12:18:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run ARCH_2024-09-27 12:18:38.329973 at: http://mlflow:5000/#/experiments/2/runs/24c473a9673c4ea9a684532b43e39273.
2024/09/27 12:18:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### GARCH

In [253]:
def make_garch_run(**kwargs):    
    with mlflow.start_run(run_name=f'GARCH_{str(datetime.datetime.now())}') as run:
        # Лучшие параметры полученные в исследовании
        params = {
            "p": kwargs["p"], 
            "q": kwargs["q"]
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "GARCH")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = StatsForecast(
            models=[GARCH(**params)],
            freq='h',
            n_jobs=-1
        )

        # Обучение моедли
        model.fit(train_df)

        # Прогнозирование для test датасета
        forecasts = model.forecast(48, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values[:24])

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlflavors.statsforecast.log_model(
            statsforecast_model=model,
            artifact_path="garch",
            serialization_format="pickle",
        )
        # Сохранение визуализации
        fig = plot_series(
            pd.concat([train_df, test_df]), 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [254]:
make_garch_run(
    p=92, 
    q=24
)

2024/09/27 12:18:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run GARCH_2024-09-27 12:18:41.594395 at: http://mlflow:5000/#/experiments/2/runs/a1fd9c07c3a94aada08d3f4ab4ac2e4b.
2024/09/27 12:18:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### SVR

In [255]:
def make_svr_run(**kwargs):    
    with mlflow.start_run(run_name=f'SVR_{str(datetime.datetime.now())}') as run:
        # Лучшие параметры полученные в исследовании
        params = {
            "kernel": kwargs["kernel"], 
            "degree": kwargs["degree"],
            "C": kwargs["C"]
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "SVR")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = MLForecast(
            models=[SVR(**params)],
            freq='h',  
            lags=list(range(1, 24, 1)),
        )
        
        # Обучение моедли
        model.fit(train_df, prediction_intervals=PredictionIntervals(n_windows=2, h=48))

        # Прогнозирование для test датасета
        #forecasts = model.predict(24, new_df=test_df)
        forecasts = model.predict(48, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values[:24])

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlforecast.flavor.log_model(
            model=model,
            artifact_path="scr",
        )
        # Сохранение визуализации
        fig = plot_series(
            pd.concat([train_df, test_df]), 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [256]:
make_svr_run(
    kernel="poly", 
    degree=3,
    C=0.9702637495163653
)

2024/09/27 12:18:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVR_2024-09-27 12:18:44.879510 at: http://mlflow:5000/#/experiments/2/runs/4172747f469b4baea2e56ebb92f59201.
2024/09/27 12:18:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### LGBMRegressor

In [257]:
def make_lightgbm_run(**kwargs):    
    with mlflow.start_run(run_name=f'LightGBM_{str(datetime.datetime.now())}') as run:
        # Лучшие параметры полученные в исследовании
        params = {
            "n_estimators": kwargs["n_estimators"], 
            "boosting_type": kwargs["boosting_type"],
            "num_leaves": kwargs["num_leaves"],
            "max_depth": kwargs["max_depth"],
            "learning_rate": kwargs["learning_rate"],
            "verbose": -1
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "LightGBM")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = MLForecast(
            models=[lgb.LGBMRegressor(**params)],
            freq='h',  
            lags=list(range(1, 17, 1)),
        )

        # Обучение моедли
        model.fit(train_df, prediction_intervals=PredictionIntervals(n_windows=2, h=48))

        # Прогнозирование для test датасета
        #forecasts = model.predict(24, new_df=test_df)
        forecasts = model.predict(48, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values[:24])

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlforecast.flavor.log_model(
            model=model,
            artifact_path="lightgbm",
        )
        # Сохранение визуализации
        fig = plot_series(
            pd.concat([train_df, test_df]), 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [258]:
make_lightgbm_run(
    n_estimators=382, 
    boosting_type="dart", 
    num_leaves=93, 
    max_depth=7, 
    learning_rate=0.04001572844964948
)

2024/09/27 12:18:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run LightGBM_2024-09-27 12:18:50.295472 at: http://mlflow:5000/#/experiments/2/runs/175de804e21e4f97b573b1b068269eae.
2024/09/27 12:18:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### KNN

In [259]:
def make_knn_run(**kwargs):    
    with mlflow.start_run(run_name=f'KNN_{str(datetime.datetime.now())}') as run:
        # Лучшие параметры полученные в исследовании
        params = {
            "n_neighbors": kwargs["n_neighbors"], 
            "weights": kwargs["weights"],
            "leaf_size": kwargs["leaf_size"]
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "KNN")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = MLForecast(
            models=[KNeighborsRegressor(**params)],
            freq='h',  
            lags=list(range(1, 24, 1)),
        )

        # Обучение моедли
        model.fit(train_df, prediction_intervals=PredictionIntervals(n_windows=2, h=48))

        # Прогнозирование для test датасета
        #forecasts = model.predict(24, new_df=test_df)
        forecasts = model.predict(48, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values[:24])
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values[:24])

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlforecast.flavor.log_model(
            model=model,
            artifact_path="knn",
        )
        # Сохранение визуализации
        fig = plot_series(
            pd.concat([train_df, test_df]), 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [260]:
make_knn_run(
    n_neighbors=27, 
    weights='uniform', 
    leaf_size=34
)

2024/09/27 12:19:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN_2024-09-27 12:18:55.937902 at: http://mlflow:5000/#/experiments/2/runs/3ad148774ff7465593a535d03866ae24.
2024/09/27 12:19:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


# P.s.

MLFlow load model не работает