# Подключение зависимостей

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import ccxt
from IPython.display import clear_output, display

#sns.set_theme()
%matplotlib inline

In [170]:
import mlflow
from mlflow.tracking import MlflowClient
import mlflavors
import mlforecast.flavor

from statsforecast import StatsForecast
from statsforecast.models import ARCH, GARCH
from mlforecast import MLForecast
from mlforecast.utils import PredictionIntervals
from datasetsforecast import losses
from utilsforecast.plotting import plot_series

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import lightgbm as lgb

# Подключение к MLFlow

In [5]:
mlflow.set_tracking_uri("http://mlflow:5000")

In [6]:
mlflow.set_experiment("btc-usdt_volatility_experiment")

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1727343300546, experiment_id='2', last_update_time=1727343300546, lifecycle_stage='active', name='btc-usdt_volatility_experiment', tags={}>

# Проведение экспериментов 

## Подготовка данных

In [7]:
# Биржа из которой будут браться данные с помощью CCXT
EXCHANGE = ccxt.okx()
# Инструмент в формате символа для обработки
SYMBOL = "BTC/USDT"
# Таймфрейм свеч
TIMEFRAME = "1h"

In [8]:
from_ts = EXCHANGE.parse8601('2024-01-10 00:00:00')

ohlcv_list = []
ohlcv = EXCHANGE.fetch_ohlcv(symbol=SYMBOL, timeframe=TIMEFRAME, since=from_ts, limit=100)
ohlcv_list.append(ohlcv)

while True:
    from_ts = ohlcv[-1][0]
    new_ohlcv = EXCHANGE.fetch_ohlcv(symbol=SYMBOL, timeframe=TIMEFRAME, since=from_ts, limit=100)
    ohlcv.extend(new_ohlcv)

    print(f"\r{EXCHANGE.iso8601(from_ts)}", end="")
    
    if len(new_ohlcv) <= 1:
    	break

2024-09-26T10:00:00.000Z

In [9]:
ohlcv = pd.DataFrame(ohlcv, columns=["date", "open", "high", "low", "close", "volume"])
ohlcv["date"] = ohlcv["date"].apply(lambda x: datetime.datetime.fromtimestamp(x / 1000))
ohlcv["close_pct_change"] = ohlcv["close"].pct_change()
ohlcv['close_pct_change'] = ohlcv['close_pct_change'].fillna(0)
ohlcv.reset_index(drop=True)

Unnamed: 0,date,open,high,low,close,volume,close_pct_change
0,2024-01-10 00:00:00,46109.4,46235.5,45768.3,45854.1,1198.592272,0.000000
1,2024-01-10 01:00:00,45853.9,45955.6,45617.5,45925.8,1463.003867,0.001564
2,2024-01-10 02:00:00,45925.8,46083.0,45878.2,45970.8,640.245350,0.000980
3,2024-01-10 03:00:00,45978.0,46209.9,45923.3,46119.9,359.281616,0.003243
4,2024-01-10 04:00:00,46119.9,46180.9,45941.2,45961.9,380.550457,-0.003426
...,...,...,...,...,...,...,...
6310,2024-09-26 07:00:00,63722.4,63874.0,63664.0,63852.4,278.732325,0.002040
6311,2024-09-26 08:00:00,63852.5,63892.1,63699.6,63717.9,270.062767,-0.002106
6312,2024-09-26 09:00:00,63714.5,63970.0,63590.0,63797.9,352.797999,0.001256
6313,2024-09-26 10:00:00,63798.0,63816.0,63762.0,63780.0,15.999053,-0.000281


In [10]:
ohlcv

Unnamed: 0,date,open,high,low,close,volume,close_pct_change
0,2024-01-10 00:00:00,46109.4,46235.5,45768.3,45854.1,1198.592272,0.000000
1,2024-01-10 01:00:00,45853.9,45955.6,45617.5,45925.8,1463.003867,0.001564
2,2024-01-10 02:00:00,45925.8,46083.0,45878.2,45970.8,640.245350,0.000980
3,2024-01-10 03:00:00,45978.0,46209.9,45923.3,46119.9,359.281616,0.003243
4,2024-01-10 04:00:00,46119.9,46180.9,45941.2,45961.9,380.550457,-0.003426
...,...,...,...,...,...,...,...
6310,2024-09-26 07:00:00,63722.4,63874.0,63664.0,63852.4,278.732325,0.002040
6311,2024-09-26 08:00:00,63852.5,63892.1,63699.6,63717.9,270.062767,-0.002106
6312,2024-09-26 09:00:00,63714.5,63970.0,63590.0,63797.9,352.797999,0.001256
6313,2024-09-26 10:00:00,63798.0,63816.0,63762.0,63780.0,15.999053,-0.000281


In [11]:
ohlcv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6315 entries, 0 to 6314
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              6315 non-null   datetime64[ns]
 1   open              6315 non-null   float64       
 2   high              6315 non-null   float64       
 3   low               6315 non-null   float64       
 4   close             6315 non-null   float64       
 5   volume            6315 non-null   float64       
 6   close_pct_change  6315 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 345.5 KB


In [12]:
ohlcv = ohlcv.iloc[-380:]

In [13]:
ohlcv

Unnamed: 0,date,open,high,low,close,volume,close_pct_change
5935,2024-09-10 20:00:00,57952.1,57980.6,57549.9,57570.0,167.548277,-0.006587
5936,2024-09-10 21:00:00,57570.0,57757.9,57460.2,57516.0,87.054641,-0.000938
5937,2024-09-10 22:00:00,57518.1,57691.6,57450.0,57613.5,140.808569,0.001695
5938,2024-09-10 23:00:00,57612.0,57870.9,57582.3,57639.7,108.268349,0.000455
5939,2024-09-11 00:00:00,57639.8,57688.8,57439.2,57633.9,212.541954,-0.000101
...,...,...,...,...,...,...,...
6310,2024-09-26 07:00:00,63722.4,63874.0,63664.0,63852.4,278.732325,0.002040
6311,2024-09-26 08:00:00,63852.5,63892.1,63699.6,63717.9,270.062767,-0.002106
6312,2024-09-26 09:00:00,63714.5,63970.0,63590.0,63797.9,352.797999,0.001256
6313,2024-09-26 10:00:00,63798.0,63816.0,63762.0,63780.0,15.999053,-0.000281


In [14]:
train_df = pd.DataFrame(
    columns = ["ds", "y", "unique_id"]   
)

train_df["ds"] = ohlcv["date"].iloc[-324:-24]
train_df["y"] = ohlcv["close_pct_change"].iloc[-324:-24]
train_df["unique_id"] = 1

In [15]:
train_df

Unnamed: 0,ds,y,unique_id
5991,2024-09-13 04:00:00,-0.000118,1
5992,2024-09-13 05:00:00,0.001727,1
5993,2024-09-13 06:00:00,-0.003089,1
5994,2024-09-13 07:00:00,0.006089,1
5995,2024-09-13 08:00:00,-0.002686,1
...,...,...,...
6286,2024-09-25 08:00:00,-0.000406,1
6287,2024-09-25 09:00:00,-0.003306,1
6288,2024-09-25 10:00:00,0.000344,1
6289,2024-09-25 11:00:00,0.003174,1


In [16]:
test_df = pd.DataFrame(
    columns = ["ds", "y", "unique_id"]   
)

test_df["ds"] = ohlcv["date"].iloc[-24:]
test_df["y"] = ohlcv["close_pct_change"].iloc[-24:]
test_df["unique_id"] = 1

In [17]:
test_df

Unnamed: 0,ds,y,unique_id
6291,2024-09-25 13:00:00,0.002547,1
6292,2024-09-25 14:00:00,0.002823,1
6293,2024-09-25 15:00:00,-0.006866,1
6294,2024-09-25 16:00:00,-0.000584,1
6295,2024-09-25 17:00:00,-0.004637,1
6296,2024-09-25 18:00:00,0.002922,1
6297,2024-09-25 19:00:00,-0.002597,1
6298,2024-09-25 20:00:00,0.005085,1
6299,2024-09-25 21:00:00,-0.001088,1
6300,2024-09-25 21:00:00,0.0,1


## MLFlow run

### ARCH

In [123]:
def make_arch_run(**kwargs):    
    with mlflow.start_run(run_name=f'ARCH_{str(datetime.datetime.now())}') as run:
        # Создание, обучение и валидация catboost модели
        params = {
            "p": kwargs["p"], 
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "ARCH")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = StatsForecast(
            models=[ARCH(**params)],
            freq='h',
            n_jobs=-1
        )

        # Обучение моедли
        model.fit(train_df)

        # Прогнозирование для test датасета
        forecasts = model.forecast(24, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values)
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values)

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlflavors.statsforecast.log_model(
            statsforecast_model=model,
            artifact_path="arch",
            serialization_format="pickle",
        )
        # Сохранение визуализации
        fig = plot_series(
            train_df, 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [124]:
make_arch_run(
    p=84
)

2024/09/26 18:04:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run ARCH_2024-09-26 18:04:25.524553 at: http://mlflow:5000/#/experiments/2/runs/95144dc0c08a40409efb2da452d71a74.
2024/09/26 18:04:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### GARCH

In [125]:
def make_garch_run(**kwargs):    
    with mlflow.start_run(run_name=f'GARCH_{str(datetime.datetime.now())}') as run:
        # Создание, обучение и валидация catboost модели
        params = {
            "p": kwargs["p"], 
            "q": kwargs["q"]
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "GARCH")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = StatsForecast(
            models=[GARCH(**params)],
            freq='h',
            n_jobs=-1
        )

        # Обучение моедли
        model.fit(train_df)

        # Прогнозирование для test датасета
        forecasts = model.forecast(24, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values)
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values)

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlflavors.statsforecast.log_model(
            statsforecast_model=model,
            artifact_path="garch",
            serialization_format="pickle",
        )
        # Сохранение визуализации
        fig = plot_series(
            train_df, 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [126]:
make_garch_run(
    p=92, 
    q=24
)

2024/09/26 18:04:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run GARCH_2024-09-26 18:04:28.733052 at: http://mlflow:5000/#/experiments/2/runs/f20bd47bca164760ba824be1f88fc35e.
2024/09/26 18:04:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### SVR

In [176]:
def make_svr_run(**kwargs):    
    with mlflow.start_run(run_name=f'SVR_{str(datetime.datetime.now())}') as run:
        # Создание, обучение и валидация catboost модели
        params = {
            "kernel": kwargs["kernel"], 
            "degree": kwargs["degree"],
            "C": kwargs["C"]
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "SVR")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = MLForecast(
            models=[SVR(**params)],
            freq='h',  
            lags=list(range(1, 24, 1)),
        )
        
        # Обучение моедли
        model.fit(train_df, prediction_intervals=PredictionIntervals(n_windows=2, h=24))

        # Прогнозирование для test датасета
        #forecasts = model.predict(24, new_df=test_df)
        forecasts = model.predict(24, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values)
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values)

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlforecast.flavor.log_model(
            model=model,
            artifact_path="scr",
        )
        # Сохранение визуализации
        fig = plot_series(
            train_df, 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [177]:
make_svr_run(
    kernel="poly", 
    degree=3,
    C=0.9702637495163653
)

2024/09/26 18:26:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVR_2024-09-26 18:26:12.499160 at: http://mlflow:5000/#/experiments/2/runs/5c127442c4714c769cad02bfcd34f597.
2024/09/26 18:26:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### LGBMRegressor

In [178]:
def make_lightgbm_run(**kwargs):    
    with mlflow.start_run(run_name=f'LightGBM_{str(datetime.datetime.now())}') as run:
        # Создание, обучение и валидация catboost модели
        params = {
            "n_estimators": kwargs["n_estimators"], 
            "boosting_type": kwargs["boosting_type"],
            "num_leaves": kwargs["num_leaves"],
            "max_depth": kwargs["max_depth"],
            "learning_rate": kwargs["learning_rate"],
            "verbose": -1
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "LightGBM")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = MLForecast(
            models=[lgb.LGBMRegressor(**params)],
            freq='h',  
            lags=list(range(1, 17, 1)),
        )

        # Обучение моедли
        model.fit(train_df, prediction_intervals=PredictionIntervals(n_windows=2, h=24))

        # Прогнозирование для test датасета
        #forecasts = model.predict(24, new_df=test_df)
        forecasts = model.predict(24, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values)
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values)

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlforecast.flavor.log_model(
            model=model,
            artifact_path="lightgbm",
        )
        # Сохранение визуализации
        fig = plot_series(
            train_df, 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [179]:
make_lightgbm_run(
    n_estimators=382, 
    boosting_type="dart", 
    num_leaves=93, 
    max_depth=7, 
    learning_rate=0.04001572844964948
)

2024/09/26 18:26:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run LightGBM_2024-09-26 18:26:24.427914 at: http://mlflow:5000/#/experiments/2/runs/b576f22495ea44869c9ef09a918ffddd.
2024/09/26 18:26:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


### KNN

In [180]:
def make_knn_run(**kwargs):    
    with mlflow.start_run(run_name=f'KNN_{str(datetime.datetime.now())}') as run:
        # Создание, обучение и валидация catboost модели
        params = {
            "n_neighbors": kwargs["n_neighbors"], 
            "weights": kwargs["weights"],
            "leaf_size": kwargs["leaf_size"]
        }

        # Сохранение тегов
        mlflow.set_tag("model_name", "KNN")
        mlflow.set_tag("model_type", "regression")
        # Сохранение параметров
        mlflow.log_params(params)

        # Создание модели
        model = MLForecast(
            models=[KNeighborsRegressor(**params)],
            freq='h',  
            lags=list(range(1, 24, 1)),
        )

        # Обучение моедли
        model.fit(train_df, prediction_intervals=PredictionIntervals(n_windows=2, h=24))

        # Прогнозирование для test датасета
        #forecasts = model.predict(24, new_df=test_df)
        forecasts = model.predict(24, level=[95, 90])
        forecasts["unique_id"] = 1

        # Рассчёт метрик
        rmse = losses.rmse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mse = losses.mse(test_df["y"].values, forecasts.iloc[:, 2].values)
        mae = losses.mae(test_df["y"].values, forecasts.iloc[:, 2].values)
        smape = losses.smape(test_df["y"].values, forecasts.iloc[:, 2].values)

        # Сохранение метрик
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("SMAPE", smape)
        # Сохранение модели
        mlforecast.flavor.log_model(
            model=model,
            artifact_path="knn",
        )
        # Сохранение визуализации
        fig = plot_series(
            train_df, 
            forecasts_df=forecasts,
            engine='matplotlib',
            level=[95, 90],
        )
        fig.savefig('forecast.png', bbox_inches='tight')
        plt.close()
        mlflow.log_artifact("forecast.png", "forecast")

In [181]:
make_knn_run(
    n_neighbors=27, 
    weights='uniform', 
    leaf_size=34
)

2024/09/26 18:26:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN_2024-09-26 18:26:29.977394 at: http://mlflow:5000/#/experiments/2/runs/ae4655cc98494ebc9ad3c719a9848e63.
2024/09/26 18:26:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/2.


# P.s.

MLFlow load model не работает