In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0. Выбираем нужные библиотеки

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
import catboost
import optuna
from optuna.visualization import plot_optimization_history
from temperature import TemperatureCounting
from data import DataReader
from bert import BERTTransform

# 1. Преобразование данных тренировочного и тестового датасета

In [25]:
temperature = TemperatureCounting()
data_reader = DataReader()
bert_vector = BERTTransform()

In [26]:
train_data = data_reader.read('train_dataset.csv')
train_data = train_data.dropna()
train_data.head()

Unnamed: 0,date,time,target,temp,temp_pred,weather_pred,weather_fact
0,2019-01-01,0,481.51,2.9,2.0,"пасм, ветер",ветер
1,2019-01-01,1,462.872,2.9,2.0,"пасм, ветер",ветер
2,2019-01-01,2,449.718,2.9,2.0,"пасм, ветер",ветер
3,2019-01-01,3,430.908,4.3,2.0,"пасм, ветер","ветер, пасм"
4,2019-01-01,4,415.163,4.3,2.0,"пасм, ветер","ветер, пасм"


In [27]:
test_data = data_reader.read('test_dataset.csv')
test_data = test_data.dropna()
test_data.head()

Unnamed: 0,date,time,target,temp,temp_pred,weather_pred,weather_fact
0,2023-04-01,0,479.282,5.7,5.0,"пасм, 58%","пасм, морось"
1,2023-04-01,1,445.182,5.7,5.0,"пасм, 58%","пасм, морось"
2,2023-04-01,2,424.225,5.7,5.0,"пасм, 58%","пасм, морось"
3,2023-04-01,3,413.866,5.0,4.0,"пасм, 71% дождь","пасм, дымка"
4,2023-04-01,4,408.146,5.0,4.0,"пасм, 71% дождь","пасм, дымка"


In [28]:
train_data = temperature.transform(train_data)
test_data = temperature.transform(test_data)

In [29]:
train_data = data_reader.transform(train_data)
test_data = data_reader.transform(test_data)

In [30]:
train_data = bert_vector.transform(train_data)
test_data = bert_vector.transform(test_data)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.pr

In [31]:
x, y = train_data.drop(columns=['target']), train_data[['target']]

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.4)

In [35]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(17824, 14) (17824, 1)
(7427, 14) (7427, 1)
(11884, 14) (11884, 1)


# 2. Создание модели

In [36]:
def objective(trial):
    param = {
        "objective": "MAE",
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
        "embedding_features": [3, 4]
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = catboost.CatBoostRegressor(**param)

    gbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=0)

    y_pred = gbm.predict(x_val)
    score = mean_absolute_error(y_val, y_pred)
    return score

In [37]:
optuna.logging.set_verbosity(0)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000, n_jobs=-1, show_progress_bar=True)
plot_optimization_history(study)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1000 [00:00<?, ?it/s]

# 3. Обучение модели и замер необходимых метрик

In [39]:
gbm = catboost.CatBoostRegressor(**study.best_params, embedding_features=[3, 4])
gbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=0)

<catboost.core.CatBoostRegressor at 0x11058b490>

In [43]:
y_pred = gbm.predict(x_test)
print('=========== Scores on test: =============')

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R2-score: {r2_score(y_test, y_pred)}')

MAE: 30.536510452444276
MAPE: 0.06658497839568009
R2-score: 0.8616353981201048


# 4. Получение прогнозов

In [46]:
test_data = test_data.drop(['target'], axis=1)

In [47]:
y_pred = gbm.predict(test_data)
y_pred

array([483.8151758 , 483.8151758 , 483.8151758 , ..., 485.42780939,
       485.42780939, 485.42780939])