# Модель машинного обучения соревнования на kaggle при помощи catboost
Regression with an Abalone Dataset
## Цели
- подбор параметров

In [1]:
import pandas as pd
import optuna

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [2]:
# загрузка даных
df = pd.read_csv('data/train.csv')
df_pred = pd.read_csv('data/test.csv')

df_abalone = pd.read_csv('data/abalone.csv')

## Подготовка данных

In [3]:
# привожу название признаков к единому виду
df_abalone.rename(columns={'Shucked weight': 'Whole weight.1',
                           'Viscera weight': 'Whole weight.2'}, inplace=True)

In [4]:
# удаляю id
df.drop(columns='id', inplace=True)
df_pred.drop(columns='id', inplace=True)

In [5]:
# привожу признак Sex к категориальному типу
df['Sex'] = df['Sex'].astype('category')
df_pred['Sex'] = df_pred['Sex'].astype('category')
df_abalone['Sex'] = df_abalone['Sex'].astype('category')

In [6]:
df_all = pd.concat([df, df_abalone], ignore_index=True)

## Поиск гиперпараметров

In [7]:
X = df_all.drop(columns='Rings')
y = df_all['Rings']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
# def objective(trial: optuna.Trial):
#     '''
#     Обычная сигнатура функции оптимизации для optuna.
#     '''
#     params = {
#         'random_seed': 42,
#         'cat_features': ['Sex'],
#         'verbose': 0,
#         'eval_metric': 'MSLE',
#         'iterations': trial.suggest_int('iterations', 1000, 2100),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#         'depth': trial.suggest_int('depth', 3, 12),
#         # 'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100.0, log=True),
#         # 'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
#         # 'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True)
#     }
    
#     model = CatBoostRegressor(**params)
#     model.fit(X_train, y_train, eval_set=(X_test, y_test))
#     predictions = model.predict(X_test)
#     return mean_squared_log_error(y_test, predictions)
    
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

In [9]:
best_params = {'iterations': 1683,
               'learning_rate': 0.08535277866932216,
               'depth': 7,
               'random_seed': 42,
               'cat_features': ['Sex'],
               'verbose': 0,
               'eval_metric': 'MSLE'
              }

## Модель
- kaggle public Score: 0.14797

In [10]:
cat_model = CatBoostRegressor(**best_params)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = cat_model.predict(X_test)
rmsle = mean_squared_log_error(y_test, y_pred)
pred = cat_model.predict(df_pred)
rmsle

0.022276684889506508

In [11]:
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble['id']

output = pd.DataFrame({'id': df_pred_Id, 'Rings': pred})
output.to_csv('data/cat_model.csv', index=False)