# Модель машинного обучения для соревнования на Kaggle на основе LightGBM
Regression with an Abalone Dataset
## Цели
- подбор параметров

In [1]:
import pandas as pd
import optuna

from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [2]:
# Load the data: the competition train/test files plus abalone.csv, an
# additional dataset that is appended to the training rows further below.
df, df_pred, df_abalone = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/abalone.csv')
)

## Подготовка данных

In [3]:
# Bring the feature names of the extra dataset to the same form as the
# competition data, so both frames can be stacked column-for-column.
df_abalone.rename(
    {'Shucked weight': 'Whole weight.1', 'Viscera weight': 'Whole weight.2'},
    axis='columns',
    inplace=True,
)

In [4]:
# Remove the `id` column from both competition frames (in place); it is not
# used as a model feature.
df.drop('id', axis='columns', inplace=True)
df_pred.drop('id', axis='columns', inplace=True)

In [5]:
# Convert the Sex feature to pandas' categorical dtype in all three frames.
for _frame in (df, df_pred, df_abalone):
    _frame['Sex'] = _frame['Sex'].astype('category')

In [6]:
# Stack the competition training rows and the extra abalone rows into a single
# training frame; ignore_index=True gives the result a fresh 0..n-1 index.
df_all = pd.concat(objs=[df, df_abalone], axis=0, ignore_index=True)

In [7]:
# Replace the spaces in the feature names with underscores.
# The same mapping is applied to df_pred as well: it is the frame passed to
# `predict`, so it should carry the same column names as the training frame
# instead of relying on column position alone.
_underscored_names = {'Whole weight.1': 'Whole_weight.1',
                      'Whole weight.2': 'Whole_weight.2',
                      'Whole weight': 'Whole_weight',
                      'Shell weight': 'Shell_weight'}
for _frame in (df_all, df_pred):
    # rename() ignores mapping keys that are absent from a frame by default.
    _frame.rename(columns=_underscored_names, inplace=True)

## Поиск гиперпараметров

In [8]:
# Target (`Rings`) and feature matrix (every other column).
y = df_all['Rings']
X = df_all.drop('Rings', axis='columns')

# Hold out 25% of the rows for validation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [9]:
# NOTE(review): the Optuna hyperparameter search below is kept commented out.
# Presumably it is the study that produced `best_params` (same parameter
# names) and is disabled so that re-running the notebook does not repeat the
# 300-trial search -- confirm before deleting.
# The objective fits on X_train and scores MSLE on X_test, i.e. on the same
# hold-out split that is used for the final evaluation.
# def objective(trial: optuna.Trial):
#     '''
#     Standard objective-function signature for Optuna.
#     '''
#     param = {
#             "metric": "'MSLE'",
#             "verbosity": -1,
#             "boosting_type": "gbdt",
#             "random_state": 42,
#             "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
#             "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#             'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
#             'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
#             "max_depth": trial.suggest_int("max_depth", 5, 25),
# #            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
# #            "subsample": trial.suggest_float("subsample", 0.8, 1.0),
#             "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
#             'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
#             'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0)}
    
#     lgbm_model = LGBMRegressor(**param)
#     lgbm_model.fit(X_train, y_train, eval_set=(X_test, y_test))
#     y_pred = lgbm_model.predict(X_test)
#     return mean_squared_log_error(y_test, y_pred)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=300)

In [10]:
# Hyperparameters passed to LGBMRegressor.
# Fixed settings come first, then the tuned values (presumably copied from the
# Optuna study, which searches over the same parameter names -- confirm).
best_params = {
    # NOTE(review): the value contains literal quote characters and does not
    # look like a built-in LightGBM metric name -- confirm against the
    # LightGBM parameter docs. Left untouched to keep the configuration that
    # produced the recorded score.
    "metric": "'MSLE'",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "random_state": 42,
}
best_params.update({
    'learning_rate': 0.06276250716970622,
    'n_estimators': 882,
    'lambda_l1': 9.350836598168922e-06,
    'lambda_l2': 0.8938597940442873,
    'max_depth': 6,
    'min_child_samples': 42,
    'feature_fraction': 0.6175720776491681,
    # NOTE(review): no `bagging_freq` is set; per the LightGBM docs bagging
    # is presumably inactive without it -- confirm.
    'bagging_fraction': 0.6778611980758826,
})

## Модель
- Kaggle public score: 0.14785

In [11]:
# Fit LightGBM with the selected hyperparameters and score it on the hold-out
# split. No early stopping is configured, so the eval set is used for
# monitoring only; it is passed in the documented list-of-(X, y) form.
lgbm_model = LGBMRegressor(**best_params)
lgbm_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
y_pred = lgbm_model.predict(X_test)

# mean_squared_log_error returns the *mean squared* log error (MSLE), not its
# root. Keep the two quantities under their proper names: `rmsle` is the
# square root of `msle` and is the value on the same scale as the Kaggle score
# quoted above (presumably the competition metric is RMSLE -- confirm).
msle = mean_squared_log_error(y_test, y_pred)
rmsle = msle ** 0.5

# Predictions for the competition test set (used for the submission file).
pred = lgbm_model.predict(df_pred)

# Cell output: the hold-out MSLE (this is the value recorded below the cell).
msle

0.022142334111083718

In [12]:
# Re-read the test file to recover the `id` column that was dropped from
# df_pred earlier.
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble.loc[:, 'id']

# Submission table: one row per test id with the predicted Rings value,
# written without the DataFrame index.
output = pd.DataFrame(data={'id': df_pred_Id, 'Rings': pred})
output.to_csv(path_or_buf='data/lgbm_model.csv', index=False)