In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import optuna

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score

In [9]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train_extra = pd.read_csv('data/training_extra.csv')
# sample = pd.read_csv('data/sample_submission.csv')

# сохраняю id
test_id = test['id']

# Таблица оценок

| Model | My_score | Kaggle_score | description | data |
|:------|:--------:|:------------:|:------------|:-----|
|cat|39.08416|39.22710|Параметры по умолчанию |train|
|lgbm|39.02345|39.14851|Параметры по умолчанию|train|
|xgbr|39.19209|39.31416|Параметры по умолчанию|train|
|cat|38.85208|39.11796|Параметры по умолчанию |train_extra|
|lgbm|38.84959|39.10425|Параметры по умолчанию|train_extra|
|xgbr|38.85852|39.10651|Параметры по умолчанию|train_extra|
|cat|38.99891|39.12738|study_params_0|train|
|cat|38.86182|39.11650|study_params_0|train_extra|
|cat|38.89254|39.11432|Параметры по умолчанию|train + train_extra|
|lgbm|38.88724|39.10287|Параметры по умолчанию|train + train_extra|
|xgbr|38.89985|39.10845|Параметры по умолчанию|train + train_extra|
|lgbm|39.00286|39.12981|l_study_params_0|train|

# catboos

In [10]:
# Переменная категориальных признаков
class_features = ['Brand', 'Material', 'Size', 'Style', 'Color', 'Laptop Compartment', 'Waterproof']

def cat_category(df):
    '''
    Функция заменяет Nan значения на 'Unknown' и приводит к типу 'category' категориальные признаки
    '''
    df[class_features] = df[class_features].fillna('Unknown')
    df[class_features] = df[class_features].astype('category')
    return df
    

def cat_weight(df):
    '''
    Функция создает новый категориальный признак из "Weight Capacity (kg)"
    приводит к типу "category"
    '''
    conditions = [
        (df["Weight Capacity (kg)"] <= 5),
        (df["Weight Capacity (kg)"]  > 5) & (df["Weight Capacity (kg)"] <= 15),
        (df["Weight Capacity (kg)"]  > 15) & (df["Weight Capacity (kg)"] <= 20),
        (df["Weight Capacity (kg)"]  > 20) & (df["Weight Capacity (kg)"] <= 25),
        (df["Weight Capacity (kg)"] > 25)]
    
    choices = ['Light', 'Middle', 'Light_heavy', 'Middel_heavy','Heavy']
    
    df['Weight_Class'] = np.select(conditions, choices, default='')
    df['Weight_Class'] = df['Weight_Class'].astype("category")
    
    return df


train = train.dropna(subset=['Weight Capacity (kg)'])
test['Weight Capacity (kg)'] = test['Weight Capacity (kg)'].fillna(test['Weight Capacity (kg)'].mean())
train_extra = train_extra.dropna(subset=['Weight Capacity (kg)'])

train = cat_category(train)
test = cat_category(test)
train_extra = cat_category(train_extra)

train = cat_weight(train)
test = cat_weight(test)
train_extra = cat_weight(train_extra)

train.drop(columns='id', inplace=True)
test.drop(columns='id', inplace=True)
train_extra.drop(columns='id', inplace=True)

train = pd.concat([train, train_extra], ignore_index=True)

X = train.drop(columns='Price')
y = train['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [11]:
params = {'random_seed': 42,
          'verbose': 0,
          'eval_metric': 'RMSE',
          'objective': 'RMSE',
          'cat_features': list(X.columns[X.dtypes == 'category'])}

In [11]:
# def objective(trial: optuna.Trial):
#     '''
#     Обычная сигнатура функции оптимизации для optuna.
#     '''
#     params = {
#         'random_seed': 42,
#         'verbose': 0,
#         'eval_metric': 'RMSE',
#         'iterations': trial.suggest_int('iterations', 500, 1500),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#         'depth': trial.suggest_int('depth', 3, 12),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
#         # 'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
#         # 'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True)
#     }
    
#     model = CatBoostRegressor(**params)
#     model.fit(X_train, y_train)
#     predictions = model.predict(X_test)
#     return root_mean_squared_error(y_test, predictions)
    
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

[I 2025-02-06 14:50:23,843] A new study created in memory with name: no-name-d7515b4d-14f4-4c1b-a189-dc23a008ed65
[I 2025-02-06 14:50:56,634] Trial 0 finished with value: 39.020659762258184 and parameters: {'iterations': 736, 'learning_rate': 0.0010675569384760778, 'depth': 8, 'l2_leaf_reg': 6.248694605124549e-05}. Best is trial 0 with value: 39.020659762258184.
[I 2025-02-06 14:51:34,486] Trial 1 finished with value: 39.010635997274676 and parameters: {'iterations': 702, 'learning_rate': 0.0029497693085367054, 'depth': 9, 'l2_leaf_reg': 0.00016419060638883182}. Best is trial 1 with value: 39.010635997274676.
[I 2025-02-06 14:54:49,956] Trial 2 finished with value: 39.3619555403973 and parameters: {'iterations': 1395, 'learning_rate': 0.01713722864978159, 'depth': 12, 'l2_leaf_reg': 4.840400262681113e-08}. Best is trial 1 with value: 39.010635997274676.
[I 2025-02-06 14:56:08,661] Trial 3 finished with value: 39.02015311481166 and parameters: {'iterations': 607, 'learning_rate': 0.0015

In [12]:
# study.best_params

{'iterations': 934,
 'learning_rate': 0.015921861697403002,
 'depth': 4,
 'l2_leaf_reg': 0.0029261943947529005}

In [13]:
params = {'random_seed': 42,
          'verbose': 0,
          'eval_metric': 'RMSE'}

study_params_0 = {'random_seed': 42,
          'verbose': 0,
          'eval_metric': 'RMSE',
          'iterations': 934,
          'learning_rate': 0.015921861697403002,
          'depth': 4,
          'l2_leaf_reg': 0.0029261943947529005}

In [12]:
cat_model = CatBoostRegressor(**params)
cat_model.fit(X_train, y_train)

y_pred = cat_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
pred = cat_model.predict(test)
rmse

38.88671161250964

In [13]:
output = pd.DataFrame({'id': test_id, 'Price': pred})
output.to_csv('data/cat_model.csv', index=False)

In [40]:
output

Unnamed: 0,id,Price
0,300000,80.050680
1,300001,81.634332
2,300002,82.447633
3,300003,80.905903
4,300004,80.239963
...,...,...
199995,499995,79.744972
199996,499996,71.465772
199997,499997,82.130344
199998,499998,82.350369


# lgbm

In [3]:
train = train.dropna(subset=['Weight Capacity (kg)'])
test['Weight Capacity (kg)'] = test['Weight Capacity (kg)'].fillna(test['Weight Capacity (kg)'].mean())
train_extra = train_extra.dropna(subset=['Weight Capacity (kg)'])

def weight_split(df):
    df['Light'] = (df["Weight Capacity (kg)"] <= 5).astype(int)
    df['Middle'] = ((df["Weight Capacity (kg)"] > 5) & (train["Weight Capacity (kg)"] <= 15)).astype(int)
    df['Light_heavy'] = ((df["Weight Capacity (kg)"] > 15) & (train["Weight Capacity (kg)"] <= 20)).astype(int)
    df['Middle_heavy'] = ((df["Weight Capacity (kg)"] > 20) & (train["Weight Capacity (kg)"] <= 25)).astype(int)
    df['Heavy'] = (df["Weight Capacity (kg)"] > 25).astype(int)
    return df


train = weight_split(train)
test = weight_split(test)
train_extra = weight_split(train_extra)

class_features = ['Brand', 'Material', 'Size', 'Style', 'Color', 'Laptop Compartment', 'Waterproof']


def cat_category(df):
    df = pd.get_dummies(df, columns=class_features, dtype='int', dummy_na=True, drop_first=True)
    return df
    

train = cat_category(train)
test = cat_category(test)
train_extra = cat_category(train_extra)

train.drop(columns='id', inplace=True)
test.drop(columns='id', inplace=True)

train_extra.drop(columns='id', inplace=True)

train = pd.concat([train, train_extra], ignore_index=True)

X = train.drop(columns='Price')
y = train['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
def objective(trial: optuna.Trial):
    '''
    Обычная сигнатура функции оптимизации для optuna.
    '''
    param = {
            "metric": 'RMSE',
            "verbosity": -1,
            "boosting_type": "gbdt",
            "random_state": 42,
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0),
            "max_depth": trial.suggest_int("max_depth", 5, 25),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
            "subsample": trial.suggest_float("subsample", 0.8, 1.0, log=True),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0)}
    
    lgbm_model = LGBMRegressor(**param)
    lgbm_model.fit(X_train, y_train)
    y_pred = lgbm_model.predict(X_test)
    return root_mean_squared_error(y_test, y_pred)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2025-02-10 11:36:51,568] A new study created in memory with name: no-name-87607daa-4c21-47ee-90b1-603ff6bcbcc1
[I 2025-02-10 11:36:58,069] Trial 0 finished with value: 39.007888207531444 and parameters: {'learning_rate': 0.013402766080797287, 'n_estimators': 493, 'lambda_l1': 1.232011048248966e-07, 'lambda_l2': 4.1690167725069076e-07, 'max_depth': 22, 'min_child_samples': 49, 'feature_fraction': 0.6724558028312183, 'bagging_fraction': 0.6124468825521593}. Best is trial 0 with value: 39.007888207531444.
[I 2025-02-10 11:37:03,068] Trial 1 finished with value: 39.00355943978649 and parameters: {'learning_rate': 0.010779073296606441, 'n_estimators': 414, 'lambda_l1': 6.747942111184449e-07, 'lambda_l2': 0.03458788070475433, 'max_depth': 10, 'min_child_samples': 16, 'feature_fraction': 0.44366951177929537, 'bagging_fraction': 0.5320694776951937}. Best is trial 1 with value: 39.00355943978649.
[I 2025-02-10 11:37:05,287] Trial 2 finished with value: 39.00754236125649 and parameters: {'lea

In [12]:
study.best_params

{'learning_rate': 0.014096540552923083,
 'n_estimators': 441,
 'lambda_l1': 0.4929997006283267,
 'lambda_l2': 3.536972671802839e-08,
 'max_depth': 13,
 'min_child_samples': 34,
 'feature_fraction': 0.4645858551273576,
 'bagging_fraction': 0.8389128874387093}

In [12]:
l_best_params_0 = {'learning_rate': 0.014096540552923083,
 'n_estimators': 441,
 'lambda_l1': 0.4929997006283267,
 'lambda_l2': 3.536972671802839e-08,
 'max_depth': 13,
 'min_child_samples': 34,
 'feature_fraction': 0.4645858551273576,
 'bagging_fraction': 0.8389128874387093}



In [6]:
lgbmr_model = LGBMRegressor()
lgbmr_model.fit(X_train, y_train)

y_pred = lgbmr_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
pred = lgbmr_model.predict(test)
rmse

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.151215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 3194008, number of used features: 32
[LightGBM] [Info] Start training from score 81.378630


38.913524636745976

In [5]:
output = pd.DataFrame({'id': test_id, 'Price': pred})
output.to_csv('data/lgbm_model.csv', index=False)

# xgbr

In [None]:
def objective_xg(trial):

    params = {
        "n_estimators": 100,
        "eval_metric": "rmse",
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.1),
        "min_child_weight": trial.suggest_int("min_child_weight", 0.01, 1),
        "subsample": trial.suggest_loguniform("subsample", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.1, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1)
    }

    # Initialize and fit the model
    model_xgb =  XGBRegressor(**params,
                             enable_categorical = True)
    
    model_xgb.fit(X_train, y_train)

    # Predict
    y_pred = model_xgb.predict(X_test)
    
    return mean_squared_error(y_test, y_pred, squared = False)

In [None]:
study_xgb = optuna.create_study(direction="minimize")
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_xgb.optimize(objective_xg, n_trials=5, show_progress_bar=True)

In [7]:
xgbr_model = XGBRegressor()
xgbr_model.fit(X_train, y_train)

y_pred = xgbr_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
pred = xgbr_model.predict(test)
rmse

38.92211769615038

In [19]:
output = pd.DataFrame({'id': test_id, 'Price': pred})
output.to_csv('data/xgbr_model.csv', index=False)