In [None]:
!pip install catboost

[0m

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import zscore

from catboost import CatBoostRegressor

# Load the raw training data.
df = pd.read_csv("/content/train_backpack.csv")

categorical_features = ["Brand", "Material", "Size", "Laptop Compartment",
                        "Waterproof", "Style", "Color"]
numerical_features = ["Weight Capacity (kg)", "Compartments"]

# Impute missing values: most frequent label for categorical columns, median
# for numerical ones. The filled column is assigned back explicitly because
# `df[col].fillna(..., inplace=True)` operates on an intermediate object and
# stops updating `df` under pandas copy-on-write (pandas 3.0).
for col in categorical_features:
    df[col] = df[col].fillna(df[col].mode()[0])

for col in numerical_features:
    df[col] = df[col].fillna(df[col].median())

def feature_engineering(df, max_weight_capacity=None):
    """Derive model features from the raw backpack columns.

    The input frame is copied first, so the caller's DataFrame is left
    untouched; use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Brand', 'Material', 'Size',
        'Laptop Compartment', 'Waterproof', 'Style',
        'Weight Capacity (kg)' and 'Compartments'.
    max_weight_capacity : float, optional
        Denominator for 'Weight_Capacity_Ratio'. When omitted, the maximum of
        'Weight Capacity (kg)' in ``df`` is used. Pass a fixed value (e.g. the
        training-set maximum) to put other data on the same scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the engineered columns appended. Labels that
        are missing from the mapping tables below become NaN.
    """
    df = df.copy()
    weight = df['Weight Capacity (kg)']
    compartments = df['Compartments']

    # Binary flags
    df['Has_Laptop_Compartment'] = df['Laptop Compartment'].map({'Yes': 1, 'No': 0})
    df['Is_Waterproof'] = df['Waterproof'].map({'Yes': 1, 'No': 0})

    # Ordinal encodings of categorical columns
    df['Size_Num'] = df['Size'].map({'Small': 1, 'Medium': 2, 'Large': 3})
    df['Material_Quality'] = df['Material'].map({
        'Leather': 3,
        'Canvas': 2,
        'Nylon': 2,
        'Polyester': 1
    })

    # Interactions and derived numeric features
    df['Quality_Capacity'] = df['Material_Quality'] * weight
    df['Size_Capacity'] = df['Size_Num'] * weight
    # A zero compartment count is masked to NaN so the ratio is "missing"
    # instead of +/-inf, which downstream scalers reject.
    df['Capacity_Ratio'] = weight / compartments.mask(compartments == 0)
    if max_weight_capacity is None:
        max_weight_capacity = weight.max()
    df['Weight_Capacity_Ratio'] = weight / max_weight_capacity
    df['Weight_to_Compartments'] = weight / (compartments + 1)

    # Combined categorical features; astype(str) keeps all three consistent
    # and makes them work for non-string (e.g. categorical) dtypes as well.
    df['Brand_Material'] = df['Brand'].astype(str) + '_' + df['Material'].astype(str)
    df['Brand_Size'] = df['Brand'].astype(str) + '_' + df['Size'].astype(str)
    df['Style_Size'] = df['Style'].astype(str) + '_' + df['Size'].astype(str)

    # Bucket the compartment count; include_lowest puts a count of 0 into
    # 'Few' instead of leaving it uncategorised.
    df['Compartments_Category'] = pd.cut(
        compartments,
        bins=[0, 2, 5, 10, np.inf],
        labels=['Few', 'Moderate', 'Many', 'Very Many'],
        include_lowest=True)

    return df

# Add the engineered columns, then keep only rows whose raw numerical
# features all lie within three standard deviations of their column mean.
df = feature_engineering(df)

abs_z = np.abs(zscore(df[numerical_features]))
within_three_sigma = (abs_z < 3).all(axis=1)
df = df[within_three_sigma]

# Columns fed to the model, grouped by how they are preprocessed.
final_categorical_features = ['Brand', 'Material', 'Size', 'Laptop Compartment',
                              'Waterproof', 'Style', 'Color', 'Brand_Material',
                              'Brand_Size', 'Compartments_Category', 'Style_Size']

final_numerical_features = ['Compartments', 'Weight Capacity (kg)',
                            'Capacity_Ratio', 'Size_Num',
                            'Material_Quality', 'Quality_Capacity',
                            'Size_Capacity', 'Weight_Capacity_Ratio',
                            'Weight_to_Compartments']

# Standardise numerical columns and one-hot encode categorical ones; columns
# not listed in either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), final_numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), final_categorical_features)
    ])

features = df.drop('Price', axis=1)
y = df['Price']

# Split BEFORE fitting the preprocessor so that scaling statistics and the
# one-hot category vocabulary are learned from the training rows only; the
# held-out rows are merely transformed. Fitting on all rows would leak
# information from the evaluation split into the features.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42)

X_train = preprocessor.fit_transform(features_train)
X_test = preprocessor.transform(features_test)

# Hyper-parameters that are unpacked into CatBoostRegressor(**catboost_params).
catboost_params = dict(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    verbose=0,
    random_seed=42,
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [None]:
# Hold out part of the training data for early stopping. Passing the test
# split as eval_set would let it choose the stopping iteration, and the
# metrics reported on that same split would then be optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

model = CatBoostRegressor(**catboost_params)
model.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=0)

# 9. Model evaluation (on the untouched test split)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

# 10. Generating predictions (if needed)
# To generate predictions on the test data:
# test_predictions = model.predict(X_test)


RMSE: 38.9199
MAE: 33.6471
