In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
import optuna
import xgboost
from xgboost import XGBRegressor
from xgboost import callback

In [2]:
pd.options.display.max_columns = 500

In [3]:
data = pd.read_csv(r"D:\datasets\playground_backpacks\training_extra.csv")

In [4]:
data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,500000,Under Armour,Canvas,Small,10.0,Yes,Yes,Tote,Blue,23.882052,114.11068
1,500001,Puma,Polyester,Small,4.0,No,Yes,Backpack,Green,11.869095,129.74972
2,500002,Jansport,Polyester,Small,8.0,Yes,Yes,Tote,Red,8.092302,21.37370
3,500003,Nike,Nylon,Large,7.0,No,No,Messenger,Pink,7.719581,48.09209
4,500004,Nike,Leather,Large,9.0,No,Yes,Tote,Green,22.741826,77.32461
...,...,...,...,...,...,...,...,...,...,...,...
3694313,4194313,Nike,Canvas,,3.0,Yes,Yes,Messenger,Blue,28.098120,104.74460
3694314,4194314,Puma,Leather,Small,10.0,Yes,Yes,Tote,Blue,17.379531,122.39043
3694315,4194315,Jansport,Canvas,Large,10.0,No,No,Backpack,Red,17.037708,148.18470
3694316,4194316,Puma,Canvas,,2.0,No,No,Backpack,Gray,28.783339,22.32269


In [5]:
class Imputer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        imputer1 = SimpleImputer(strategy='most_frequent')
        imputer2 = SimpleImputer(strategy='mean')

        categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 
                            'Waterproof', 'Style', 'Color']

        for col in categorical_cols:
            X[col] = imputer1.fit_transform(X[[col]]).ravel()

        X['Weight Capacity (kg)'] = imputer2.fit_transform(X[['Weight Capacity (kg)']]).ravel()

        return X


In [6]:
class FeatureEncoder(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        encoder = OneHotEncoder()
        
        matrix = encoder.fit_transform(X[['Brand']]).toarray()
        column_names = ['Under Armour', 'Puma', 'Jansport', 'Nike', 'Adidas', 'N']
        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        matrix = encoder.fit_transform(X[['Material']]).toarray()   
        column_names = ['Canvas', 'Polyester', 'Nylon', 'Leather', 'N']
        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        matrix = encoder.fit_transform(X[['Size']]).toarray()   
        column_names = ['Small', 'Large', 'Medium', 'N']
        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        matrix = encoder.fit_transform(X[['Material']]).toarray()   
        column_names = ['Canvas', 'Polyester', 'Nylon', 'Leather', 'N']
        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        matrix = encoder.fit_transform(X[['Style']]).toarray()   
        column_names = ['Tote', 'Backpack', 'Messenger', 'N']
        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        matrix = encoder.fit_transform(X[['Color']]).toarray()   
        column_names = ['Blue', 'Green', 'Red', 'Pink', 'Gray', 'Black', 'N']
        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        X['Laptop Compartment'] = X['Laptop Compartment'].apply(lambda x: 1 if x == 'Yes' else 0)
        X['Waterproof'] = X['Waterproof'].apply(lambda x: 1 if x == 'Yes' else 0)

        return X

In [7]:
class FeatureDropper(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(['id', 'Brand', 'Material', 'Size', 'Style', 'Color', 'N'], axis=1, errors='ignore')

In [8]:
class Scaler(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        scaler = StandardScaler()
        if 'Price' in X.columns:
            scaled_data = scaler.fit_transform(X.drop('Price', axis=1))
            X_new = pd.DataFrame(scaled_data, columns=X.drop('Price', axis=1).columns)
            X_new['Price'] = X['Price'].values  # Восстанавливаем 'Price'
        else:
            scaled_data = scaler.fit_transform(X)  # Масштабируем весь тестовый набор
            X_new = pd.DataFrame(scaled_data, columns=X.columns)
        
        return X_new

In [9]:
pipeline = Pipeline([('imputer', Imputer()), 
                     ('featureencoder', FeatureEncoder()),
                     ('featuredropper', FeatureDropper()),
                     ('scaler', Scaler())])

In [10]:
final_data = pipeline.fit_transform(data)

In [11]:
final_data

Unnamed: 0,Compartments,Laptop Compartment,Waterproof,Weight Capacity (kg),Under Armour,Puma,Jansport,Nike,Adidas,Canvas,Polyester,Nylon,Leather,Small,Large,Medium,Tote,Backpack,Messenger,Blue,Green,Red,Pink,Gray,Black,Price
0,1.578161,0.963566,0.967377,0.842285,-0.499179,-0.480632,-0.486515,-0.483063,1.818035,1.849597,-0.568677,-0.555869,-0.644362,-0.699460,-0.751682,1.490351,-0.679933,-0.748195,1.441286,-0.429003,2.292911,-0.447262,-0.427412,-0.509023,-0.432873,114.11068
1,-0.495648,-1.037812,0.967377,-0.880505,-0.499179,-0.480632,-0.486515,2.070125,-0.550045,-0.540658,-0.568677,-0.555869,1.551922,-0.699460,-0.751682,1.490351,1.470734,-0.748195,-0.693825,-0.429003,-0.436127,-0.447262,2.339661,-0.509023,-0.432873,129.74972
2,0.886892,0.963566,0.967377,-1.422139,-0.499179,2.080595,-0.486515,-0.483063,-0.550045,-0.540658,-0.568677,-0.555869,1.551922,-0.699460,-0.751682,1.490351,-0.679933,-0.748195,1.441286,-0.429003,-0.436127,-0.447262,-0.427412,-0.509023,2.310147,21.37370
3,0.541257,-1.037812,-1.033724,-1.475592,-0.499179,-0.480632,2.055436,-0.483063,-0.550045,-0.540658,-0.568677,1.798985,-0.644362,1.429674,-0.751682,-0.670983,-0.679933,1.336549,-0.693825,-0.429003,-0.436127,-0.447262,-0.427412,1.964548,-0.432873,48.09209
4,1.232527,-1.037812,0.967377,0.678764,-0.499179,-0.480632,2.055436,-0.483063,-0.550045,-0.540658,1.758466,-0.555869,-0.644362,1.429674,-0.751682,-0.670983,-0.679933,-0.748195,1.441286,-0.429003,-0.436127,-0.447262,2.339661,-0.509023,-0.432873,77.32461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3694313,-0.841283,0.963566,0.967377,1.446916,-0.499179,-0.480632,2.055436,-0.483063,-0.550045,1.849597,-0.568677,-0.555869,-0.644362,-0.699460,1.330349,-0.670983,-0.679933,1.336549,-0.693825,-0.429003,2.292911,-0.447262,-0.427412,-0.509023,-0.432873,104.74460
3694314,1.578161,0.963566,0.967377,-0.090248,-0.499179,-0.480632,-0.486515,2.070125,-0.550045,-0.540658,1.758466,-0.555869,-0.644362,-0.699460,-0.751682,1.490351,-0.679933,-0.748195,1.441286,-0.429003,2.292911,-0.447262,-0.427412,-0.509023,-0.432873,122.39043
3694315,1.578161,-1.037812,-1.033724,-0.139269,-0.499179,2.080595,-0.486515,-0.483063,-0.550045,1.849597,-0.568677,-0.555869,-0.644362,1.429674,-0.751682,-0.670983,1.470734,-0.748195,-0.693825,-0.429003,-0.436127,-0.447262,-0.427412,-0.509023,2.310147,148.18470
3694316,-1.186918,-1.037812,-1.033724,1.545184,-0.499179,-0.480632,-0.486515,2.070125,-0.550045,1.849597,-0.568677,-0.555869,-0.644362,-0.699460,1.330349,-0.670983,1.470734,-0.748195,-0.693825,-0.429003,-0.436127,2.235828,-0.427412,-0.509023,-0.432873,22.32269


In [12]:
X, y = final_data.drop('Price', axis=1), final_data['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
def objective(trial):
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    model = XGBRegressor(**param)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return root_mean_squared_error(y_test, preds)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=35, n_jobs=-1)

print("Лучшие параметры:", study.best_params)

[I 2025-04-19 16:40:51,815] A new study created in memory with name: no-name-1c09729b-550c-4de0-be2f-16cca5a104fb
[I 2025-04-19 16:46:42,501] Trial 11 finished with value: 38.89940440690334 and parameters: {'learning_rate': 0.16064655580304651, 'n_estimators': 186, 'max_depth': 4, 'subsample': 0.8781285459760952, 'colsample_bytree': 0.7006538222620904}. Best is trial 11 with value: 38.89940440690334.
[I 2025-04-19 16:47:44,463] Trial 12 finished with value: 38.90862538920975 and parameters: {'learning_rate': 0.014187781829060575, 'n_estimators': 220, 'max_depth': 4, 'subsample': 0.8271681922288466, 'colsample_bytree': 0.5237432475233217}. Best is trial 11 with value: 38.89940440690334.
[I 2025-04-19 16:48:24,330] Trial 14 finished with value: 38.907221585255606 and parameters: {'learning_rate': 0.015571449962585703, 'n_estimators': 240, 'max_depth': 4, 'subsample': 0.718307190959953, 'colsample_bytree': 0.5050942006523427}. Best is trial 11 with value: 38.89940440690334.
[I 2025-04-19 

Лучшие параметры: {'learning_rate': 0.07480836142299725, 'n_estimators': 508, 'max_depth': 3, 'subsample': 0.9457861920057444, 'colsample_bytree': 0.5335853817649265}


In [14]:
model = XGBRegressor(**study.best_params)
model.fit(X_train, y_train)

In [15]:
predicts = model.predict(X_test)

In [16]:
root_mean_squared_error(predicts, y_test)

38.898689320789764

In [17]:
test_csv = pd.read_csv(r"D:\datasets\playground_backpacks\test.csv")

In [18]:
final_test_csv = pipeline.fit_transform(test_csv)

In [19]:
y_true = model.predict(final_test_csv)

In [20]:
submission = pd.DataFrame()
submission['id'] = test_csv['id']
submission['Price'] = y_true

In [21]:
submission

Unnamed: 0,id,Price
0,300000,81.758430
1,300001,82.287971
2,300002,83.952766
3,300003,82.341667
4,300004,78.002434
...,...,...
199995,499995,80.288658
199996,499996,79.280121
199997,499997,84.683830
199998,499998,80.526962


In [None]:
submission.to_csv('submission.csv', index=False)