In [1]:
import numpy as np
import pandas as pd
import holidays
import optuna

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold 
from sklearn.metrics import mean_absolute_percentage_error, make_scorer

In [2]:
df = pd.read_csv('data/train.csv')
df_pred = pd.read_csv('data/test.csv')

In [3]:
# Привожу дату к типу pd.datetime
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df_pred['date'] = pd.to_datetime(df_pred['date'], format='%Y-%m-%d')

In [4]:
# Новые признаки из даты
df['day'] = df.date.dt.day
df['month'] = df.date.dt.month
df['year'] = df.date.dt.year
df['quarter'] = df.date.dt.quarter
df['dayofyear'] = df.date.dt.dayofyear
df['weekday'] = df.date.dt.weekday

df_pred['day'] = df_pred.date.dt.day
df_pred['month'] = df_pred.date.dt.month
df_pred['year'] = df_pred.date.dt.year
df_pred['quarter'] = df_pred.date.dt.quarter
df_pred['dayofyear'] = df_pred.date.dt.dayofyear
df_pred['weekday'] = df_pred.date.dt.weekday

In [5]:
# holiday_dict = {
#     'Canada': holidays.CA(),
#     'Finland': holidays.FI(),
#     'Italy': holidays.IT(),
#     'Kenya': holidays.KE(),
#     'Norway': holidays.NO(),
#     'Singapore': holidays.SG()
# }


# def is_holiday(row):
#     country = row['country']
#     date = row['date']
    
#     if date in holiday_dict[country]:
#         return True
#     else:
#         return False


# def is_weekend(row):
#     date = row['date']

#     if date.weekday() >= 5:
#         return True
#     else:
#         return False


# df['weekend'] = df.apply(is_weekend, axis=1)
# df['holiday'] = df.apply(is_holiday, axis=1)

# df_pred['weekend'] = df_pred.apply(is_weekend, axis=1)
# df_pred['holiday'] = df_pred.apply(is_holiday, axis=1)

In [5]:
# Удаляю id
df.drop(columns='id', inplace=True)
df_pred.drop(columns='id', inplace=True)

# Удаляю Nan значения
df.dropna(subset=['num_sold'], inplace=True)

In [6]:
# удаляю date
df.drop(columns='date', inplace=True)
df_pred.drop(columns='date', inplace=True)

In [7]:
def cat_val(df, column):
    uniq = df[column].unique()
    for i in uniq:
        df[i] = df[column].apply(lambda x: x == i)
    df.drop(columns=column, inplace=True)


for i in ['country', 'store', 'product']:
    cat_val(df, i)
    cat_val(df_pred, i)

In [8]:
X = df.drop(columns='num_sold')
y = df['num_sold']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [9]:
# Синхронизация столбцов
df_pred = df_pred[X.columns]

In [10]:
def objective(trial: optuna.Trial):
    '''
    Обычная сигнатура функции оптимизации для optuna.
    '''
    param = {
        'random_state': 42,
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0)}

    
    xgbr_model = XGBRegressor(**param)
    xgbr_model.fit(X_train, y_train)
    y_pred = xgbr_model.predict(X_test)
    return mean_absolute_percentage_error(y_test, y_pred)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2025-01-20 16:49:53,900] A new study created in memory with name: no-name-45baba9a-747a-418b-b7de-5c37c7718e30
[I 2025-01-20 16:49:56,966] Trial 0 finished with value: 2.9766951060952263 and parameters: {'max_depth': 9, 'learning_rate': 0.0988959793970302, 'n_estimators': 257, 'min_child_weight': 8, 'gamma': 0.7254990115973348, 'subsample': 0.6570276828649558, 'colsample_bytree': 0.045203911673730085, 'reg_alpha': 0.222228087148357, 'reg_lambda': 0.14127799330740717}. Best is trial 0 with value: 2.9766951060952263.
[I 2025-01-20 16:50:03,691] Trial 1 finished with value: 0.5231144377609577 and parameters: {'max_depth': 3, 'learning_rate': 0.11015859279429664, 'n_estimators': 593, 'min_child_weight': 9, 'gamma': 0.614826275671072, 'subsample': 0.6306756378561864, 'colsample_bytree': 0.7483577947202201, 'reg_alpha': 0.5596096138041716, 'reg_lambda': 0.7271570565903668}. Best is trial 1 with value: 0.5231144377609577.
[I 2025-01-20 16:50:12,074] Trial 2 finished with value: 0.416454112

In [11]:
study.best_params

{'max_depth': 14,
 'learning_rate': 0.01324387343234451,
 'n_estimators': 553,
 'min_child_weight': 9,
 'gamma': 0.4049072989340031,
 'subsample': 0.9726559002239704,
 'colsample_bytree': 0.9935645291463002,
 'reg_alpha': 0.028750306006211025,
 'reg_lambda': 0.594248339112557}

In [12]:
best_params = {'max_depth': 14,
 'learning_rate': 0.01324387343234451,
 'n_estimators': 553,
 'min_child_weight': 9,
 'gamma': 0.4049072989340031,
 'subsample': 0.9726559002239704,
 'colsample_bytree': 0.9935645291463002,
 'reg_alpha': 0.028750306006211025,
 'reg_lambda': 0.594248339112557}

In [16]:
xgbr_model = XGBRegressor(**best_params)
xgbr_model.fit(X_train, y_train)

y_pred = xgbr_model.predict(X_test)
mae = mean_absolute_percentage_error(y_test, y_pred)
pred = xgbr_model.predict(df_pred)
mae

0.04705864164718761

In [13]:
xgbr_model = XGBRegressor(**best_params)
xgbr_model.fit(X, y)

y_pred = xgbr_model.predict(X_test)
mae = mean_absolute_percentage_error(y_test, y_pred)
pred = xgbr_model.predict(df_pred)
mae

0.038554750050876746

In [14]:
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble['id']

output = pd.DataFrame({'id': df_pred_Id, 'num_sold': pred})
output.to_csv('data/xgbr_model.csv', index=False)

In [15]:
X.columns

Index(['day', 'month', 'year', 'quarter', 'dayofyear', 'weekday', 'Canada',
       'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore', 'Discount Stickers',
       'Stickers for Less', 'Premium Sticker Mart', 'Kaggle', 'Kaggle Tiers',
       'Kerneler', 'Kerneler Dark Mode', 'Holographic Goose'],
      dtype='object')