In [1]:
import numpy as np
import pandas as pd
import holidays
import optuna

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold 
from sklearn.metrics import mean_absolute_percentage_error, make_scorer

In [2]:
# Load the training data (df) and the data to predict on (df_pred).
df, df_pred = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [3]:
# Convert the date column of both frames from text to datetime.
for frame in (df, df_pred):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')

In [4]:
# New features derived from the date column.
YEAR_BASE = 2009  # subtracted from the calendar year so that `year` is a small offset (2010 -> 1)


def add_date_features(frame):
    """Return a copy of ``frame`` with calendar features derived from ``frame['date']``.

    The following columns are appended, in this order: ``day``, ``month``,
    ``year`` (calendar year minus ``YEAR_BASE``), ``quarter``, ``dayofyear``
    and ``weekday``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``date`` column with a datetime dtype, because the
        ``.dt`` accessor is used on it.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    dates = frame['date'].dt
    return frame.assign(
        day=dates.day,
        month=dates.month,
        year=dates.year - YEAR_BASE,
        quarter=dates.quarter,
        dayofyear=dates.dayofyear,
        weekday=dates.weekday,
    )


# Use one code path for both frames so train and prediction features cannot drift apart.
df = add_date_features(df)
df_pred = add_date_features(df_pred)

In [5]:
# Display the training frame to inspect the newly added date columns.
df

Unnamed: 0,id,date,country,store,product,num_sold,day,month,year,quarter,dayofyear,weekday
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,,1,1,1,1,1,4
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0,1,1,1,1,1,4
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0,1,1,1,1,1,4
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0,1,1,1,1,1,4
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0,1,1,1,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
230125,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0,31,12,7,4,366,5
230126,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0,31,12,7,4,366,5
230127,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0,31,12,7,4,366,5
230128,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0,31,12,7,4,366,5


In [6]:
# holiday_dict = {
#     'Canada': holidays.CA(),
#     'Finland': holidays.FI(),
#     'Italy': holidays.IT(),
#     'Kenya': holidays.KE(),
#     'Norway': holidays.NO(),
#     'Singapore': holidays.SG()
# }


# def is_holiday(row):
#     country = row['country']
#     date = row['date']
    
#     if date in holiday_dict[country]:
#         return True
#     else:
#         return False


# def is_weekend(row):
#     date = row['date']

#     if date.weekday() >= 5:
#         return True
#     else:
#         return False


# df['weekend'] = df.apply(is_weekend, axis=1)
# df['holiday'] = df.apply(is_holiday, axis=1)

# df_pred['weekend'] = df_pred.apply(is_weekend, axis=1)
# df_pred['holiday'] = df_pred.apply(is_holiday, axis=1)

In [7]:
# Remove the id column from both frames; in the training frame also
# discard the rows whose target (num_sold) is NaN.
df = df.drop(columns='id').dropna(subset=['num_sold'])
df_pred = df_pred.drop(columns='id')

In [8]:
# Remove the date column from both frames.
df, df_pred = (frame.drop(columns='date') for frame in (df, df_pred))

In [9]:
# One-hot encode the categorical columns of both frames as boolean indicator columns.
lst = ['country', 'store', 'product']
df, df_pred = (
    pd.get_dummies(frame, columns=lst, dtype='bool')
    for frame in (df, df_pred)
)

In [10]:
# Separate the target (num_sold) from the feature matrix.
y = df['num_sold']
X = df.drop(columns='num_sold')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [11]:
# Align the prediction frame with the training feature layout (same columns, same order).
# The two frames are one-hot encoded independently, so a category that never occurs in
# the prediction data leaves its indicator column missing there, and plain
# `df_pred[X.columns]` would raise a KeyError. `reindex` adds any such column filled
# with False and drops columns that are not training features.
# When every column is already present the result is the same as `df_pred[X.columns]`.
df_pred = df_pred.reindex(columns=X.columns, fill_value=False)

In [12]:
# def objective(trial: optuna.Trial):
#     '''
#     Standard signature of an objective function for optuna.
#     '''
#     param = {
#         'random_state': 42,
#         'max_depth': trial.suggest_int('max_depth', 1, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
#         'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#         'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
#         'subsample': trial.suggest_float('subsample', 0.01, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0)}

    
#     xgbr_model = XGBRegressor(**param)
#     xgbr_model.fit(X_train, y_train)
#     y_pred = xgbr_model.predict(X_test)
#     return mean_absolute_percentage_error(y_test, y_pred)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)

In [13]:
# Hyperparameters passed to XGBRegressor when the final model is built.
best_params = dict(
    max_depth=14,
    learning_rate=0.01324387343234451,
    n_estimators=553,
    min_child_weight=9,
    gamma=0.4049072989340031,
    subsample=0.9726559002239704,
    colsample_bytree=0.9935645291463002,
    reg_alpha=0.028750306006211025,
    reg_lambda=0.594248339112557,
)

In [15]:
# Fit the regressor with the chosen hyperparameters on the training split.
xgbr_model = XGBRegressor(**best_params)
xgbr_model.fit(X_train, y_train)

# Score the model on the hold-out split.
# The metric is the mean absolute percentage error (MAPE), not the mean absolute
# error, so the result is stored under a name that says so.
y_pred = xgbr_model.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mape  # backward-compatible alias for the earlier (misleading) variable name

# Predictions for the rows of the prediction frame.
pred = xgbr_model.predict(df_pred)
mape

0.046949941152707134

In [16]:
# Re-read the test file to recover the id column (it was dropped from df_pred earlier).
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble['id']

# Pair every id with its predicted num_sold and save the result without the index.
output = df_pred_Id.to_frame().assign(num_sold=pred)
output.to_csv('data/xgbr_model.csv', index=False)