In [None]:
import numpy as np
import pandas as pd
from fastcore.all import *
from fastai.tabular.all import *

import holidays
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the labelled sales history and the rows to be predicted; both CSV files
# are read from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('course_sales.csv', 'test.csv')
)

In [None]:
# Peek at the first three training rows.
train_df.head(3)

In [None]:
# Peek at the first three test rows.
test_df.head(3)

In [None]:
# Distinct country values found in the training data; Predproduction iterates
# over this array when it builds the holiday flag.
countries = train_df['country'].unique()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Number of distinct values per training column.
train_df.nunique()

In [None]:
# Keep the test ids aside before Predproduction drops the 'id' column.
test_id = test_df['id']
# Name of the target column.
targ = 'num_sold'

In [None]:
# Calendar years for which holiday calendars are generated (2017 through 2022 inclusive).
years_list = list(range(2017, 2023))

In [None]:
def Predproduction(data):
    """Turn a raw sales frame into a feature frame.

    Steps, in order:
      1. drop the 'id' column (on a copy; the caller's frame is not modified);
      2. add a boolean 'holiday' column that is True when the row's 'date' is
         contained in the holiday calendar of the row's 'country';
      3. expand 'date' with fastai's ``add_datepart``;
      4. drop the six ``Is_*`` period-boundary flags.

    Relies on the notebook-level names ``countries`` (country values to look
    up) and ``years_list`` (years for which calendars are built).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'id', 'country' and 'date'.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.

    Raises
    ------
    KeyError
        If 'id' is missing, e.g. when an already processed frame is passed in.
    """
    data = data.drop('id', axis=1)
    # Default to "not a holiday": a row whose country is not listed in
    # `countries` is never visited by the loop below and must not end up
    # flagged as a holiday.
    data['holiday'] = False
    for country in countries:
        country_holidays = holidays.CountryHoliday(country, years=years_list)
        # Compute the row mask once and reuse it for both the read and the write.
        in_country = data['country'] == country
        data.loc[in_country, 'holiday'] = data.loc[in_country, 'date'].apply(
            lambda x: x in country_holidays)
    data = add_datepart(data, 'date')
    data = data.drop(['Is_month_end', 'Is_month_start', 'Is_quarter_end',
                      'Is_quarter_start', 'Is_year_end', 'Is_year_start'], axis=1)
    return data

In [None]:
# Run the same feature pipeline on both frames.
# NOTE(review): the names are rebound to the processed frames, so re-running this
# cell passes frames without an 'id' column back into Predproduction, whose
# drop('id') then fails. Reload the CSVs before re-running.
train_df = Predproduction(train_df)
test_df  = Predproduction(test_df)

In [None]:
# Time-based hold-out: rows with Year < 2021 are used for fitting, the
# remaining rows for validation. Indices are positional.
cond = train_df['Year'].lt(2021)
train_idx = np.flatnonzero(cond)
valid_idx = np.flatnonzero(~cond)

splits = (list(train_idx), list(valid_idx))

In [None]:
# Preprocessing steps handed to TabularPandas; only categorical encoding is used.
procs=[Categorify]

In [None]:
# Split the feature columns into continuous and categorical names, excluding
# the target; max_card=5 is the cardinality threshold passed to fastai.
con_cols, cat_cols = cont_cat_split(train_df, max_card=5, dep_var=targ)
# Display both lists for inspection.
con_cols, cat_cols

In [None]:
# Wrap both frames in TabularPandas so that `procs` is applied to the
# categorical columns.
# NOTE(review): `splits` computed earlier is not passed here, so `.train`
# presumably covers every row - confirm this is intended.
# NOTE(review): the test frame gets its own TabularPandas and therefore its own
# Categorify fit; its category codes agree with the training ones only if both
# frames contain the same category values - confirm.
train_tp = TabularPandas(train_df, procs, cat_names=cat_cols, cont_names=con_cols, y_names=targ)
test_tp = TabularPandas(test_df, procs, cat_names=cat_cols, cont_names=con_cols)

In [None]:
# Processed training features and target, plus the processed test features.
xs=train_tp.train.xs
y=train_tp.train.y
tst_xs=test_tp.train.xs

In [None]:
def smape(A, F):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes ``100 / n * sum(2 * |F - A| / (|A| + |F|))``. The formula is
    symmetric, so it does not matter which argument holds the actual values
    and which the forecast.

    Parameters
    ----------
    A, F : array-like of numbers
        Equal-length sequences; they are compared position by position.

    Returns
    -------
    float
        The sMAPE value. A position where both inputs are zero counts as a
        perfect match and contributes 0.

    Raises
    ------
    ZeroDivisionError
        If the inputs are empty.
    """
    A = np.asarray(A, dtype=float)
    F = np.asarray(F, dtype=float)
    numerator = 2 * np.abs(F - A)
    denominator = np.abs(A) + np.abs(F)
    # Where both values are zero the ratio would be 0/0 = nan and poison the
    # whole score; leave those positions at 0 instead.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator), where=denominator != 0)
    return 100/len(A) * np.sum(ratio)

In [None]:
def eval_model(model, x, y, trn_idx=None, val_idx=None):
    """Fit `model` on the training rows and score it with sMAPE on the validation rows.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x, y : pandas objects
        Features and target; rows are selected positionally with ``.iloc``.
    trn_idx, val_idx : positional indexer or None
        Rows used for fitting / for validation. ``None`` selects every row.

    Returns
    -------
    tuple
        ``(metric, pred)``: the sMAPE on the validation rows and the
        predictions for those rows.
    """
    # `.iloc[None]` is not a valid indexer, so map the None defaults onto
    # "all rows" instead of letting the call fail.
    trn_rows = slice(None) if trn_idx is None else trn_idx
    val_rows = slice(None) if val_idx is None else val_idx
    model.fit(x.iloc[trn_rows], y.iloc[trn_rows])
    pred = model.predict(x.iloc[val_rows])
    # Actual values first, forecast second; sMAPE is symmetric, so the value is
    # the same as with the arguments swapped.
    metric = smape(y.iloc[val_rows], pred)
    return metric, pred

In [None]:
def weighted_sum(preds, scores):
    """Blend several prediction arrays into one weighted average.

    Each score is normalised by the sum of all scores, so the weights add up
    to 1 and the result is on the same scale as the individual predictions.

    Parameters
    ----------
    preds : sequence of array-like
        One prediction array per model, all of the same shape.
    scores : sequence of numbers
        One weight per model (larger means more influence), in the same
        order as `preds`.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``preds[0]``.

    Raises
    ------
    ValueError
        If `preds` and `scores` have different lengths.
    """
    if len(preds) != len(scores):
        # zip() would silently drop the extras while the weights were still
        # normalised over all scores, giving a blend that does not sum to 1.
        raise ValueError('preds and scores must have the same length')
    weights = np.asarray(scores, dtype=float)
    weights = weights / weights.sum()
    # Always accumulate in float64: inheriting the dtype of preds[0] fails for
    # integer predictions and loses precision for float32 ones.
    sum_preds = np.zeros_like(preds[0], dtype=float)
    for pred, w_c in zip(preds, weights):
        sum_preds += np.multiply(pred, w_c)
    # No further division by len(preds): the weights are already normalised.
    return sum_preds

In [None]:
# Candidate regressors for the blend, with hand-picked hyperparameters.
rf = RandomForestRegressor(
    n_estimators=180,
    min_samples_leaf=5,
    max_features=0.5,
)
gb = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=6,
    max_features=0.7,
)
xgb = XGBRegressor(
    n_estimators=80,
    max_depth=7,
    subsample=0.7,
    learning_rate=0.3,
)
cat = CatBoostRegressor(
    silent=True,
    learning_rate=0.052,
    iterations=815,
    depth=12,
    objective='MAE',
)
lgbm = LGBMRegressor(
    learning_rate=0.13,
    min_child_samples=46,
    n_estimators=300,
    num_leaves=80,
    objective='MAE',
)
models = [rf, gb, xgb, cat, lgbm]

# Fit every model on the training rows and score it on the hold-out rows.
# `scores` stores the inverse sMAPE so that a lower error gives a larger
# blending weight; `preds` keeps the hold-out predictions in model order.
scores, preds = [], []
for model in models:
    score, pred = eval_model(model, xs, y, train_idx, valid_idx)
    print(model, 'smape:', score)
    scores.append(1/score)
    preds.append(pred)

# Hold-out sMAPE of the weighted blend (shown as the cell output).
f_preds = weighted_sum(preds, scores)
smape(f_preds, y.iloc[valid_idx])

In [None]:
# Fresh, unfitted copies of the regressors for the final fit on all training rows.
# NOTE(review): rf uses max_features=0.6 here but 0.5 in the validation cell, so
# the blending weight in `scores` was measured on a slightly different forest -
# confirm which value is intended.
rf = RandomForestRegressor(
    n_estimators=180,
    min_samples_leaf=5,
    max_features=0.6,
)
gb = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=6,
    max_features=0.7,
)
xgb = XGBRegressor(
    n_estimators=80,
    max_depth=7,
    subsample=0.7,
    learning_rate=0.3,
)
cat = CatBoostRegressor(
    silent=True,
    learning_rate=0.052,
    iterations=815,
    depth=12,
    objective='MAE',
)
lgbm = LGBMRegressor(
    learning_rate=0.13,
    min_child_samples=46,
    n_estimators=300,
    num_leaves=80,
    objective='MAE',
)
# The order must match the order used when `scores` was filled.
models = [rf, gb, xgb, cat, lgbm]

tst_preds = []
for model in models:
    model.fit(xs, y)
    tst_preds.append(model.predict(tst_xs))

# Blend the test predictions with the weights obtained on the hold-out rows.
final_preds = weighted_sum(tst_preds, scores)

In [None]:
# Load the submission template; its 'num_sold' column is filled in further down.
result_df = pd.read_csv('sample_submission.csv')

In [None]:
# Attach the blended predictions to a copy of the test features so they can be
# grouped by country without touching tst_xs.
temp_df = tst_xs.copy()
temp_df['num_sold'] = final_preds

In [None]:
# Per-country multipliers for the blended predictions:
#   share  = a country's mean prediction / sum of all per-country means
#   factor = 1 / (share / 0.29), so a country with a larger mean prediction
#            gets a smaller multiplier.
# NOTE(review): 0.29 is a hand-picked constant; its origin is not shown in this
# notebook.
mean_by_country = temp_df.groupby('country')[['num_sold']].mean()
sum_mean_country = mean_by_country.sum()
mean_country_df = 1 / ((mean_by_country/sum_mean_country) / 0.29)
# Display the multipliers.
mean_country_df

In [None]:
def same_mean(df, factors=None):
    """Rescale 'num_sold' in place with one multiplier per country.

    Equalises the predictions by their mean (a workaround for the way the data
    of this competition was generated).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'country' and 'num_sold'. The 'num_sold' column is
        overwritten; nothing is returned.
    factors : pandas.DataFrame, pandas.Series or None
        Multipliers indexed by the values found in ``df['country']``. A
        DataFrame must carry them in a 'num_sold' column. ``None`` (the
        default) uses the notebook-level ``mean_country_df``.

    Notes
    -----
    Every country present in `factors` is handled, not only the codes 1..5.
    Rows whose country has no multiplier are left unchanged.
    """
    if factors is None:
        factors = mean_country_df
    if isinstance(factors, pd.DataFrame):
        factors = factors['num_sold']
    # Vectorised lookup of each row's multiplier; a missing country maps to NaN
    # and is replaced by the neutral factor 1.
    multiplier = df['country'].map(factors).fillna(1.0)
    df['num_sold'] = df['num_sold'] * multiplier

In [None]:
# Apply the per-country multipliers; temp_df['num_sold'] is modified in place.
same_mean(temp_df)

In [None]:
# Round the predictions to zero decimal places.
temp_df['num_sold'] = np.round(temp_df['num_sold'],0)

In [None]:
# Copy the final predictions into the submission frame.
# NOTE(review): the assignment aligns on the index, so this assumes result_df
# and temp_df share the same row index - confirm.
result_df['num_sold'] = temp_df['num_sold']
# Display the submission frame.
result_df

In [None]:
# Write the submission file without the index column.
result_df.to_csv('subm_ens.csv', index = False)