In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
df = pd.read_csv('data/train.csv')
df_pred = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

In [3]:
df

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...,...
230125,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [4]:
df_pred

Unnamed: 0,id,date,country,store,product
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle
2,230132,2017-01-01,Canada,Discount Stickers,Kaggle Tiers
3,230133,2017-01-01,Canada,Discount Stickers,Kerneler
4,230134,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode
...,...,...,...,...,...
98545,328675,2019-12-31,Singapore,Premium Sticker Mart,Holographic Goose
98546,328676,2019-12-31,Singapore,Premium Sticker Mart,Kaggle
98547,328677,2019-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers
98548,328678,2019-12-31,Singapore,Premium Sticker Mart,Kerneler


## Знакомство с данными
- *num_sold* - 8871 nan значений (целевой признак)
- *country* - 'Canada', 'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore'
- *store* - 'Discount Stickers', 'Stickers for Less', 'Premium Sticker Mart'
- *product* - 'Holographic Goose', 'Kaggle', 'Kaggle Tiers', 'Kerneler', 'Kerneler Dark Mode'
- *date* - будем думать

In [5]:
# Удаляю id
df.drop(columns='id', inplace=True)
df_pred.drop(columns='id', inplace=True)

In [6]:
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

df_pred['date'] = pd.to_datetime(df['date'])
df_pred['year'] = df_pred['date'].dt.year
df_pred['month'] = df_pred['date'].dt.month
df_pred['day'] = df_pred['date'].dt.day


# удаляю date
df.drop(columns='date', inplace=True)
df_pred.drop(columns='date', inplace=True)

In [7]:
lst_features = ['country', 'store', 'product', 'year', 'month', 'day']

In [8]:
df.dropna(subset=['num_sold'], inplace=True)

In [9]:
df

Unnamed: 0,country,store,product,num_sold,year,month,day
1,Canada,Discount Stickers,Kaggle,973.0,2010,1,1
2,Canada,Discount Stickers,Kaggle Tiers,906.0,2010,1,1
3,Canada,Discount Stickers,Kerneler,423.0,2010,1,1
4,Canada,Discount Stickers,Kerneler Dark Mode,491.0,2010,1,1
5,Canada,Stickers for Less,Holographic Goose,300.0,2010,1,1
...,...,...,...,...,...,...,...
230125,Singapore,Premium Sticker Mart,Holographic Goose,466.0,2016,12,31
230126,Singapore,Premium Sticker Mart,Kaggle,2907.0,2016,12,31
230127,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0,2016,12,31
230128,Singapore,Premium Sticker Mart,Kerneler,1242.0,2016,12,31


In [10]:
X = df.drop(columns='num_sold')
y = df['num_sold']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
cat_model = CatBoostRegressor(verbose=0, eval_metric='MAPE', cat_features=lst_features, random_seed=42)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = cat_model.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
pred = cat_model.predict(df_pred)
mape

0.16592267695366864

In [12]:
df_pred_duble = pd.read_csv('data/test.csv')
df_pred_Id = df_pred_duble['id']

output = pd.DataFrame({'id': df_pred_Id, 'num_sold': pred})
output.to_csv('data/cat_model.csv', index=False)

array([ 149.51143247,  717.68899866,  592.82169977, ..., 1585.99520778,
        870.84721258, 1029.3465368 ])