<h3>Packages</h3>

In [1]:
# ---------------------------- IMPORTS -------------------------------:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# --------------------------- FROM ----------------------------------:
from xgboost import XGBClassifier,XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict, cross_validate

# -------------------------- obs -----------------------------------:
%matplotlib inline

<h3>Data</h3>

In [2]:
# Load the training sales table and preview its first rows.
sales = pd.read_csv('sales_train.csv')
sales.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [3]:
# Load the item catalogue and preview its first rows.
items = pd.read_csv('items.csv')
items.head(n=5)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [4]:
# Load the item-category lookup table and preview its first rows.
items_categories = pd.read_csv('item_categories.csv')
items_categories.head(n=5)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [5]:
# Load the shop lookup table and preview its first rows.
shops = pd.read_csv('shops.csv')
shops.head(n=5)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [6]:
# Load the sample submission file and preview its first rows.
sample = pd.read_csv('sample_submission.csv')
sample.head(n=5)

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [7]:
# Load the test set (ID, shop_id, item_id) and preview its first rows.
test = pd.read_csv('test.csv')
test.head(n=5)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


<h3>Preprocessing</h3>

In [8]:
# Re-read the raw sales table so this cell always starts from a clean frame,
# then attach a constant `cnt` column (every row gets the value 1).
sales = pd.read_csv('sales_train.csv').assign(cnt=1)
sales.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,cnt
0,02.01.2013,0,59,22154,999.0,1.0,1
1,03.01.2013,0,25,2552,899.0,1.0,1
2,05.01.2013,0,25,2552,899.0,-1.0,1
3,06.01.2013,0,25,2554,1709.05,1.0,1
4,15.01.2013,0,25,2555,1099.0,1.0,1


In [9]:
# Attach the item attributes (item_name, item_category_id) to each sales row
# via an inner join on `item_id`.
# NOTE: this rebinds `sales`; re-running the cell on an already-merged frame
# would produce suffixed duplicate columns, so re-run the previous cell first.
sales = sales.merge(items, on='item_id')
sales.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,cnt,item_name,item_category_id
0,02.01.2013,0,59,22154,999.0,1.0,1,ЯВЛЕНИЕ 2012 (BD),37
1,23.01.2013,0,24,22154,999.0,1.0,1,ЯВЛЕНИЕ 2012 (BD),37
2,20.01.2013,0,27,22154,999.0,1.0,1,ЯВЛЕНИЕ 2012 (BD),37
3,02.01.2013,0,25,22154,999.0,1.0,1,ЯВЛЕНИЕ 2012 (BD),37
4,03.01.2013,0,25,22154,999.0,1.0,1,ЯВЛЕНИЕ 2012 (BD),37


<h3>Model</h3>

In [10]:
# Feature matrix and target for the model.
# Earlier feature set, kept for reference: ['shop_id', 'item_id', 'item_price'].
# `cnt` is the constant column (value 1) created during preprocessing.
X = sales.loc[:, ['item_price', 'cnt', 'item_category_id', 'shop_id']]
# Target: the per-row sold quantity.
y = sales.loc[:, 'item_cnt_day']

In [11]:
# Split the data into train and hold-out sets (33% held out).
# A fixed random_state makes the partition, and therefore every score computed
# from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Evaluate the model with cross-validation and report the averaged scores.
#
# The target `item_cnt_day` is a numeric quantity stored as floats and includes
# negative values (e.g. -1.0 in the preview above), so this is a regression
# problem. A classifier scored with accuracy/F1 is not applicable here: the
# default F1 scorer only supports binary targets, and XGBoost classifiers
# expect class labels encoded as 0..n_classes-1.
model = XGBRegressor()

cv = 2
# Regression scorers. scikit-learn negates error metrics so that "greater is
# better" holds for every scorer; the neg_* values printed below are <= 0.
metricas = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error']
ans = cross_validate(model, X_train, y_train, return_train_score=True, scoring=metricas, cv=cv)

# Average every array returned by cross_validate (fit/score times plus the
# train/test value of each metric) over the folds.
averages = {name: round(np.mean(values), 4) for name, values in ans.items()}

# cross_validate fits clones of the estimator and leaves `model` untouched, so
# fit it on the full training split to make it usable in later cells.
model.fit(X_train, y_train)

print('========= Model Evaluation ===========')
print('')
for keys, values in averages.items():
    print(keys," = ", str(values))