In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import gc
%matplotlib inline

In [2]:
train = pd.read_csv("../input/sales_train.csv")
items = pd.read_csv("../input/items.csv")
shops = pd.read_csv("../input/shops.csv")
cats = pd.read_csv("../input/item_categories.csv")
test = pd.read_csv("../input/test.csv").set_index('ID')

In [3]:
train.head(2)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0


In [4]:
items.head(7)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40
5,***НОВЫЕ АМЕРИКАНСКИЕ ГРАФФИТИ (UNI) ...,5,40
6,***УДАР ПО ВОРОТАМ (UNI) D,6,40


In [5]:
shops.head(7)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6


In [6]:
cats.head(7)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4
5,Аксессуары - PSVita,5
6,Аксессуары - XBOX 360,6


In [7]:
test.head(2)

Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320


In [8]:
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]
shops.head()

Unnamed: 0,shop_id,city_code
0,0,0
1,1,0
2,2,1
3,3,2
4,4,3


In [9]:
cats['type'] = cats['item_category_name'].str.split('-').map(lambda x: x[0].strip())
cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
# if subtype is nan then type
cats['subtype'] = cats['item_category_name'].str.split('-').map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_code', 'subtype_code']]

In [10]:
cats.head()

Unnamed: 0,item_category_id,type_code,subtype_code
0,0,0,29
1,1,1,9
2,2,1,10
3,3,1,11
4,4,1,13


In [11]:
train[train['item_price'] < 0]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
484683,15.05.2013,4,32,2973,-1.0,1.0


In [12]:
median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median

In [13]:
train['date'] = pd.to_datetime(train['date'])
train['month'] = train['date'].apply(lambda time: time.month)
train['year']  = train['date'].apply(lambda time: time.year)

In [14]:
train['revenue'] = train['item_price'] *  train['item_cnt_day']

In [15]:
test['date_block_num'] = 34
test['month'] = 11
test['year'] = 2015

In [16]:
total = pd.concat([train.drop('date',axis=1), test], ignore_index=True, sort=False)
total.fillna(0,inplace=True)
total.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,month,year,revenue
0,0,59,22154,999.0,1.0,2,2013,999.0
1,0,25,2552,899.0,1.0,3,2013,899.0
2,0,25,2552,899.0,-1.0,5,2013,-899.0
3,0,25,2554,1709.05,1.0,6,2013,1709.05
4,0,25,2555,1099.0,1.0,1,2013,1099.0


In [17]:
group = total.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']

total = pd.merge(total, group, on=['date_block_num','shop_id','item_id'], how='left')
total['item_cnt_month'] = total['item_cnt_month'].fillna(0)

total.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,month,year,revenue,item_cnt_month
0,0,59,22154,999.0,1.0,2,2013,999.0,1.0
1,0,25,2552,899.0,1.0,3,2013,899.0,0.0
2,0,25,2552,899.0,-1.0,5,2013,-899.0,0.0
3,0,25,2554,1709.05,1.0,6,2013,1709.05,1.0
4,0,25,2555,1099.0,1.0,1,2013,1099.0,1.0


In [18]:
items.drop('item_name',axis=1,inplace=True)
total = pd.merge(total, items, on='item_id', how='left')

total = pd.merge(total, cats, on='item_category_id', how='left')

total = pd.merge(total, shops, on='shop_id', how='left')
total.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,month,year,revenue,item_cnt_month,item_category_id,type_code,subtype_code,city_code
0,0,59,22154,999.0,1.0,2,2013,999.0,1.0,37,11,1,31
1,0,25,2552,899.0,1.0,3,2013,899.0,0.0,58,13,27,14
2,0,25,2552,899.0,-1.0,5,2013,-899.0,0.0,58,13,27,14
3,0,25,2554,1709.05,1.0,6,2013,1709.05,1.0,58,13,27,14
4,0,25,2555,1099.0,1.0,1,2013,1099.0,1.0,56,13,3,14


In [19]:
def lag(lags, col):
    
    for i in lags:
        
        total[col+'_lag_'+str(i)] = total[col].shift(i)
        
    total.fillna(0, inplace=True)

In [20]:
lag([1,2,3,6,12], 'item_cnt_month')

In [21]:
group = total.groupby(['date_block_num']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_avg_item_cnt']

total = pd.merge(total, group, on=['date_block_num'], how='left')
total['date_avg_item_cnt'] = total['date_avg_item_cnt'].fillna(0)

lag([1,2,3,6,12], 'date_avg_item_cnt')

In [22]:
group = total.groupby(['date_block_num','item_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_avg_item_cnt']

total = pd.merge(total, group, on=['date_block_num','item_id'], how='left')
total['date_item_avg_item_cnt'] = total['date_item_avg_item_cnt'].fillna(0)

lag([1,2,3,6,12], 'date_item_avg_item_cnt')

In [23]:
group = total.groupby(['date_block_num','shop_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_avg_item_cnt']

total = pd.merge(total, group, on=['date_block_num','shop_id'], how='left')
total['date_shop_avg_item_cnt'] = total['date_shop_avg_item_cnt'].fillna(0)

lag([1,2,3,6,12], 'date_shop_avg_item_cnt')

In [24]:
group = total.groupby(['date_block_num','shop_id','item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_cat_avg_item_cnt']

total = pd.merge(total, group, on=['date_block_num','shop_id','item_category_id'], how='left')
total['date_shop_cat_avg_item_cnt'] = total['date_shop_cat_avg_item_cnt'].fillna(0)

lag([1], 'date_shop_avg_item_cnt')

In [25]:
group = total.groupby(['date_block_num','item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_cat_avg_item_cnt']

total = pd.merge(total, group, on=['date_block_num','item_category_id'], how='left')
total['date_cat_avg_item_cnt'] = total['date_cat_avg_item_cnt'].fillna(0)

lag([1], 'date_shop_avg_item_cnt')

In [26]:
group = total.groupby(['date_block_num','city_code','item_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_city_item_avg_item_cnt']

total = pd.merge(total, group, on=['date_block_num','city_code','item_id'], how='left')
total['date_city_item_avg_item_cnt'] = total['date_city_item_avg_item_cnt'].fillna(0)

lag([1], 'date_shop_avg_item_cnt')

In [27]:
#total= total[total['date_block_num'] > 11 ]
#total.head(5)

In [28]:
X_train = total[total.date_block_num < 33].drop(['item_cnt_month'], axis=1)
y_train = total[total.date_block_num < 33]['item_cnt_month']
X_valid = total[total.date_block_num == 33].drop(['item_cnt_month'], axis=1)
y_valid = total[total.date_block_num == 33]['item_cnt_month']
X_test = total[total.date_block_num == 34].drop(['item_cnt_month'], axis=1)


In [29]:
del train
del items
del shops
del cats 
del group
del total
gc.collect()

309

In [30]:
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	validation_0-rmse:27.7888	validation_1-rmse:45.9931
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:25.1934	validation_1-rmse:44.1411
[2]	validation_0-rmse:22.8466	validation_1-rmse:42.5932
[3]	validation_0-rmse:20.8065	validation_1-rmse:40.7403
[4]	validation_0-rmse:18.9256	validation_1-rmse:39.6901
[5]	validation_0-rmse:17.2376	validation_1-rmse:38.7719
[6]	validation_0-rmse:15.7355	validation_1-rmse:37.9418
[7]	validation_0-rmse:14.3595	validation_1-rmse:36.7106
[8]	validation_0-rmse:13.1694	validation_1-rmse:36.1572
[9]	validation_0-rmse:12.1204	validation_1-rmse:35.717
[10]	validation_0-rmse:11.191	validation_1-rmse:35.3501
[11]	validation_0-rmse:10.3738	validation_1-rmse:34.9948
[12]	validation_0-rmse:9.65609	validation_1-rmse:34.6951
[13]	validation_0-rmse:9.03927	validation_1-rmse:34.488
[14]	validation_0-rmse:8.4543	validation_1-rmse:34.0116


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eta=0.3, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=300, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)

In [31]:
y_pred = model.predict(X_valid).clip(0, 20)
y_test = model.predict(X_test).clip(0, 20)

In [32]:
submission = pd.DataFrame({"ID":test.index, "item_cnt_month":y_test})
submission.to_csv('submission.csv',index=False)