In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from xgboost import XGBRegressor
from xgboost import plot_importance

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

path = 'data'

train = pd.read_csv(path + '/sales_train.csv.gz')
test = pd.read_csv(path + '/test.csv.gz').set_index('ID')
items = pd.read_csv(path + '/items.csv')
items_cat = pd.read_csv(path + '/item_categories.csv')
shops = pd.read_csv(path + '/shops.csv')
geo = pd.read_csv(path + '/geo_shop.csv')

In [71]:
# Considerando só as lojas que estão no teste.
test_shops = list(set(test['shop_id'].unique()))
mask = train['shop_id'].isin(test_shops)
train = train[mask]

In [72]:
test.head()

Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [73]:
test['date_block_num'] = 34

In [74]:
train = pd.concat([train, test], ignore_index=True, sort=False)

In [75]:
train.fillna(0, inplace=True)
train.tail()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2627441,0,34,45,18454,0.0,0.0
2627442,0,34,45,16188,0.0,0.0
2627443,0,34,45,15757,0.0,0.0
2627444,0,34,45,19648,0.0,0.0
2627445,0,34,45,969,0.0,0.0


In [76]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = pd.merge(shops, geo, left_on='city', right_on='name').fillna(0)
shops.drop(['name'], axis=1, inplace=True)
items_cat['split'] = items_cat['item_category_name'].str.split('-')
items_cat['type'] = items_cat['split'].map(lambda x: x[0].strip())
items_cat['type_code'] = LabelEncoder().fit_transform(items_cat['type'])
items_cat['subtype'] = items_cat['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
items_cat['subtype_code'] = LabelEncoder().fit_transform(items_cat['subtype'])
train['is_year'] = 0
train['is_year'][(train.date_block_num + 1) % 12 == 0]  = 1
train['is_semester'] = 0
train['is_semester'][(train.date_block_num + 1) % 6 == 0]  = 1
train['revenue'] = train['item_price'] * train['item_cnt_day']
col = ['date_block_num', 'item_price', 'item_cnt_day', 'revenue', 'shop_id']
group_index = ['date_block_num', 'shop_id']
aux = train[col].groupby(group_index).mean().reset_index()
aux.rename(columns={'item_cnt_day':'item_cnt-(mean_shop)',
                    'revenue':'revenue-(mean_shop)', 'item_price':'item_price-(mean_shop)'}, inplace=True)
train = pd.merge(train, aux, on=group_index, how='inner')
col = ['date_block_num', 'item_cnt_day', 'revenue', 'item_id']
group_index = ['date_block_num', 'item_id']
aux = train[col].groupby(group_index).mean()
aux.rename(columns={'item_cnt_day':'item_cnt-(mean_item)',
                    'revenue':'revenue-(mean_item)'}, inplace=True)
train = pd.merge(train, aux, left_on=group_index, right_index=True, how='inner')
del aux
train = train[train.item_price < 100000]
train = train[train.item_cnt_day < 1001]
train.drop(484683, axis=0, inplace=True)
train.drop(['date'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [77]:
df_clean = pd.merge(train, shops, on=['shop_id'], how='left')
df_clean = pd.merge(df_clean, items, on=['item_id'], how='left')
df_clean = pd.merge(df_clean, items_cat, on=['item_category_id'], how='left')

In [78]:
df_clean.drop(['shop_name', 'item_name', 'city',
         'item_category_name', 'split',
         'type', 'subtype', 'name_PT-BR'], axis=1, inplace=True)

In [82]:
df_clean['date_block_num'] = df_clean['date_block_num'].astype(np.int8)
df_clean['shop_id'] = df_clean['shop_id'].astype(np.int8)
df_clean['item_id'] = df_clean['item_id'].astype(np.int16)
df_clean['item_price'] = df_clean['item_price'].astype(np.float32)
df_clean['item_cnt_day'] = df_clean['item_cnt_day'].astype(np.int16)
df_clean['is_year'] = df_clean['is_year'].astype(np.int8)
df_clean['is_semester'] = df_clean['is_semester'].astype(np.int8)
df_clean['revenue'] = df_clean['revenue'].astype(np.float32)
df_clean['item_price-(mean_shop)'] = df_clean['item_price-(mean_shop)'].astype(np.float32)
df_clean['item_cnt-(mean_shop)'] = df_clean['item_cnt-(mean_shop)'].astype(np.float32)
df_clean['revenue-(mean_shop)'] = df_clean['revenue-(mean_shop)'].astype(np.float32)
df_clean['item_cnt-(mean_item)'] = df_clean['item_cnt-(mean_item)'].astype(np.float32)
df_clean['revenue-(mean_item)'] = df_clean['revenue-(mean_item)'].astype(np.float32)
df_clean['city_code'] = df_clean['city_code'].astype(np.int16)
df_clean['latitude'] = df_clean['latitude'].astype(np.float32)
df_clean['longitude'] = df_clean['longitude'].astype(np.float32)
df_clean['item_category_id'] = df_clean['item_category_id'].astype(np.int16)
df_clean['type_code'] = df_clean['type_code'].astype(np.int16)
df_clean['subtype_code'] = df_clean['subtype_code'].astype(np.int16)


In [84]:
data = df_clean

In [85]:
X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)

KeyError: "['item_cnt_month'] not found in axis"

ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name more can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name less can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name man can't be aliased because it is another magic command.
