In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the five input tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
df_itm_cat, df_itm, df_sh, df_tr, df_te = (
    pd.read_csv(csv_path)
    for csv_path in (
        '00-Input/item_categories.csv',
        '00-Input/items.csv',
        '00-Input/shops.csv',
        '00-Input/sales_train.csv',
        '00-Input/test.csv',
    )
)

In [3]:
# Cleaning shops data
# Remap shop ids 0 -> 57, 1 -> 58 and 10 -> 11 in both the training and the
# test frame, one id at a time.
for old_id, new_id in ((0, 57), (1, 58), (10, 11)):
    for frame in (df_tr, df_te):
        frame.loc[frame.shop_id == old_id, 'shop_id'] = new_id

In [4]:
# Show the shops whose name contains the literal substring '! '
# (regex=False, so the pattern is matched as plain text).
df_sh.loc[df_sh.shop_name.str.contains('! ', regex=False)]

Unnamed: 0,city,shop_name,shop_id
0,Yakutsk,"! Yakutsk Ordzhonikidze, 56 Franc",0
1,Yakutsk,"! Yakutsk TC ""Central"" Franc",1


In [5]:
# Remove a leading "!" marker from shop names together with the whitespace
# that follows it, so the cleaned name does not start with a stray space.
# Only the prefix is touched: a "!" later in the name is preserved, and
# missing names pass through as NaN instead of raising.
df_sh['shop_name'] = df_sh['shop_name'].str.replace(r'^!\s*', '', regex=True)

In [6]:
# Parse the day.month.year strings in 'date' into datetime values, then
# display the frame.
df_tr['date'] = pd.to_datetime(df_tr.date, format='%d.%m.%Y')
df_tr

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.00,1.0
1,2013-01-03,0,25,2552,899.00,1.0
2,2013-01-05,0,25,2552,899.00,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,2015-10-10,33,25,7409,299.00,1.0
2935845,2015-10-09,33,25,7460,299.00,1.0
2935846,2015-10-14,33,25,7459,349.00,1.0
2935847,2015-10-22,33,25,7440,299.00,1.0


In [7]:
# Inspect the dtype of every column in the training frame.
df_tr.dtypes

date              datetime64[ns]
date_block_num             int64
shop_id                    int64
item_id                    int64
item_price               float64
item_cnt_day             float64
dtype: object

In [8]:
# Rename the count column in place; the pivot further down sums it per
# date_block_num, so it is referred to as a monthly count from here on.
df_tr.rename(columns={'item_cnt_day': 'item_cnt_month'}, inplace=True)

In [9]:
# Build a pivot table with one row per (shop_id, item_id) pair and one column
# per date_block_num. Each cell is the sum of item_cnt_month for that pair in
# that block; combinations with no sales are filled with 0.

dataset = pd.pivot_table(
    df_tr,
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    values=['item_cnt_month'],
    aggfunc='sum',
    fill_value=0,
)
dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month
Unnamed: 0_level_1,date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
shop_id,item_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,27,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,31,0,4,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,32,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,33,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,22154,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,22155,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,22162,0,0,0,0,0,0,0,0,0,0,...,0,9,4,1,1,0,0,1,0,0
59,22164,0,0,0,0,0,0,0,0,0,0,...,0,2,1,2,0,0,1,0,0,0


In [10]:
# Turn the (shop_id, item_id) index back into ordinary columns so the table
# can be joined on them.

dataset = dataset.reset_index()

In [11]:
# Left-join the pivot table onto the test set: every test row is kept, and
# (shop_id, item_id) pairs with no history in the pivot get NaN counts.

dataset = df_te.merge(dataset, how='left', on=['item_id', 'shop_id'])
dataset

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,ID,shop_id,item_id,"(item_cnt_month, 0)","(item_cnt_month, 1)","(item_cnt_month, 2)","(item_cnt_month, 3)","(item_cnt_month, 4)","(item_cnt_month, 5)","(item_cnt_month, 6)",...,"(item_cnt_month, 24)","(item_cnt_month, 25)","(item_cnt_month, 26)","(item_cnt_month, 27)","(item_cnt_month, 28)","(item_cnt_month, 29)","(item_cnt_month, 30)","(item_cnt_month, 31)","(item_cnt_month, 32)","(item_cnt_month, 33)"
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,,,,,,,,...,,,,,,,,,,
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,214195,45,18454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
214196,214196,45,16188,,,,,,,,...,,,,,,,,,,
214197,214197,45,15757,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,214198,45,19648,,,,,,,,...,,,,,,,,,,


In [12]:
# Replace every NaN with 0, then preview the first rows.
dataset = dataset.fillna(0)
dataset.head()

Unnamed: 0,ID,shop_id,item_id,"(item_cnt_month, 0)","(item_cnt_month, 1)","(item_cnt_month, 2)","(item_cnt_month, 3)","(item_cnt_month, 4)","(item_cnt_month, 5)","(item_cnt_month, 6)",...,"(item_cnt_month, 24)","(item_cnt_month, 25)","(item_cnt_month, 26)","(item_cnt_month, 27)","(item_cnt_month, 28)","(item_cnt_month, 29)","(item_cnt_month, 30)","(item_cnt_month, 31)","(item_cnt_month, 32)","(item_cnt_month, 33)"
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Drop the identifier columns (shop_id, item_id, ID) so that only the
# per-block count columns remain, then preview the result.

dataset = dataset.drop(columns=['shop_id', 'item_id', 'ID'])
dataset.head()

Unnamed: 0,"(item_cnt_month, 0)","(item_cnt_month, 1)","(item_cnt_month, 2)","(item_cnt_month, 3)","(item_cnt_month, 4)","(item_cnt_month, 5)","(item_cnt_month, 6)","(item_cnt_month, 7)","(item_cnt_month, 8)","(item_cnt_month, 9)",...,"(item_cnt_month, 24)","(item_cnt_month, 25)","(item_cnt_month, 26)","(item_cnt_month, 27)","(item_cnt_month, 28)","(item_cnt_month, 29)","(item_cnt_month, 30)","(item_cnt_month, 31)","(item_cnt_month, 32)","(item_cnt_month, 33)"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Training inputs: every column except the last, with a trailing axis of
# length 1 appended.
X_train = dataset.values[:, :-1, np.newaxis]
# Training label: the last column, kept two-dimensional.
y_train = dataset.values[:, -1:]

# Test inputs: every column except the first, with the same trailing axis.
X_test = dataset.values[:, 1:, np.newaxis]

# Print the three shapes.
print(X_train.shape, y_train.shape, X_test.shape)

(214200, 33, 1) (214200, 1) (214200, 33, 1)


## LSTM

In [15]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing


In [None]:


# Start from a clean Keras state so re-running this cell does not accumulate
# layers/graphs from previous runs.
tf.keras.backend.clear_session()

# Number of time steps per sample, read from the training array so the model
# always matches the data. X_train is built as (samples, steps, 1); a
# hard-coded 32 does not match the 33 steps printed for X_train/X_test.
n_timesteps = X_train.shape[1]

lstm_model = tf.keras.Sequential([
    # The inputs already carry the trailing feature axis, so they can be fed
    # to the LSTM directly without a Reshape layer.
    tf.keras.Input(shape=(n_timesteps, 1)),
    tf.keras.layers.LSTM(units=32),
    tf.keras.layers.Dropout(0.4),
    # Single regression output: the count for the next block.
    tf.keras.layers.Dense(1)
])

lstm_model.compile(
    loss='mse',
    # NOTE(review): 0.1 is far above Adam's default learning rate of 0.001;
    # kept as in the original, but confirm it is intentional.
    optimizer=tf.keras.optimizers.Adam(0.1),
    metrics=['mse']
)

lstm_model.summary()