In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import lightgbm
from sklearn.metrics import mean_squared_error
import warnings
import gc


# Fix NumPy's global RNG seed so stochastic steps repeat across runs, and
# suppress every warning category for the rest of the session.
np.random.seed(seed=4590)
warnings.simplefilter('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [29]:
# All five input tables live in the same folder. Keep that folder in a single
# constant so relocating the data means editing one line instead of five.
# The resulting file paths are identical to the previously hard-coded ones.
DATA_DIR = (
    '/Users/sinsakuokazaki/Project/pred-future-sales/'
    'input/competitive-data-science-predict-future-sales/'
)

# Sales records (the preview in the next cell shows the available columns).
sales_train = pd.read_csv(DATA_DIR + 'sales_train.csv')

# Test set.
test = pd.read_csv(DATA_DIR + 'test.csv')

# Lookup tables that are merged onto the aggregated sales further down.
item_categories = pd.read_csv(DATA_DIR + 'item_categories.csv')
items = pd.read_csv(DATA_DIR + 'items.csv')
shops = pd.read_csv(DATA_DIR + 'shops.csv')

In [30]:
# Preview the first rows of the loaded sales table. The output shows that this
# CSV already carries columns such as dayofweek, month, quarter, year,
# total_sales, shop_name, item_name and item_category_name.
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,dayofweek,dayofmonth,month,quarter,year,minute,hour,total_sales,shop_name,item_name,item_category_id,item_category_name
0,2013-01-02,0,59,22154,999.0,1.0,2,2,1,1,2013,0,0,999.0,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,2013-01-03,0,25,2552,899.0,1.0,3,3,1,1,2013,0,0,899.0,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,2013-01-05,0,25,2552,899.0,-1.0,5,5,1,1,2013,0,0,-899.0,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
3,2013-01-06,0,25,2554,1709.05,1.0,6,6,1,1,2013,0,0,1709.05,"Москва ТРК ""Атриум""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
4,2013-01-15,0,25,2555,1099.0,1.0,1,15,1,1,2013,0,0,1099.0,"Москва ТРК ""Атриум""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства


In [31]:
# Revenue per sales record: unit price multiplied by the number of units.
# Rows with a negative item_cnt_day therefore get a negative total_sales.
sales_train['total_sales'] = sales_train['item_price'].mul(sales_train['item_cnt_day'])

In [32]:
# Discard the raw date column. Re-running this cell on the already-trimmed
# frame raises KeyError because the column no longer exists.
sales_train = sales_train.drop(columns=["date"])

In [34]:
# Confirm that the 'date' column is gone and total_sales is populated.
sales_train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,dayofweek,dayofmonth,month,quarter,year,minute,hour,total_sales,shop_name,item_name,item_category_id,item_category_name
0,0,59,22154,999.0,1.0,2,2,1,1,2013,0,0,999.0,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,0,25,2552,899.0,1.0,3,3,1,1,2013,0,0,899.0,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,0,25,2552,899.0,-1.0,5,5,1,1,2013,0,0,-899.0,"Москва ТРК ""Атриум""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
3,0,25,2554,1709.05,1.0,6,6,1,1,2013,0,0,1709.05,"Москва ТРК ""Атриум""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
4,0,25,2555,1099.0,1.0,1,15,1,1,2013,0,0,1099.0,"Москва ТРК ""Атриум""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства


In [36]:
# Collapse the sales records to one row per (date_block_num, item_id, shop_id),
# summing both the unit count and the revenue, then turn the group keys back
# into ordinary columns.
# The summed count keeps the name item_cnt_day; a rename to item_cnt_month was
# considered but is intentionally left disabled.
train = (
    sales_train
    .groupby(['date_block_num', 'item_id', 'shop_id'])
    .agg({"item_cnt_day": "sum", "total_sales": "sum"})
    .reset_index()
)

In [37]:
# Inspect the aggregated frame: group keys plus summed item_cnt_day and total_sales.
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,total_sales
0,0,19,25,1.0,28.0
1,0,27,1,1.0,1890.0
2,0,27,2,1.0,2499.0
3,0,27,10,1.0,1890.0
4,0,27,19,1.0,2499.0


In [38]:
# Attach the item attributes to every aggregated row; the left join keeps all
# rows of train even when an item_id has no match in items.
train = train.merge(items, how='left', on='item_id')

In [39]:
# Attach the shop attributes to every aggregated row; the left join keeps all
# rows of train even when a shop_id has no match in shops.
train = train.merge(shops, how='left', on='shop_id')

In [40]:
# Verify that the merges added item_name, item_category_id and shop_name.
train.head()

Unnamed: 0,date_block_num,item_id,shop_id,item_cnt_day,total_sales,item_name,item_category_id,shop_name
0,0,19,25,1.0,28.0,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,"Москва ТРК ""Атриум"""
1,0,27,1,1.0,1890.0,"007 Legends [PS3, русская версия]",19,"!Якутск ТЦ ""Центральный"" фран"
2,0,27,2,1.0,2499.0,"007 Legends [PS3, русская версия]",19,"Адыгея ТЦ ""Мега"""
3,0,27,10,1.0,1890.0,"007 Legends [PS3, русская версия]",19,Жуковский ул. Чкалова 39м?
4,0,27,19,1.0,2499.0,"007 Legends [PS3, русская версия]",19,"Курск ТЦ ""Пушкинский"""


In [44]:
# Build a lookup with one row per date_block_num holding the maximum month,
# year and quarter seen in that block, with date_block_num as a regular column.
dates = (
    sales_train
    .groupby(['date_block_num'])[['month', 'year', 'quarter']]
    .max()
    .reset_index()
)

In [46]:
# Add month, year and quarter to train by joining the per-block lookup.
train = train.merge(dates, how='left', on='date_block_num')

In [48]:
# Persist the enriched training frame next to the input tables, without
# writing the row index as an extra column.
train.to_csv(
    '/Users/sinsakuokazaki/Project/pred-future-sales/'
    'input/competitive-data-science-predict-future-sales/train.csv',
    index=False,
)

In [48]:
# item_price can change within a month.
# Fixed: the frame was referenced as `sales_tarin`, a misspelling of
# `sales_train` that raises NameError on a fresh kernel.
# NOTE(review): count() returns the number of non-null price records per
# (item_id, date_block_num); nunique() would count distinct prices instead.
# Confirm which one this check was meant to show.
sales_train.groupby(['item_id', 'date_block_num'])['item_price'].count()

item_id  date_block_num
0        20                  1
1        15                  2
         18                  1
         19                  1
         20                  1
         21                  1
2        19                  1
         22                  1
3        18                  1
         19                  1
4        20                  1
5        23                  1
6        18                  1
7        23                  1
8        19                  1
         20                  1
9        19                  1
10       24                  1
11       22                  1
12       1                   1
13       20                  1
14       23                  1
15       22                  1
16       20                  1
17       20                  1
18       19                  1
19       0                   1
20       19                  1
21       20                  1
22       27                  1
                          ... 
22167    9     

In [3]:
# Reload the training frame that an earlier cell saved as train.csv.
train = pd.read_csv(
    '/Users/sinsakuokazaki/Project/pred-future-sales/'
    'input/competitive-data-science-predict-future-sales/train.csv'
)

In [6]:
# Load the item-category lookup table (the same file is also read in an
# earlier cell of this notebook).
item_categories = pd.read_csv(
    '/Users/sinsakuokazaki/Project/pred-future-sales/'
    'input/competitive-data-science-predict-future-sales/item_categories.csv'
)

In [7]:
# Preview the category lookup: item_category_name and item_category_id.
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [11]:
# Attach the category attributes to each row via item_category_id; the left
# join keeps every row of train.
train = train.merge(item_categories, how='left', on='item_category_id')

In [13]:
# Write the frame, now including the category columns, back to the same
# train.csv it was loaded from; the row index is not written.
train.to_csv(
    '/Users/sinsakuokazaki/Project/pred-future-sales/'
    'input/competitive-data-science-predict-future-sales/train.csv',
    index=False,
)