# Purpose
Predict total sales for every product and store in the next month

In [7]:
import pandas as pd
import pyarrow as pa

# Data lookup & preparation

## item_categories.csv
Supplemental information about the item categories

In [8]:
# Load the item-category lookup table.
# Columns:
#   item_category_name - name of the item category
#   item_category_id   - unique identifier of the item category
item_categories = pd.read_csv(
    filepath_or_buffer='C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/Raw data preparation/item_categories.csv',
)
item_categories

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4
...,...,...
79,Служебные,79
80,Служебные - Билеты,80
81,Чистые носители (шпиль),81
82,Чистые носители (штучные),82


### Info

In [9]:
# Show column dtypes, non-null counts and memory usage of the raw frame.
item_categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   item_category_name  84 non-null     object
 1   item_category_id    84 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [10]:
# Largest category id present - a quick range check ahead of the int16 downcast in the next cell.
item_categories['item_category_id'].max()

83

### Type conversion + info

In [11]:
# Store the category id as int16 instead of the default int64, then re-check dtypes and memory.
item_categories = item_categories.astype({'item_category_id': 'int16'})
item_categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   item_category_name  84 non-null     object
 1   item_category_id    84 non-null     int16 
dtypes: int16(1), object(1)
memory usage: 968.0+ bytes


### Saving to .feather format

In [12]:
# Persist the type-converted frame as a Feather file.
item_categories.to_feather('C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/EDA & feather data/item_categories.feather')

## items.csv
Supplemental information about the items/products

In [13]:
# Load the item (product) lookup table.
# Columns:
#   item_name        - name of the item
#   item_id          - unique identifier of a product
#   item_category_id - unique identifier of the item's category
items = pd.read_csv(
    filepath_or_buffer='C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/Raw data preparation/items.csv',
)
items

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40
...,...,...,...
22165,"Ядерный титбит 2 [PC, Цифровая версия]",22165,31
22166,Язык запросов 1С:Предприятия [Цифровая версия],22166,54
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,22167,49
22168,Яйцо для Little Inu,22168,62


### Info

In [14]:
# Show column dtypes, non-null counts and memory usage of the raw frame.
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


### Type conversion + info

In [15]:
# Downcast both id columns from int64 to int16 to reduce memory usage.
# An integer astype() does not raise when a value is out of range for the
# target dtype (the value wraps around instead), so every converted column is
# compared with its source values before it replaces them.
for id_column in ('item_category_id', 'item_id'):
    id_as_int16 = items[id_column].astype('int16')
    assert id_as_int16.eq(items[id_column]).all(), f'{id_column} does not fit in int16'
    items[id_column] = id_as_int16
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int16 
 2   item_category_id  22170 non-null  int16 
dtypes: int16(2), object(1)
memory usage: 259.9+ KB


### Saving to .feather format

In [16]:
# Persist the type-converted frame as a Feather file.
items.to_feather('C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/EDA & feather data/items.feather')

## sales_train.csv
The training set. Daily historical data from January 2013 to October 2015

In [17]:
# Load the daily sales history (training set).
# Columns:
#   date           - sale date as a dd.mm.yyyy string (e.g. 02.01.2013)
#   date_block_num - consecutive month number, used for convenience:
#                    January 2013 is 0, February 2013 is 1, ..., October 2015 is 33
#   shop_id        - unique identifier of a shop
#   item_id        - unique identifier of a product
#   item_price     - current price of an item
#   item_cnt_day   - number of products sold on that day; the prediction target
#                    is the monthly amount of this measure
sales_train = pd.read_csv(
    filepath_or_buffer='C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/Raw data preparation/sales_train.csv',
)
sales_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,10.10.2015,33,25,7409,299.00,1.0
2935845,09.10.2015,33,25,7460,299.00,1.0
2935846,14.10.2015,33,25,7459,349.00,1.0
2935847,22.10.2015,33,25,7440,299.00,1.0


### Info

In [18]:
# Show column dtypes and memory usage of the raw frame.
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


### Describe

In [19]:
# Summary statistics of the numeric columns; min/max show the value range of each column.
sales_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


### Type conversion + info

In [20]:
# Downcast the numeric columns to reduce memory usage.
# `date` is not converted here and stays a string (object dtype).

# Integer columns -> int16. An integer astype() does not raise when a value is
# out of range for the target dtype (the value wraps around instead), so every
# converted column is compared with its source values before it replaces them.
for int_column in ('date_block_num', 'shop_id', 'item_id'):
    as_int16 = sales_train[int_column].astype('int16')
    assert as_int16.eq(sales_train[int_column]).all(), f'{int_column} does not fit in int16'
    sales_train[int_column] = as_int16

# Float columns -> float32. No equality guard here: float32 rounding makes an
# exact comparison with the float64 source values inappropriate.
for float_column in ('item_price', 'item_cnt_day'):
    sales_train[float_column] = sales_train[float_column].astype('float32')

sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int16  
 2   shop_id         int16  
 3   item_id         int16  
 4   item_price      float32
 5   item_cnt_day    float32
dtypes: float32(2), int16(3), object(1)
memory usage: 61.6+ MB


### Saving to .feather format

In [21]:
# Persist the type-converted frame as a Feather file.
sales_train.to_feather('C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/EDA & feather data/sales_train.feather')

## sample_submission.csv
A sample submission file in the correct format

In [22]:
# Load the sample submission file.
# Columns:
#   ID             - an Id that represents a (Shop, Item) tuple within the test set
#   item_cnt_month - presumably the predicted monthly item count for that ID
#                    (the sample file holds a constant placeholder) - TODO confirm
sample_submission = pd.read_csv(
    filepath_or_buffer='C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/Raw data preparation/sample_submission.csv',
)
sample_submission

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
...,...,...
214195,214195,0.5
214196,214196,0.5
214197,214197,0.5
214198,214198,0.5


### Info

In [23]:
# Show column dtypes, non-null counts and memory usage of the raw frame.
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              214200 non-null  int64  
 1   item_cnt_month  214200 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 3.3 MB


### Describe

In [24]:
# Summary statistics of the numeric columns; min/max show the value range of each column.
sample_submission.describe()

Unnamed: 0,ID,item_cnt_month
count,214200.0,214200.0
mean,107099.5,0.5
std,61834.358168,0.0
min,0.0,0.5
25%,53549.75,0.5
50%,107099.5,0.5
75%,160649.25,0.5
max,214199.0,0.5


### Type conversion + info

In [25]:
# Narrow both columns (int64 -> int32, float64 -> float32), then re-check dtypes and memory.
sample_submission = sample_submission.astype({
    'ID': 'int32',
    'item_cnt_month': 'float32',
})
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              214200 non-null  int32  
 1   item_cnt_month  214200 non-null  float32
dtypes: float32(1), int32(1)
memory usage: 1.6 MB


### Saving to .feather format

In [26]:
# Persist the type-converted frame as a Feather file.
sample_submission.to_feather('C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/EDA & feather data/sample_submission.feather')

## shops.csv
Supplemental information about the shops

In [27]:
# Load the shop lookup table.
# Columns:
#   shop_name - name of the shop
#   shop_id   - unique identifier of a shop
shops = pd.read_csv(
    filepath_or_buffer='C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/Raw data preparation/shops.csv',
)
shops

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


In [28]:
# Largest shop id present - a quick range check ahead of the int16 downcast in the next cell.
shops['shop_id'].max()

59

### Type conversion + info

In [29]:
# Store the shop id as int16 instead of the default int64, then check dtypes and memory.
shops = shops.astype({'shop_id': 'int16'})
shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   shop_name  60 non-null     object
 1   shop_id    60 non-null     int16 
dtypes: int16(1), object(1)
memory usage: 728.0+ bytes


### Saving to .feather format

In [30]:
# Persist the type-converted frame as a Feather file.
shops.to_feather('C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/EDA & feather data/shops.feather')

## test.csv
The test set. You need to forecast the sales for these shops and products for November 2015

In [31]:
# Load the test set: the (shop, item) pairs to forecast.
# Columns:
#   ID      - an Id that represents a (Shop, Item) tuple within the test set
#   shop_id - unique identifier of a shop
#   item_id - unique identifier of a product
test = pd.read_csv(
    filepath_or_buffer='C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/Raw data preparation/test.csv',
)
test

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268
...,...,...,...
214195,214195,45,18454
214196,214196,45,16188
214197,214197,45,15757
214198,214198,45,19648


### Info

In [32]:
# Show column dtypes, non-null counts and memory usage of the raw frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   ID       214200 non-null  int64
 1   shop_id  214200 non-null  int64
 2   item_id  214200 non-null  int64
dtypes: int64(3)
memory usage: 4.9 MB


### Describe

In [33]:
# Summary statistics of the numeric columns; min/max show the value range of each column.
test.describe()

Unnamed: 0,ID,shop_id,item_id
count,214200.0,214200.0,214200.0
mean,107099.5,31.642857,11019.398627
std,61834.358168,17.561933,6252.64459
min,0.0,2.0,30.0
25%,53549.75,16.0,5381.5
50%,107099.5,34.5,11203.0
75%,160649.25,47.0,16071.5
max,214199.0,59.0,22167.0


### Type conversion + info

In [34]:
# Downcast the integer columns to reduce memory usage: ID to int32, the two
# id columns to int16.
# An integer astype() does not raise when a value is out of range for the
# target dtype (the value wraps around instead), so every converted column is
# compared with its source values before it replaces them.
for int_column, target_dtype in (('ID', 'int32'), ('shop_id', 'int16'), ('item_id', 'int16')):
    narrowed = test[int_column].astype(target_dtype)
    assert narrowed.eq(test[int_column]).all(), f'{int_column} does not fit in {target_dtype}'
    test[int_column] = narrowed
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   ID       214200 non-null  int32
 1   shop_id  214200 non-null  int16
 2   item_id  214200 non-null  int16
dtypes: int16(2), int32(1)
memory usage: 1.6 MB


### Saving to .feather format

In [35]:
# Persist the type-converted frame as a Feather file.
test.to_feather('C:/Users/nowic/Desktop/Home tasks/Pets/Future Sales/EDA & feather data/test.feather')