# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# File descriptions
* **sales_train.csv** - the training set. Daily historical data from January 2013 to October 2015.
* **test.csv** - the test set. You need to forecast the sales for these shops and products for November 2015.
* **sample_submission.csv** - a sample submission file in the correct format.
* **items.csv** - supplemental information about the items/products.
* **item_categories.csv** - supplemental information about the item categories.
* **shops.csv** - supplemental information about the shops.

# Data fields
* **ID** - an Id that represents a (Shop, Item) tuple within the test set
* **shop_id** - unique identifier of a shop
* **item_id** - unique identifier of a product
* **item_category_id** - unique identifier of item category
* **item_cnt_day** - number of products sold. You are predicting a monthly amount of this measure
* **item_price** - current price of an item
* **date** - date in format dd.mm.yyyy (e.g. 02.01.2013)
* **date_block_num** - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
* **item_name** - name of item
* **shop_name** - name of shop
* **item_category_name** - name of item category

# Reading Data Set

In [4]:
# Load the four competition tables from CSV files in the working directory.
# The files are read in the order listed and bound to the names on the left.
sales_train, items, item_categories, shops = map(
    pd.read_csv,
    ("sales_train.csv", "items.csv", "item_categories.csv", "shops.csv"),
)

# Getting basic insight into the data frames

In [6]:
# Report the (rows, columns) shape of every table, one table per output line.
# The pieces are unpacked into a single print call so the default " " separator
# produces exactly the same text as passing them as individual arguments.
shape_report = [
    " sales_train -> ", sales_train.shape, "\n",
    "items -> ", items.shape, "\n",
    "item_categories -> ", item_categories.shape, "\n",
    "shops -> ", shops.shape,
]
print(*shape_report)

 sales_train ->  (2935849, 6) 
 items ->  (22170, 3) 
 item_categories ->  (84, 2) 
 shops ->  (60, 2)


### ----------------------------------------->>     sales_train  <<------------------------------------------------------###

In [23]:
# Column labels of sales_train together with the number of columns.
sales_train.columns, sales_train.shape[1]

(Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
        'item_cnt_day'],
       dtype='object'), 6)

In [15]:
# Preview the first rows of the daily sales records (head() defaults to 5 rows).
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [19]:
# Column dtypes and memory footprint of sales_train.
# NOTE(review): in the recorded output `date` has dtype object, i.e. it is held as
# text rather than as a datetime.
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


In [10]:
# Summary statistics for the numeric columns of sales_train.
# NOTE(review): the recorded output shows min item_price = -1 and min
# item_cnt_day = -22, and maxima (307980 and 2169) far above the 75th percentile —
# presumably returns / bad records / outliers; confirm before modeling.
sales_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


### ----------------------------------------->>     items    <<------------------------------------------------------###

In [25]:
# Column labels of items together with the number of columns.
items.columns, items.shape[1]

(Index(['item_name', 'item_id', 'item_category_id'], dtype='object'), 3)

In [26]:
# Preview the first rows of the item catalogue (head() defaults to 5 rows).
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [27]:
# Column dtypes and non-null counts of items; in the recorded output every column
# has as many non-null values as there are rows (22170).
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
item_name           22170 non-null object
item_id             22170 non-null int64
item_category_id    22170 non-null int64
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


In [28]:
# Summary statistics for the numeric columns of items. Both columns are identifiers
# (item_id, item_category_id), so mainly count / min / max are of interest here.
items.describe()

Unnamed: 0,item_id,item_category_id
count,22170.0,22170.0
mean,11084.5,46.290753
std,6400.07207,15.941486
min,0.0,0.0
25%,5542.25,37.0
50%,11084.5,40.0
75%,16626.75,58.0
max,22169.0,83.0


### ----------------------------------------->>     item_categories    <<------------------------------------------------------###

In [30]:
# Column labels of item_categories together with the number of columns.
item_categories.columns, item_categories.shape[1]

(Index(['item_category_name', 'item_category_id'], dtype='object'), 2)

In [31]:
# Preview the first rows of the category lookup table (head() defaults to 5 rows).
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [32]:
# Column dtypes and non-null counts of item_categories; the recorded output shows
# 84 rows with no missing values.
item_categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
item_category_name    84 non-null object
item_category_id      84 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [33]:
# Summary statistics for the only numeric column, item_category_id (an identifier),
# so mainly count / min / max are of interest here.
item_categories.describe()

Unnamed: 0,item_category_id
count,84.0
mean,41.5
std,24.392622
min,0.0
25%,20.75
50%,41.5
75%,62.25
max,83.0


### ----------------------------------------->>     shops    <<------------------------------------------------------###

In [34]:
# Column labels of shops together with the number of columns.
shops.columns, shops.shape[1]

(Index(['shop_name', 'shop_id'], dtype='object'), 2)

In [35]:
# Preview the first rows of the shop lookup table (head() defaults to 5 rows).
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [36]:
# Column dtypes and non-null counts of shops; the recorded output shows 60 rows
# with no missing values.
shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
shop_name    60 non-null object
shop_id      60 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.0+ KB


In [37]:
# Summary statistics for the only numeric column, shop_id (an identifier), so
# mainly count / min / max are of interest here.
shops.describe()

Unnamed: 0,shop_id
count,60.0
mean,29.5
std,17.464249
min,0.0
25%,14.75
50%,29.5
75%,44.25
max,59.0


### Merging dataframes

In [40]:
# Attach item_name and item_category_id to every sales row by joining on item_id
# (merge's default inner join). The result is stored in `sales_items` so later cells
# can reuse it; the original bare expression computed the join and then discarded it.
sales_items = sales_train.merge(items, on="item_id")

# Show only a preview: sales_train alone has 2,935,849 rows in the recorded run, so
# rendering the whole merged frame is wasteful and bloats the notebook.
sales_items.head(10)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id
0,02.01.2013,0,59,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
1,23.01.2013,0,24,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
2,20.01.2013,0,27,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
3,02.01.2013,0,25,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
4,03.01.2013,0,25,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
5,20.01.2013,0,25,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
6,23.01.2013,0,25,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
7,26.01.2013,0,25,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
8,27.01.2013,0,6,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
9,10.01.2013,0,15,22154,999.00,1.0,ЯВЛЕНИЕ 2012 (BD),37
