# Capstone Two: Data Wrangling

## Data Collection
Data loading and data joining. 

In [18]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

##### File descriptions
sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.

test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.

sample_submission.csv - a sample submission file in the correct format.

items.csv - supplemental information about the items/products.

item_categories.csv  - supplemental information about the items categories.

shops.csv- supplemental information about the shops.

In [2]:
# get all the csv files from the data folder
data_path = './data/'
sales_train = pd.read_csv(data_path+"sales_train.csv")
test_dataset = pd.read_csv(data_path+"test.csv")
items = pd.read_csv(data_path+"items.csv")
item_categories = pd.read_csv(data_path+"item_categories.csv")
shops = pd.read_csv(data_path+"shops.csv")
#Let's make sure they all loaded
[loaded.columns for loaded in [sales_train, test_dataset, items, item_categories, shops]]

[Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
        'item_cnt_day'],
       dtype='object'),
 Index(['ID', 'shop_id', 'item_id'], dtype='object'),
 Index(['item_name', 'item_id', 'item_category_id'], dtype='object'),
 Index(['item_category_name', 'item_category_id'], dtype='object'),
 Index(['shop_name', 'shop_id'], dtype='object')]

In [19]:
# Let's look at the sales_train dataset and items dataset to make sure we can merge as is.
print(sales_train.shape)
print(sales_train.head())

(2935849, 6)
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0


In [17]:
# Are there any missing data? 
sales_train.isna().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [4]:
# Looks like we can join the data together using shop id and item id. 
# Let's left join on the training set, to only add the columns that are relavent to our training set. 
sales_train_item = pd.merge(sales_train, items, on="item_id")
print(sales_train_item.columns)
sales_train_item.head()

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day', 'item_name', 'item_category_id'],
      dtype='object')


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37
1,23.01.2013,0,24,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37
2,20.01.2013,0,27,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37
3,02.01.2013,0,25,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37
4,03.01.2013,0,25,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37


In [11]:
# Let's see what the item categories look like
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [12]:
# This could be very helpful later on - lets merge. 
sales_train_item_categories = pd.merge(sales_train_item, item_categories, on="item_category_id")
sales_train_item_categories.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,23.01.2013,0,24,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
2,20.01.2013,0,27,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
3,02.01.2013,0,25,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
4,03.01.2013,0,25,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray


In [13]:
# Let's see what the shops look like
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [None]:
# New lets do thew same with our test dataset.

## Data Organization

## Data Definition

## Data Cleaning

In [7]:
#Let's take a look at our data and see the format for the columns
print("Sales training data\n",sales_train.head())
print("Test dataset\n",test_dataset.head())

Sales training data
          date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0
Test dataset
    ID  shop_id  item_id
0   0        5     5037
1   1        5     5320
2   2        5     5233
3   3        5     5232
4   4        5     5268


In [8]:
# The next step to make sure we don't have large amounts of missing values
print(sales_train.isna().sum())
print(sales_train[sales_train.item_price == 0].count())
# Great, so we don't have missing values and we don't have 0 for the item price. 
# Do we have negative numbers? 
print((sales_train.item_price.values <= 0).any())
print((sales_train.item_cnt_day.values <= 0).any())

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64
date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64
True
True


In [9]:
# Let's explore those negative numbers a bit more. 
print(sales_train[sales_train['item_price'] <= 0])
#And see how many times this item was sold
negative_ids = sales_train[sales_train['item_price'] <= 0].item_id
print(sales_train[sales_train.item_id.isin(negative_ids)].describe())

              date  date_block_num  shop_id  item_id  item_price  item_cnt_day
484683  15.05.2013               4       32     2973        -1.0           1.0
       date_block_num     shop_id  item_id   item_price  item_cnt_day
count      780.000000  780.000000    780.0   780.000000    780.000000
mean         3.401282   31.160256   2973.0  2041.627277      1.383333
std          4.062319   16.051145      0.0   584.281629      1.171102
min          0.000000    0.000000   2973.0    -1.000000     -1.000000
25%          0.000000   19.000000   2973.0  1249.500000      1.000000
50%          1.000000   30.000000   2973.0  2499.000000      1.000000
75%          6.000000   45.000000   2973.0  2499.000000      1.000000
max         22.000000   59.000000   2973.0  2499.000000     13.000000
