# 0.0 imports

In [74]:
import pandas as pd
import inflection
import numpy as np

# 1.0 load dataset

In [42]:
# Location of the prepared dataset, relative to the notebook's working directory.
RAW_CSV_PATH = '../data/df_ready.csv'

# Raw frame as read from disk; later sections work on copies of it.
df_raw = pd.read_csv(RAW_CSV_PATH)

# 2.0 data description

In [57]:
# Work on an independent (deep) copy so df_raw stays untouched by this section.
df1 = df_raw.copy(deep=True)

In [58]:
# Columns kept for the analysis, spelled exactly as they appear in the raw data.
cols_selected = [
    'Date_imp_d', 'Category_name', 'name', 'price', 'merchant', 'brand',
    'manufacturer', 'Day_n', 'month', 'month_n', 'day', 'Week_Number',
]

# Select from df_raw rather than from df1 itself: once section 2.1 has renamed
# df1's columns, `df1[cols_selected]` raises KeyError on a re-run. Taking the
# columns from the untouched raw frame keeps this cell idempotent, and the
# explicit .copy() makes df1 an independent frame for the assignments below.
df1 = df_raw[cols_selected].copy()

## 2.1 rename columns

In [59]:
# Take the current column names from the frame itself instead of repeating the
# selection list by hand: a second hard-coded list assigned positionally can
# drift out of sync with the real columns and silently mislabel them.
cols_old = df1.columns.tolist()

# inflection.underscore converts CamelCase / mixed names to snake_case
# (e.g. 'Week_Number' -> 'week_number'); already snake_case names are unchanged,
# so re-running this cell is harmless.
snakecase = inflection.underscore

# rename maps each column by its label, not by its position
df1 = df1.rename(columns=snakecase)
cols_new = df1.columns.tolist()

In [60]:
# Show a single row to confirm the renamed columns.
df1.head(n=1)

Unnamed: 0,date_imp_d,category_name,name,price,merchant,brand,manufacturer,day_n,month,month_n,day,week_number
0,2017/12/14,"speaker, portable, bluetooth",Boytone - 2500W 2.1-Ch. Home Theater System - ...,69.0,Walmart.com,Boytone,Boytone,Thursday,12,December,14,50


## 2.2 data dimension

In [61]:
# shape is (rows, columns)
n_rows, n_cols = df1.shape
print(f'number of rows: {n_rows}')
print(f'number of columns: {n_cols}')

number of rows: 23151
number of columns: 12


## 2.3 data types

In [65]:
# Per-column dtypes of the working frame.
# NOTE(review): the saved output below was produced at execution count 65, i.e.
# after the datetime conversion in section 2.5 (execution count 64) had already
# run. On a fresh top-to-bottom run this cell executes before that conversion,
# so date_imp_d will be reported with its pre-conversion dtype here.
df1.dtypes

date_imp_d       datetime64[ns]
category_name            object
name                     object
price                   float64
merchant                 object
brand                    object
manufacturer             object
day_n                    object
month                     int64
month_n                  object
day                       int64
week_number               int64
dtype: object

## 2.4 check NA

In [63]:
# Count missing values per column (sum of the boolean NA mask down the rows).
df1.isna().sum(axis=0)

date_imp_d           0
category_name        0
name                 0
price                0
merchant             0
brand                0
manufacturer     10639
day_n                0
month                0
month_n              0
day                  0
week_number          0
dtype: int64

## 2.5 change types

In [64]:
# Parse the date column into pandas datetimes; every other column is kept as is.
# The parse relies on pandas' default format inference (no explicit format given).
parsed_dates = pd.to_datetime(df1['date_imp_d'])
df1 = df1.assign(date_imp_d=parsed_dates)

## 2.6 descriptive statistics

In [70]:
# dtypes treated as numeric for the descriptive statistics
numeric_dtypes = ['float64', 'int64']

# Numeric columns only.
num_attributes = df1.select_dtypes(include=numeric_dtypes)

# Everything that is neither numeric nor a datetime; the datetime column
# therefore ends up in neither group.
cat_attributes = df1.select_dtypes(exclude=numeric_dtypes + ['datetime64[ns]'])

### 2.6.1 numerical attributes

In [80]:
# Summary table for the numeric columns: one row per attribute, one column per
# statistic.
#
# Every statistic uses pandas' own column-wise reduction rather than
# `apply` with Python built-ins / numpy functions:
#   * the built-in max/min walk a column element by element and give
#     order-dependent answers if the column contains NaN;
#   * np.median does not skip NaN, while the pandas reductions all do, so the
#     statistics now treat missing values consistently;
#   * a single DataFrame is built directly instead of concatenating and
#     transposing eight one-row frames.
col_min = num_attributes.min()
col_max = num_attributes.max()

m = pd.DataFrame({
    'min': col_min,
    'max': col_max,
    'range': col_max - col_min,
    # central tendency
    'mean': num_attributes.mean(),
    'median': num_attributes.median(),
    # dispersion / shape
    # ddof=0 (population standard deviation) is pinned explicitly: that is what
    # np.std computed before, whereas pandas' own default is ddof=1.
    'std': num_attributes.std(ddof=0),
    'skew': num_attributes.skew(),
    'kurtosis': num_attributes.kurtosis(),
}).rename_axis('att').reset_index()  # attribute names become the 'att' column
m

Unnamed: 0,att,min,max,range,mean,median,std,skew,kurtosis
0,price,1.0,10879.95,10878.95,513.037803,199.99,859.091448,4.59516,34.411146
1,month,1.0,12.0,11.0,7.65103,8.0,2.592787,-0.099417,-0.628859
2,day,1.0,31.0,30.0,15.693879,16.0,9.681413,0.032258,-1.331838
3,week_number,1.0,51.0,50.0,31.34275,32.0,11.316253,-0.26698,-0.648027


### 2.6.2 categorical attributes

In [83]:
# Number of distinct values per categorical column.
# dropna=False keeps NaN counted as a value of its own, exactly as
# `x.unique().shape[0]` did (manufacturer contains missing values, so its count
# includes the NaN bucket).
cat_attributes.nunique(dropna=False)

category_name     58
name             908
merchant           7
brand            266
manufacturer     236
day_n              7
month_n           11
dtype: int64