# 0.0 Imports

In [28]:
import pandas as pd
import inflection
import numpy as np

## 0.1 Loading Data

In [12]:
# Load the prepared dataset; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('../data/df_ready.csv')

In [13]:
# Columns that are not carried forward into the analysis frame
cols_to_drop = [ 'Unnamed: 0', 'Date_imp', 'Cluster', 'condition', 'sourceURLs', 'Date_imp_d.1',
                 'Zscore_1', 'price_std', 'imageURLs', 'shipping', 'weight', 'currency' ]

# NOTE: this rebinds df_raw, so re-running the cell without reloading the CSV
# raises KeyError (the labels are already gone).
df_raw = df_raw.drop( columns=cols_to_drop )

In [14]:
# Show the first row to confirm which columns remain after the drop.
df_raw.head(1)

Unnamed: 0,Date_imp_d,Category_name,name,price,disc_price,merchant,Disc_percentage,isSale,Imp_count,brand,p_description,dateAdded,dateSeen,dateUpdated,manufacturer,Day_n,month,month_n,day,Week_Number
0,2017/12/14,"speaker, portable, bluetooth",Boytone - 2500W 2.1-Ch. Home Theater System - ...,69.0,64.99,Walmart.com,0.06,Yes,1,Boytone,"Stereos,Portable Bluetooth Speakers,TV, Video ...",2015-05-18T14:14:56Z,2017-12-14T06:00:00Z,2018-06-13T19:39:02Z,Boytone,Thursday,12,December,14,50


# 1.0 Descrição dos Dados

In [15]:
# Work on a copy so the loaded frame (df_raw) is not modified by this section.
df1 = df_raw.copy()

## 1.1 Rename Columns

In [16]:
# Column labels before renaming (mixed casing, e.g. 'Date_imp_d', 'isSale').
df1.columns

Index(['Date_imp_d', 'Category_name', 'name', 'price', 'disc_price',
       'merchant', 'Disc_percentage', 'isSale', 'Imp_count', 'brand',
       'p_description', 'dateAdded', 'dateSeen', 'dateUpdated', 'manufacturer',
       'Day_n', 'month', 'month_n', 'day', 'Week_Number'],
      dtype='object')

In [17]:
# Take the current labels from the frame itself instead of a hand-typed copy.
# A hard-coded list is assigned by position, so if the upstream column selection
# ever changes it would either fail on a length mismatch or silently mislabel columns.
cols_old = list( df1.columns )

# inflection.underscore turns labels such as 'isSale' / 'Date_imp_d' into snake_case
# ('is_sale' / 'date_imp_d'); labels that are already snake_case are left unchanged,
# so re-running this cell is harmless.
snakecase = inflection.underscore

cols_new = [ snakecase( col ) for col in cols_old ]

# Rename
df1.columns = cols_new

In [18]:
# Column labels after the snake_case rename.
df1.columns

Index(['date_imp_d', 'category_name', 'name', 'price', 'disc_price',
       'merchant', 'disc_percentage', 'is_sale', 'imp_count', 'brand',
       'p_description', 'date_added', 'date_seen', 'date_updated',
       'manufacturer', 'day_n', 'month', 'month_n', 'day', 'week_number'],
      dtype='object')

## 1.2. Data Dimensions

In [19]:
# Report the frame's dimensions: shape is (rows, columns)
n_rows, n_cols = df1.shape
print(f'Numero de Linhas: {n_rows}')
print(f'Numero de Colunas: {n_cols}')

Numero de Linhas: 23151
Numero de Colunas: 20


## 1.3. Data Types

In [20]:
# Inspect the dtype pandas assigned to each column when reading the CSV.
df1.dtypes

date_imp_d          object
category_name       object
name                object
price              float64
disc_price         float64
merchant            object
disc_percentage    float64
is_sale             object
imp_count            int64
brand               object
p_description       object
date_added          object
date_seen           object
date_updated        object
manufacturer        object
day_n               object
month                int64
month_n             object
day                  int64
week_number          int64
dtype: object

## 1.4. Check NA

In [21]:
# Count missing values per column.
df1.isna().sum()

date_imp_d             0
category_name          0
name                   0
price                  0
disc_price             0
merchant               0
disc_percentage        0
is_sale                0
imp_count              0
brand                  0
p_description          0
date_added             0
date_seen              0
date_updated           0
manufacturer       10639
day_n                  0
month                  0
month_n                0
day                    0
week_number            0
dtype: int64

## 1.5. Change Types

In [23]:
# date_imp_d is read from the CSV as object (strings); parse it into datetime values.
df1['date_imp_d'] = df1['date_imp_d'].pipe( pd.to_datetime )

In [24]:
# Re-check dtypes to confirm date_imp_d is now a datetime column.
df1.dtypes

date_imp_d         datetime64[ns]
category_name              object
name                       object
price                     float64
disc_price                float64
merchant                   object
disc_percentage           float64
is_sale                    object
imp_count                   int64
brand                      object
p_description              object
date_added                 object
date_seen                  object
date_updated               object
manufacturer               object
day_n                      object
month                       int64
month_n                    object
day                         int64
week_number                 int64
dtype: object

## 1.6. Descriptive Statistics

In [26]:
# Dtypes treated as numeric when splitting the frame
numeric_dtypes = ['int64', 'float64']

# Numeric columns on one side; everything that is neither numeric nor datetime on the other
num_attributes = df1.select_dtypes( include=numeric_dtypes )
cat_attributes = df1.select_dtypes( exclude=numeric_dtypes + ['datetime64[ns]'] )

### 1.6.1. Numerical Attributes

In [29]:
# Summary table of the numeric attributes, one row per column.
# All statistics use pandas' own column-wise reductions so that they are vectorised
# and treat missing values the same way (skipped) across every measure.

# Central tendency - mean, median
ct1 = pd.DataFrame( num_attributes.mean() ).T                              # mean
ct2 = pd.DataFrame( num_attributes.median() ).T                            # median

# Dispersion - standard deviation, min, max, range, skew, kurtosis
# ddof=0 keeps the population standard deviation this cell reported when it used np.std;
# pandas' default (ddof=1) would give the sample estimate instead.
d1 = pd.DataFrame( num_attributes.std( ddof=0 ) ).T                        # standard deviation
d2 = pd.DataFrame( num_attributes.min() ).T                                # minimum
d3 = pd.DataFrame( num_attributes.max() ).T                                # maximum
d4 = pd.DataFrame( num_attributes.max() - num_attributes.min() ).T         # range
d5 = pd.DataFrame( num_attributes.skew() ).T                               # skew
d6 = pd.DataFrame( num_attributes.kurtosis() ).T                           # kurtosis

# Stack the one-row frames, then transpose so each attribute becomes a row
m1 = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).T.reset_index()
m1.columns = ['Atributos', 'min', 'max', 'intervalo', 'media', 'mediana', 'std', 'skew', 'kurtosis']
m1

Unnamed: 0,Atributos,min,max,intervalo,media,mediana,std,skew,kurtosis
0,price,1.0,10879.95,10878.95,513.037803,199.99,859.091448,4.59516,34.411146
1,disc_price,1.0,10879.95,10878.95,494.104295,199.0,808.571509,4.748349,39.612839
2,disc_percentage,0.0,0.8,0.8,0.016933,0.0,0.077847,5.217632,28.75492
3,imp_count,1.0,31.0,30.0,6.560926,5.0,5.403049,1.286351,1.639569
4,month,1.0,12.0,11.0,7.65103,8.0,2.592787,-0.099417,-0.628859
5,day,1.0,31.0,30.0,15.693879,16.0,9.681413,0.032258,-1.331838
6,week_number,1.0,51.0,50.0,31.34275,32.0,11.316253,-0.26698,-0.648027
