# Automatizando o Pré-processamento com SKLearn

## Etapa de análise exploratória
Não repetiremos aqui a análise exploratória feita anteriormente. Faremos apenas a carga dos dados e o pré-processamento.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('C:/Users/Alberto Pinalli/Documents/GitHub/SGBD\Bike-Sharing-Dataset/day.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


## Etapa de pré-processamento

### Separando os conjuntos de treino e teste

In [4]:
# !pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1320 sha256=53d0d912db158d197e81b62ed8826250df12398fb7174f59f1f58fb2b540cfd4
  Stored in directory: c:\users\alberto pinalli\appdata\local\pip\cache\wheels\22\0b\40\fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [25]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [27]:
from sklearn.model_selection import train_test_split

# separando os conjuntos de dados de treino e teste
df_treino, df_teste = train_test_split(df, test_size = 0.2, random_state = 42)

# separando a coluna alvo do conjunto de treino
df_treino_labels = df_treino['cnt'].copy()
df_treino        = df_treino.drop(columns='cnt')

# separando a coluna alvo do conjunto de teste
df_teste_labels = df_teste['cnt'].copy()
df_teste        = df_teste.drop(columns='cnt')

In [29]:
df_treino.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
682,683,2012-11-13,4,1,11,0,2,1,2,0.343333,0.323225,0.662917,0.342046,327,3767
250,251,2011-09-08,3,0,9,0,4,1,3,0.633913,0.555361,0.939565,0.192748,153,1689
336,337,2011-12-03,4,0,12,0,6,0,1,0.299167,0.310604,0.612917,0.095783,706,2908
260,261,2011-09-18,3,0,9,0,0,0,1,0.5075,0.490537,0.695,0.178483,1353,2921
543,544,2012-06-27,3,1,6,0,3,1,1,0.6975,0.640792,0.36,0.271775,1077,6258


### Automação do processo de pré-processamento

In [6]:
from sklearn.impute import SimpleImputer  # PREENCHER DADOS FALTANTES
from sklearn.preprocessing import StandardScaler # NORMALIZAR
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
nomes_atributos_numericos   = ['temp', 'hum', 'windspeed']#->SimpleImputer  -> StandardScaler
nomes_atributos_categoricos = ['season','mnth','weekday','weathersit']# -> OneHotEncoder(binarização)
nomes_atributos_binarios    = ['holiday','workingday']
# concatenar MATRIX X (ColumnTransformer)

In [39]:
pipeline_atr_numericos = Pipeline([
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preproc_completo = ColumnTransformer([
    ('numericos',   pipeline_atr_numericos,                  nomes_atributos_numericos),
    ('binarios',    'passthrough',                           nomes_atributos_binarios),
    ('categoricos', OneHotEncoder(handle_unknown='ignore'),  nomes_atributos_categoricos),
    ], 
    sparse_threshold = 0)

In [36]:
# pre-processamento do conjunto de treino
X_treino = preproc_completo.fit_transform(df_treino)

# chamar fit_transform é o mesmo que chamar 
# primeiro fit para aprender os parâmetros
# e depois chamar transform para realizar 
# as transformações com base nos parâmetros
# previamente aprendidos, ou seja, chamar:

# preproc_completo.fit(df_treino)
# X_treino = preproc_completo.transform(df_treino)

In [37]:
X_treino[0:2]

<2x31 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [19]:
# pre-processamento do conjunto de teste
X_teste = preproc_completo.transform(df_teste)

In [38]:
X_teste[0:2]

array([[-0.13416911,  0.72208642, -0.21401277,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-1.72667406, -1.36120952,  1.12576527,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ]])

In [21]:
y_treino = df_treino_labels.values.reshape(-1,1)
y_teste  = df_teste_labels.values.reshape(-1,1)

In [22]:
y_treino

array([[4094],
       [1842],
       [3614],
       [4274],
       [7335],
       [4123],
       [4576],
       [1510],
       [3520],
       [4151],
       [6779],
       [2424],
       [4677],
       [6233],
       [4105],
       [4205],
       [5323],
       [1746],
       [3204],
       [6192],
       [1969],
       [3613],
       [4390],
       [7006],
       [7148],
       [1985],
       [3831],
       [4569],
       [5585],
       [4916],
       [5823],
       [4189],
       [1865],
       [1107],
       [7055],
       [7534],
       [5499],
       [7570],
       [8555],
       [7498],
       [2236],
       [5634],
       [1650],
       [7665],
       [3005],
       [4195],
       [5026],
       [2210],
       [1685],
       [3606],
       [1834],
       [1096],
       [5191],
       [4845],
       [6169],
       [5532],
       [5119],
       [4073],
       [2710],
       [8120],
       [2423],
       [7444],
       [6660],
       [7534],
       [3846],
       [5558],
       [34

In [24]:
y_teste

array([[6606],
       [1550],
       [3747],
       [6041],
       [7538],
       [7264],
       [1605],
       [2209],
       [7499],
       [5743],
       [1796],
       [3068],
       [4891],
       [5260],
       [2133],
       [2471],
       [2046],
       [8156],
       [5362],
       [2298],
       [7697],
       [5463],
       [5409],
       [1872],
       [1807],
       [5130],
       [2121],
       [7436],
       [3830],
       [5557],
       [2743],
       [3644],
       [6196],
       [7494],
       [5918],
       [3372],
       [7582],
       [6053],
       [2566],
       [1263],
       [3944],
       [3956],
       [7580],
       [4906],
       [6966],
       [ 705],
       [4458],
       [5298],
       [6043],
       [4996],
       [3351],
       [2431],
       [1011],
       [4475],
       [4725],
       [4727],
       [2395],
       [3351],
       [4788],
       [7175],
       [6153],
       [7442],
       [1471],
       [7865],
       [6530],
       [6211],
       [74