# Atividade (1.0 pt): Pré-Processamento dos dados de COVID-19 no Piauí (equipe de 5 integrantes)

Apresentar um Jupyter notebook que gere 4 pickles (X_train.pickle, y_train.pickle, X_test.pickle e y_test.pickle) referentes aos dados pré-processados, considerando o dataset dos casos de COVID-19 no estado do Piauí. Considere que o alvo (y) do dataset é o atributo número de mortes (deaths).

## Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle


## Carregando os Dados

In [2]:
# File name of the CSV holding the COVID-19 case records.
DATASET_NAME = 'casos_atualizados.csv'
# Folder that holds the CSV: a `datasets` directory under the current working directory.
DATASET_PATH = os.path.join(os.getcwd(), 'datasets')


In [3]:
def load_data(dataset_path=DATASET_PATH, dataset_name=DATASET_NAME):
    """Read a CSV file from a dataset directory into a DataFrame.

    Parameters
    ----------
    dataset_path : str
        Directory that contains the CSV file. Defaults to the value
        ``DATASET_PATH`` had when this function was defined.
    dataset_name : str
        File name of the CSV inside ``dataset_path``. Defaults to the value
        ``DATASET_NAME`` had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(os.path.join(dataset_path, dataset_name))


In [4]:
# Load the raw case table from the configured dataset folder.
dataset = load_data(dataset_path=DATASET_PATH, dataset_name=DATASET_NAME)


## Tratando os dados faltantes

In [5]:
# Summarize each column's dtype and non-null count to spot missing values.
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   city                            224 non-null    object 
 1   city_ibge_code                  225 non-null    int64  
 2   confirmed                       225 non-null    int64  
 3   confirmed_per_100k_inhabitants  225 non-null    float64
 4   date                            225 non-null    object 
 5   death_rate                      225 non-null    float64
 6   deaths                          225 non-null    int64  
 7   estimated_population            225 non-null    int64  
 8   estimated_population_2019       225 non-null    int64  
 9   is_last                         225 non-null    bool   
 10  order_for_place                 225 non-null    int64  
 11  place_type                      225 non-null    object 
 12  state                           225 

In [6]:
# Filter out missing data: keep only the rows whose `city` is present
# (`city` is the only column reported with a null entry by dataset.info()).
dataset_t = dataset[dataset["city"].notna()]
dataset_t.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 224 entries, 1 to 224
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   city                            224 non-null    object 
 1   city_ibge_code                  224 non-null    int64  
 2   confirmed                       224 non-null    int64  
 3   confirmed_per_100k_inhabitants  224 non-null    float64
 4   date                            224 non-null    object 
 5   death_rate                      224 non-null    float64
 6   deaths                          224 non-null    int64  
 7   estimated_population            224 non-null    int64  
 8   estimated_population_2019       224 non-null    int64  
 9   is_last                         224 non-null    bool   
 10  order_for_place                 224 non-null    int64  
 11  place_type                      224 non-null    object 
 12  state                           224 

## Tratando os dados categóricos

In [7]:
# The categorical/text, boolean and date columns are dropped rather than encoded,
# so only numeric columns go forward.
dataset_c = dataset_t.drop(
    columns=['state', 'place_type', 'is_last', 'date', 'city']
)
dataset_c.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 224 entries, 1 to 224
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   city_ibge_code                  224 non-null    int64  
 1   confirmed                       224 non-null    int64  
 2   confirmed_per_100k_inhabitants  224 non-null    float64
 3   death_rate                      224 non-null    float64
 4   deaths                          224 non-null    int64  
 5   estimated_population            224 non-null    int64  
 6   estimated_population_2019       224 non-null    int64  
 7   order_for_place                 224 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 15.8 KB


## Dividindo os dados em treino e teste

### Separando o alvo

In [8]:
# Take the target from `dataset_t` (which still holds every column and the same rows)
# so this cell can be re-run: reading it from `dataset_c` raises a KeyError once
# 'deaths' has already been dropped by the line below.
dataset_target = dataset_t['deaths'].copy()
# errors='ignore' turns the drop into a no-op on a re-run instead of raising.
dataset_c = dataset_c.drop(columns=['deaths'], errors='ignore')
# NOTE(review): `death_rate` is presumably derived from `deaths` — keeping it as a
# feature may leak the target; confirm before modelling.
dataset_c


Unnamed: 0,city_ibge_code,confirmed,confirmed_per_100k_inhabitants,death_rate,estimated_population,estimated_population_2019,order_for_place
1,2200053,228,3210.36328,0.0088,7102,7084,633
2,2200103,829,16156.69460,0.0121,5131,5139,625
3,2200202,1699,9725.24327,0.0453,17470,17411,674
4,2200251,402,5244.61840,0.0323,7665,7651,639
5,2200277,553,11244.40830,0.0163,4918,4915,621
...,...,...,...,...,...,...,...
220,2211357,204,4131.22722,0.0588,4938,4947,641
221,2211407,561,12790.69767,0.0018,4386,4391,672
222,2211506,399,12954.54545,0.0075,3080,3077,656
223,2211605,349,11822.49322,0.0086,2952,2971,664


In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset_c,
    dataset_target,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)


In [10]:
# Check the (rows, columns) size of the training features.
X_train.shape


(179, 7)

In [11]:
# Check the (rows, columns) size of the test features.
X_test.shape


(45, 7)

In [12]:
# Sanity check: 80% of the 224 rows kept after removing the missing city, to compare
# with the training-set row count. The row count is hard-coded and must be updated
# by hand if the dataset changes.
224 * 0.8


179.20000000000002

## Feature Scaling

In [13]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both splits so the test set does not influence the fit.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


## Salvando os conjuntos em pickle

In [14]:
# Write each split to its own pickle file. The `with` block closes every file
# handle as soon as the dump finishes, so the data is flushed to disk and no
# descriptor is left open (the bare `open(...)` calls never closed theirs).
with open('X_train.pickle', 'wb') as pickle_file:
    pickle.dump(X_train, pickle_file)
with open('X_test.pickle', 'wb') as pickle_file:
    pickle.dump(X_test, pickle_file)
with open('y_train.pickle', 'wb') as pickle_file:
    pickle.dump(Y_train, pickle_file)
with open('y_test.pickle', 'wb') as pickle_file:
    pickle.dump(Y_test, pickle_file)
