## Ligando o Google Drive

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')
# Base folder for this notebook's files; later cells append file names to this path.
caminho = "/content/drive/MyDrive/CDSI/MaterialApoio/"

Mounted at /content/drive


## Bibliotecas

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pickle

In [4]:
# Load the census dataset from the Drive folder configured in `caminho`.
# NOTE(review): the values displayed further down keep a leading space
# (e.g. ' State-gov', ' <=50K'); pd.read_csv(..., skipinitialspace=True) would
# strip it, but that changes the category/label strings seen downstream.
base_censo = pd.read_csv(caminho + "census.csv")

In [None]:
# Attribute types, as classified by the author:
# age - discrete
# workclass - nominal (not ordinal)
# final (final-weight) - continuous
# education - ordinal (by interpretation)
# years - discrete (number of years; presumably the education-num column)
# marital status, position in the family (relationship) - nominal
# occupation, race, sex - nominal
# hours worked - discrete
# country - nominal
# class - ordinal
# Summary statistics for the DataFrame.
base_censo.describe()

In [5]:
# Count missing (null) values per column.
base_censo.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
final-weight,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0


## Divisão entre Previsores e classe

In [6]:
# List the column names; the cells below select predictors and class by position.
base_censo.columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')

In [17]:
# Predictors: the first 14 columns (positions 0-13), taken as a NumPy array.
X_census = base_censo.iloc[:, 0:14].values

In [18]:
# Inspect the predictor matrix.
X_census

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

In [12]:
# Class (target): the column at position 14, taken as a NumPy array.
y_census = base_censo.iloc[:, 14].values

In [13]:
# Inspect the class vector.
y_census

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

## Tratamento de atributos categóricos

## LabelEncoder
Transforma em números inteiros as categorias de cada atributo categórico não binário (mais de 2 categorias)

In [19]:
# Show column 1 of the predictor matrix (all rows).
X_census[:,1]

array([' State-gov', ' Self-emp-not-inc', ' Private', ..., ' Private',
       ' Private', ' Self-emp-inc'], dtype=object)

In [20]:
# Trial run: fit a throwaway LabelEncoder on column 1 to see the integer codes it
# produces. The result goes to `teste`; X_census itself is not modified here.
label_encoder_teste = LabelEncoder()
teste = label_encoder_teste.fit_transform(X_census[:,1])
teste

array([7, 6, 4, ..., 4, 4, 5])

In [24]:
# Inspect the first row of the predictor matrix.
# NOTE(review): the saved output already shows an integer in column 1 even though
# the cell that writes encoded values into X_census comes later, and the execution
# counts in this notebook are non-sequential -- it looks like cells were run out of
# order. Confirm the notebook still works with Restart & Run All.
X_census[0]

array([39, 7, 77516, ' Bachelors', 13, ' Never-married', ' Adm-clerical',
       ' Not-in-family', ' White', ' Male', 2174, 0, 40, ' United-States'],
      dtype=object)

In [25]:
# One independent LabelEncoder per categorical predictor, so each column keeps its
# own fitted category-to-integer mapping.
(
    label_encoder_workclass,
    label_encoder_education,
    label_encoder_marital,
    label_encoder_occupation,
    label_encoder_relationship,
    label_encoder_race,
    label_encoder_sex,
    label_encoder_country,
) = (LabelEncoder() for _ in range(8))

In [26]:
# Replace each categorical column of X_census, in place, with the integer codes
# produced by its dedicated encoder. Pairs are (column position, encoder).
# Re-running this cell fits the encoders on the already-encoded integers.
for indice_coluna, codificador in (
    (1, label_encoder_workclass),
    (3, label_encoder_education),
    (5, label_encoder_marital),
    (6, label_encoder_occupation),
    (7, label_encoder_relationship),
    (8, label_encoder_race),
    (9, label_encoder_sex),
    (13, label_encoder_country),
):
    X_census[:, indice_coluna] = codificador.fit_transform(X_census[:, indice_coluna])

In [27]:
# First row again, to check the result of the label-encoding cell above.
X_census[0]

array([39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 39], dtype=object)

In [28]:
# Inspect the whole predictor matrix after label encoding.
X_census

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

## OneHotEncoder
Transforma em números binários (0 e 1) as categorias de cada atributo categórico não binário (mais de 2 categorias). Cada categoria de um atributo se transforma em uma nova coluna (atributo). Exemplo: atributo 'workclass' com 9 categorias se transforma em 9 novos atributos. Cada linha terá valor 1 na coluna referente à categoria original e 0 nas outras colunas.

In [29]:
# Distinct values of the 'workclass' column in the original DataFrame.
np.unique(base_censo['workclass'])

array([' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
       ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
      dtype=object)

In [30]:
# Number of distinct 'workclass' values (= one-hot columns this attribute yields).
len(np.unique(base_censo['workclass']))

9

In [31]:
# Number of distinct 'occupation' values.
len(np.unique(base_censo['occupation']))

15

In [35]:
# One-hot encode the categorical columns (positions 1, 3, 5, 6, 7, 8, 9 and 13).
# remainder='passthrough' keeps the remaining columns (they are NOT dropped) and
# places them after the generated one-hot columns.
onehotencoder = ColumnTransformer(
    transformers=[
        ('OneHot', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13]),
    ],
    remainder='passthrough',
)

In [36]:
# Apply the one-hot transformer and convert the result to a dense NumPy array.
# X_census is overwritten, so re-running this cell would encode the already
# encoded matrix.
# NOTE(review): .toarray() assumes fit_transform returned a sparse matrix;
# ColumnTransformer decides sparse vs. dense from the output density
# (sparse_threshold), so this is data dependent -- confirm if the data changes.
X_census = onehotencoder.fit_transform(X_census).toarray()

In [37]:
# Inspect the predictor matrix after one-hot encoding.
X_census

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1740e+03, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.3000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5024e+04, 0.0000e+00,
        4.0000e+01]])

In [38]:
# First row after one-hot encoding.
X_census[0]

array([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e

In [39]:
# Dimensions (rows, columns) of the one-hot encoded matrix.
X_census.shape

(32561, 108)

## Escalonamento dos valores

De: Valores com escalas diferentes (ex: age 17-90, education-num 1-16, hour-per-week 1-99)
Para: Valores com médias próximas de 0 e desvios padrão próximos de 1

In [40]:
# Standardize every column of X_census (zero mean, unit variance), overwriting it.
# NOTE(review): the scaler is fitted on the full matrix, before the train/test
# split performed further down, so test-set statistics influence the scaling of
# the training data (data leakage). Fitting on the training portion only would
# avoid that, but requires splitting first.
scaler_census = StandardScaler().fit(X_census)
X_census = scaler_census.transform(X_census)

In [41]:
# First row after scaling.
X_census[0]

array([-0.2444502 , -0.17429511, -0.26209736, -0.01466381, -1.5167923 ,
       -0.18838933, -0.29093568,  4.90769968, -0.02073999, -0.17175325,
       -0.19348662, -0.11609195, -0.07201601, -0.10164955, -0.1422718 ,
       -0.12664495, -0.18406376, -0.21053433,  2.25399324, -0.11334387,
       -0.68994199, -0.23637391, -0.03960742, -0.13419553, -0.53714425,
       -0.39750806, -0.02658695, -0.92284068, -0.11403678,  1.43105786,
       -0.1802846 , -0.17735813, -0.24494366,  2.76348874, -0.01662771,
       -0.37949517, -0.37774555, -0.17745022, -0.20957797, -0.25595432,
       -0.33554133, -0.06780164, -0.38166338, -0.14260848, -0.35531609,
       -0.17127887, -0.22710355, -0.82533335,  1.70899099, -0.17624972,
       -0.42934582, -0.34403232, -0.22492681, -0.09820087, -0.18155194,
       -0.32576824, -0.09161163,  0.4130197 , -0.70307135,  0.70307135,
       -0.13502327, -0.02416321, -0.06107342, -0.0480488 , -0.04260602,
       -0.05409379, -0.04641598, -0.02933708, -0.05714946, -0.05

## Divisão da base em treinamento e teste

In [42]:
# Hold out 15% of the rows for testing; random_state=0 makes the split repeatable.
(
    X_census_treinamento,
    X_census_teste,
    y_census_treinamento,
    y_census_teste,
) = train_test_split(X_census, y_census, test_size=0.15, random_state=0)

In [43]:
# Shapes of the training predictors and training class vector (row counts must match).
X_census_treinamento.shape, y_census_treinamento.shape

((27676, 108), (27676,))

## Salvar as variáveis

In [44]:
# Persist the split to census.pkl in the Drive folder as a single list, in the
# order [X train, y train, X test, y test]; loaders must unpack in that order.
with open(caminho + 'census.pkl', 'wb') as f:
    pickle.dump(
        [X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste],
        f,
    )