<a href="https://colab.research.google.com/github/Rogerio-mack/IMT_CD_2024/blob/main/IMT_Lab_hot_encode_scale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<head>
  <meta name="author" content="Rogério de Oliveira">
  <meta name="institution" content="IMT">
</head>

<img src="https://maua.br/images/selo-60-anos-maua.svg" width="300" align="right">
<!-- <h1 align=left><font size = 6, style="color:rgb(200,0,0)"> optional title </font></h1> -->


# Estimadores de Hot Label Encode e Scale

## imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import statsmodels.formula.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the penguins example dataset through seaborn and preview the first rows.
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


# Hot encode dos atributos não numéricos

## Com o `Pandas`

Mas prefira fazer com o `scikit-learn`

In [3]:
# One-hot encode the three categorical columns with pandas.
# drop_first=True omits the first level of each encoded column.
df_encoded = pd.get_dummies(
    data=df,
    columns=['island', 'sex', 'species'],
    drop_first=True,
)
df_encoded.head(n=5)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_Male,species_Chinstrap,species_Gentoo
0,39.1,18.7,181.0,3750.0,False,True,True,False,False
1,39.5,17.4,186.0,3800.0,False,True,False,False,False
2,40.3,18.0,195.0,3250.0,False,True,False,False,False
3,,,,,False,True,False,False,False
4,36.7,19.3,193.0,3450.0,False,True,False,False,False


## Com o `scikit-learn`

In [5]:
# Start again from the raw penguins data for the scikit-learn approach.
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [9]:
from sklearn.preprocessing import OneHotEncoder

# Columns to be one-hot encoded.
categorical_cols = ['island', 'sex', 'species']

# Build the encoder: dense (non-sparse) output, first level of each column
# dropped, and categories unseen during fit ignored rather than raising.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

# Fit and transform in one step; this is the same as calling
# encoder.fit(...) followed by encoder.transform(...) on the same columns.
encoded_data = encoder.fit_transform(df[categorical_cols])

encoded_data

array([[0., 1., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1.]])

In [11]:
# Wrap the encoded array in a DataFrame whose columns are named after the
# encoder's output features. Reusing df.index keeps every encoded row attached
# to the row it was computed from: pd.concat(axis=1) aligns on index labels,
# so a default RangeIndex here would silently misalign rows (and add NaN rows)
# whenever df's index is not 0..n-1, e.g. after filtering or dropping rows.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

# Join the original DataFrame (without the categorical columns) with the encoded columns.
df_encoded = pd.concat([df.drop(['island', 'sex', 'species'], axis=1), encoded_df], axis=1)

df_encoded.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_Male,sex_nan,species_Chinstrap,species_Gentoo
0,39.1,18.7,181.0,3750.0,0.0,1.0,1.0,0.0,0.0,0.0
1,39.5,17.4,186.0,3800.0,0.0,1.0,0.0,0.0,0.0,0.0
2,40.3,18.0,195.0,3250.0,0.0,1.0,0.0,0.0,0.0,0.0
3,,,,,0.0,1.0,0.0,1.0,0.0,0.0
4,36.7,19.3,193.0,3450.0,0.0,1.0,0.0,0.0,0.0,0.0


## Outros comandos úteis

In [13]:
# List the names of the columns stored with the `object` dtype.
df.select_dtypes(include=['object']).columns

Index(['species', 'island', 'sex'], dtype='object')

In [12]:
# Names of the input columns the encoder was fitted on.
encoder.feature_names_in_

array(['island', 'sex', 'species'], dtype=object)

# Persistindo o Estimador

In [17]:
import joblib

encoder_file = 'encoder.pkl'

# Persist the fitted encoder to disk.
joblib.dump(value=encoder, filename=encoder_file)

# Read it back and list the output feature names of the restored encoder.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
encoder = joblib.load(filename=encoder_file)
encoder.get_feature_names_out()

array(['island_Dream', 'island_Torgersen', 'sex_Male', 'sex_nan',
       'species_Chinstrap', 'species_Gentoo'], dtype=object)

# Aplicando um modelo ML

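Ooops! Valores ausentes: o `LinearRegression` não aceita `NaN`, então as linhas com valores ausentes são removidas antes do ajuste.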

In [22]:
# Discard every row that still contains a missing value. The one-hot columns
# never hold NaN (a missing `sex` became the `sex_nan` indicator), so this
# presumably only removes rows with missing numeric measurements — verify.
df_encoded = df_encoded.dropna(axis=0, how='any')

In [23]:
# Split the table into the target column and the remaining predictor columns.
target_col = 'body_mass_g'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

# Fit a linear regression on the whole table.
model = LinearRegression().fit(X, y)

# Report the fitted parameters. The R2 below is computed on the same data
# used for fitting, not on held-out data.
coef_by_feature = {name: coef for name, coef in zip(model.feature_names_in_, model.coef_)}
print("Coeficientes: ", coef_by_feature)
print("Intercept: ", model.intercept_)
print("Score (R2): ", model.score(X, y))

Coeficientes:  {'bill_length_mm': 20.00993836410321, 'bill_depth_mm': 70.52979499482603, 'flipper_length_mm': 15.830902053361005, 'island_Dream': -17.77304154023821, 'island_Torgersen': -24.261509874675646, 'sex_Male': 379.1273438433128, 'sex_nan': -27.055622613129103, 'species_Chinstrap': -270.86894893979826, 'species_Gentoo': 987.6111910851722}
Intercept:  -1544.2586708350282
Score (R2):  0.8723738276892019
