<a href="https://colab.research.google.com/github/Rogerio-mack/IMT_CD_2025/blob/main/IMT_ex_hot_null_encode_scale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<head>
  <meta name="author" content="Rogério de Oliveira">
  <meta institution="author" content="ITM">
</head>

<img src="https://maua.br/images/selo-60-anos-maua.svg" width=300, align="right">
<!-- <h1 align=left><font size = 6, style="color:rgb(200,0,0)"> optional title </font></h1> -->


# Preparação dos dados: Nulos, Hot & Label Encode, Scale

## imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
#@markdown Just run

df = sns.load_dataset('penguins')

df_mass = df.sample(n=3, random_state=42)
df_mass.drop(columns=['body_mass_g'], inplace=True)
df_mass.reset_index(drop=True, inplace=True)
df = df.drop(df_mass.index)
df.reset_index(drop=True, inplace=True)

df_species = df.sample(n=3, random_state=1)
df_species.drop(columns=['species'], inplace=True)
df_species.reset_index(drop=True, inplace=True)
df = df.drop(df_mass.index)
df.reset_index(drop=True, inplace=True)

df.iloc[np.random.choice(len(df), 2, replace=False),2] = np.nan
df.iloc[np.random.choice(len(df), 3, replace=False),3] = np.nan
df.iloc[np.random.choice(len(df), 4, replace=False),4] = np.nan


In [3]:
display(df.head())
print("Shape of df:", df.shape)
print()
display(df_mass)
print("Shape of df_mass:", df_mass.shape)
print()
display(df_species)
print("Shape of df_species:", df_species.shape)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
1,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
2,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
3,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
4,Adelie,Torgersen,37.8,17.1,186.0,3300.0,


Shape of df: (338, 7)



Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,sex
0,Chinstrap,Dream,50.9,19.1,196.0,Male
1,Chinstrap,Dream,45.2,17.8,198.0,Female
2,Gentoo,Biscoe,46.5,13.5,210.0,Female


Shape of df_mass: (3, 6)



Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Biscoe,39.7,18.9,184.0,3550.0,Male
1,Torgersen,39.0,17.1,191.0,3050.0,Female
2,Torgersen,34.6,21.1,198.0,4400.0,Male


Shape of df_species: (3, 6)


# 1. Tratamento de nulos

Altere os dados acima preenchendo os valores nulos com valores que melhor representam a distribuição geral dos dados.

In [4]:
df.isna().sum()

Unnamed: 0,0
species,0
island,0
bill_length_mm,3
bill_depth_mm,4
flipper_length_mm,5
body_mass_g,1
sex,10


In [5]:
df[ df.isna().any(axis=1) ]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
2,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
3,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
4,Adelie,Torgersen,37.8,17.1,186.0,3300.0,
5,Adelie,Torgersen,37.8,17.3,180.0,3700.0,
20,Adelie,Biscoe,40.6,18.6,,3550.0,Male
25,Adelie,Dream,,18.1,178.0,3900.0,Male
40,Adelie,Dream,41.1,,182.0,3425.0,Male
41,Adelie,Dream,37.5,18.9,179.0,2975.0,
106,Adelie,Biscoe,39.7,17.7,,3200.0,Female
110,Adelie,Torgersen,,17.0,188.0,2900.0,Female


In [6]:
np.random.seed(42)

for c in df:
  # Get the non-null 'sex' values from rows without any missing values
  non_null_values = df[~df.isna().any(axis=1)][c].dropna()

  # Replace the missing 'sex' values with random samples from the non-null values
  df[c] = df[c].apply(lambda x: np.random.choice(non_null_values) if pd.isna(x) else x)

# Display the rows with missing values to confirm 'sex' is filled
display(df[df.isna().any(axis=1)])

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex


# 2. Hot & Label encode

Empregando os estimadores do scikit-learn, faça o label enconde de sex e o hot encode dos demais dados categóricos de df.

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Select the columns to hot encode
categorical_cols = ['island', 'species']

# Initialize the OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

# Fit and transform the selected columns
encoded_data = encoder.fit_transform(df[categorical_cols])

# Create a new DataFrame with the encoded data and appropriate column names
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

# Drop the original categorical columns from df
df = df.drop(columns=categorical_cols)

# Concatenate the original df with the encoded DataFrame
df = pd.concat([df, encoded_df], axis=1)

display(df.head())

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,island_Dream,island_Torgersen,species_Chinstrap,species_Gentoo
0,38.9,17.8,181.0,3625.0,Female,0.0,1.0,0.0,0.0
1,39.2,19.6,195.0,4675.0,Male,0.0,1.0,0.0,0.0
2,34.1,18.1,193.0,3475.0,Female,0.0,1.0,0.0,0.0
3,42.0,20.2,190.0,4250.0,Male,0.0,1.0,0.0,0.0
4,37.8,17.1,186.0,3300.0,Male,0.0,1.0,0.0,0.0


In [8]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'sex' column
df['sex'] = label_encoder.fit_transform(df['sex'])

display(df.head())

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,island_Dream,island_Torgersen,species_Chinstrap,species_Gentoo
0,38.9,17.8,181.0,3625.0,0,0.0,1.0,0.0,0.0
1,39.2,19.6,195.0,4675.0,1,0.0,1.0,0.0,0.0
2,34.1,18.1,193.0,3475.0,0,0.0,1.0,0.0,0.0
3,42.0,20.2,190.0,4250.0,1,0.0,1.0,0.0,0.0
4,37.8,17.1,186.0,3300.0,1,0.0,1.0,0.0,0.0


# 3. Scale, Regressão Linear

3.1. Faça um modelo de regressão linear para predição do peso dos pinguins aplicando em df_mass.

3.2. Repita para diferentes normalizações dos dados.

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Define features (X) and target (y)
X = df.drop('body_mass_g', axis=1)
y = df['body_mass_g']

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X, y)

# Make predictions on the entire dataset
y_pred = model.predict(X)

# Evaluate the model
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 93729.96715413703
R-squared: 0.8544607112958614


## **Cuidado!**

In [14]:
%%script echo skipped... forma errada de aplicar
# Fit and transform the selected columns
encoded_data_mass = encoder.fit_transform(df_mass[categorical_cols])

# Create a new DataFrame with the encoded data and appropriate column names
encoded_df_mass = pd.DataFrame(encoded_data_mass, columns=encoder.get_feature_names_out(categorical_cols))

# Drop the original categorical columns from df
df_mass = df_mass.drop(columns=categorical_cols)

# Concatenate the original df with the encoded DataFrame
df_mass = pd.concat([df_mass, encoded_df_mass], axis=1)

df_mass['sex'] = label_encoder.fit_transform(df_mass['sex'])

df_mass.head()

skipped... forma errada de aplicar


In [15]:
#
# forma correta de aplicar... only transform
#
# Fit and transform the selected columns
encoded_data_mass = encoder.transform(df_mass[categorical_cols])

# Create a new DataFrame with the encoded data and appropriate column names
encoded_df_mass = pd.DataFrame(encoded_data_mass, columns=encoder.get_feature_names_out(categorical_cols))

# Drop the original categorical columns from df
df_mass = df_mass.drop(columns=categorical_cols)

# Concatenate the original df with the encoded DataFrame
df_mass = pd.concat([df_mass, encoded_df_mass], axis=1)

df_mass['sex'] = label_encoder.transform(df_mass['sex'])

df_mass.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,sex,island_Dream,island_Torgersen,species_Chinstrap,species_Gentoo
0,50.9,19.1,196.0,1,1.0,0.0,1.0,0.0
1,45.2,17.8,198.0,0,1.0,0.0,1.0,0.0
2,46.5,13.5,210.0,0,0.0,0.0,0.0,1.0


In [16]:
model.predict(df_mass)

array([4016.03729513, 3457.00266189, 4642.9239245 ])

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define features (X) and target (y)
X = df.drop('body_mass_g', axis=1)
y = df['body_mass_g']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)

# Initialize and train the Linear Regression model on scaled data
model_scaled = LinearRegression()
model_scaled.fit(X_scaled, y)

# Make predictions on the scaled data
y_pred_scaled = model_scaled.predict(X_scaled)

# Evaluate the model
mse_scaled = mean_squared_error(y, y_pred_scaled)
r2_scaled = r2_score(y, y_pred_scaled)

print(f"Mean Squared Error (Scaled Data): {mse_scaled}")
print(f"R-squared (Scaled Data): {r2_scaled}")

Mean Squared Error (Scaled Data): 93729.96715413705
R-squared (Scaled Data): 0.8544607112958613


In [18]:
model_scaled.predict(scaler.transform(df_mass))

array([4016.03729513, 3457.00266189, 4642.9239245 ])

# 4. Scale, Regressão Logística

3.1. Faça um modelo de regressão logística para predição da espécie dos pinguins aplicando em df_species.

3.2. Qual a acuracidade do modelo?

3.3. Repita para diferentes normalizações dos dados.

**Este, como falamos, é praticamente uma reprodução dos exercícios anteriores, não é portanto necessário um gabarito**.