# Importando bibliotecas

## Hyperparameter Tuning using GridSearchCV
In this section, GridSearchCV is applied to tune the hyperparameters of the model to optimize its performance. We are using recall as the evaluation metric.

In [62]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [63]:
warnings.filterwarnings("ignore")

In [64]:
test_x = pd.read_csv("data/data_base/titanic/test.csv")
test_y = pd.read_csv("data/data_base/titanic/gender_submission.csv")
train = pd.read_csv("data/data_base/titanic/train.csv")

In [65]:
test = test_y.merge(test_x, on="PassengerId", how="inner")
df = pd.concat([train,  test])

# Funções utilitária

In [66]:
def eval_model(pred, real):
  dict_metrics = {
      "acc": accuracy_score(real, pred),
      "precision": precision_score(real, pred),
      "f1":  f1_score(real, pred),
      "recall": recall_score(real, pred)
  }

  return dict_metrics

# Exploração dos Dados

In [67]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [68]:
print(f"Total de Linhas: {df.shape[0]}\n")
print("Valores Únicos por Coluna:")
unique_values = df.nunique()
for col, unique in unique_values.items():
    print(f"{col}: {unique}")


Total de Linhas: 1309

Valores Únicos por Coluna:
PassengerId: 1309
Survived: 2
Pclass: 3
Name: 1307
Sex: 2
Age: 98
SibSp: 7
Parch: 8
Ticket: 929
Fare: 281
Cabin: 186
Embarked: 3


In [69]:
df.isna().sum() / df.shape[0]

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.200917
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000764
Cabin          0.774637
Embarked       0.001528
dtype: float64

In [70]:
tr_size, te_size = train.shape[0], test.shape[0]
total_rows = tr_size + te_size
cols = train.shape[1]

print(f"{'#' * 5} Data {'#' * 5}")
print(f"Total Rows: {total_rows}\nTotal Columns: {cols}")
print(f"Train Size: {tr_size} rows ({(tr_size / total_rows * 100):.2f}%)")
print(f"Test Size: {te_size} rows ({(te_size / total_rows * 100):.2f}%)")


##### Data #####
Total Rows: 1309
Total Columns: 12
Train Size: 891 rows (68.07%)
Test Size: 418 rows (31.93%)


# Criação de Modelos Baseline

In [71]:
tr_size, te_size = train.shape[0], test.shape[0]
total_rows = tr_size + te_size
cols = train.shape[1]

In [72]:
print(f"{'#' * 5} Data {'#' * 5}")
print(f"Total Rows: {total_rows}\nTotal Columns: {cols}")
print(f"Train Size: {tr_size} rows ({(tr_size / total_rows * 100):.2f}%)")
print(f"Test Size: {te_size} rows ({(te_size / total_rows * 100):.2f}%)")

##### Data #####
Total Rows: 1309
Total Columns: 12
Train Size: 891 rows (68.07%)
Test Size: 418 rows (31.93%)


##  Testando Engenharia de Atributos

In [73]:
# Agrupando idades por intervalos de 10 anos
df['Age'] = df['Age'].apply(lambda x: (int(x * 0.1) * 10) if pd.notna(x) else np.NaN)

# Separando as informações do ticket
df['Zone'] = df['Cabin'].apply(lambda x: x[0] if isinstance(x, str) and len(x) > 0 else None)
df['cabin_number'] = df['Cabin'].apply(lambda x: x[1:] if isinstance(x, str) and len(x) > 1 else None)

In [74]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Zone,cabin_number
0,1,0,3,"Braund, Mr. Owen Harris",male,20.0,1,0,A/5 21171,7.25,,S,,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,30.0,1,0,PC 17599,71.2833,C85,C,C,85.0
2,3,1,3,"Heikkinen, Miss. Laina",female,20.0,0,0,STON/O2. 3101282,7.925,,S,,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,30.0,1,0,113803,53.1,C123,S,C,123.0
4,5,0,3,"Allen, Mr. William Henry",male,30.0,0,0,373450,8.05,,S,,


## Definição de Variáveis Preditivas e Variável Alvo

In [75]:
X = df.drop('Survived', axis=1)
y = df['Survived']

In [76]:
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns

## Definindo o pré-processamento para variáveis categóricas e numéricas

In [77]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ])


## Criando modelos para comparação

In [78]:
models = {
    "Random Forest": RandomForestClassifier(random_state=132),
    "Logistic Regression": LogisticRegression(random_state=132, max_iter=1000),
    "Gradient Boosting": GradientBoostingClassifier(random_state=132)
}

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=132)

In [80]:
best_model = None
best_score = 0

In [81]:
for name, model in models.items():
    # Criando pipeline com o pré-processamento e o modelo
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    
    # Treinamento
    pipeline.fit(X_train, y_train)
    
    # Previsões
    y_pred = pipeline.predict(X_test)
    
    # Avaliando a acurácia
    score = accuracy_score(y_test, y_pred)
    print(f"{name} - Acurácia: {score:.4f}")
    
    # Salvando o melhor modelo
    if score > best_score:
        best_score = score
        best_model = pipeline


Random Forest - Acurácia: 0.8817
Logistic Regression - Acurácia: 0.8664
Gradient Boosting - Acurácia: 0.8855


In [82]:
print(f"\nMelhor Modelo: {best_model.named_steps['model'].__class__.__name__} com acurácia de {best_score:.4f}")
y_pred_best = best_model.predict(X_test)
print(classification_report(y_test, y_pred_best))


Melhor Modelo: GradientBoostingClassifier com acurácia de 0.8855
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       182
           1       0.80      0.84      0.82        80

    accuracy                           0.89       262
   macro avg       0.86      0.87      0.87       262
weighted avg       0.89      0.89      0.89       262



#  Regressão Logística

In [83]:
test = test_y.merge(test_x, on="PassengerId", how="inner")
df = pd.concat([train,  test])

In [84]:
df.is_male = df.Sex.apply(lambda x: 1 if x == "male" else 0)
df.drop("Sex", axis=1, inplace=True)

In [85]:
df.Age = df.Age.apply(lambda x: int(x*0.1)*10 if pd.notna(x) else np.NaN)

df['Zone'] = df.Cabin.apply(lambda x: x[0] if not isinstance(x, float) else None)
df['cabin_number'] = df.Cabin.apply(lambda x: x[1:] if x and not isinstance(x, float) and len(x) > 1 else None)

le = LabelEncoder()
df["Zone"] = le.fit_transform(df["Zone"])

df.drop(["Cabin", "Name"], axis=1, inplace=True)

In [86]:
df["Embarked"] = le.fit_transform(df["Embarked"])
df["cabin_number"] = le.fit_transform(df["cabin_number"])

In [87]:
df.drop("Ticket", axis=1, inplace=True)

In [88]:
df.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Age             263
SibSp             0
Parch             0
Fare              1
Embarked          0
Zone              0
cabin_number      0
dtype: int64

In [89]:
df.Age.fillna(df.Age.mean(), inplace=True)
df.dropna(subset=["Fare"], inplace=True)

In [90]:
df.isna().sum()

PassengerId     0
Survived        0
Pclass          0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked        0
Zone            0
cabin_number    0
dtype: int64

In [91]:
train = df.loc[train.index]
train_x, train_y = train.drop("Survived", axis=1), train["Survived"]

test = df.loc[test.index]
test_x, test_y = test.drop("Survived", axis=1), test["Survived"]

## Modelo: Regressão Logística

In [92]:
model = LogisticRegression()
model.fit(train_x, train_y)

In [93]:
pred = model.predict(test_x)

In [94]:
eval_model(pred=pred, real=test_y)

{'acc': 0.6730538922155689,
 'precision': 0.64,
 'f1': 0.4129032258064516,
 'recall': 0.3047619047619048}

Resultados estão distantes de satisfatório

# Random Forest

In [95]:
model_rf = RandomForestClassifier()
model_rf.fit(train_x, train_y)

In [96]:
pred_rf = model_rf.predict(test_x)

In [97]:
eval_model(pred=pred_rf, real=test_y)

{'acc': 1.0, 'precision': 1.0, 'f1': 1.0, 'recall': 1.0}

Possivel overfit

## Explorando Overfit

In [98]:
pred_rf_train = model_rf.predict(train_x)

In [99]:
eval_model(pred=pred_rf_train, real=train_y)

{'acc': 1.0, 'precision': 1.0, 'f1': 1.0, 'recall': 1.0}

In [100]:
X, y = df.drop("Survived", axis=1), df["Survived"]
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print(f"Acurácia = {scores.mean()}")

Acurácia = 0.6742856307215349


Overfit confirmado!

# Gradient Boosting

In [102]:
model_gb = GradientBoostingClassifier()
model_gb.fit(train_x, train_y)

In [103]:
pred_gb = model_gb.predict(test_x)

In [104]:
eval_model(pred=pred_gb, real=test_y)

{'acc': 0.7988023952095809,
 'precision': 0.8418604651162791,
 'f1': 0.6830188679245283,
 'recall': 0.5746031746031746}

## Ajustando Hiperparâmetros

In [105]:
param_grid = {
    'n_estimators': [100, 200, 300],         
    'max_depth': [3, 5, 7],               
    'learning_rate': [0.01, 0.1, 0.2],    
    'subsample': [0.8, 1.0],              
    'min_samples_split': [2, 5, 10],      
}


In [106]:
grid_search.fit(train_x, train_y)
print(f"Melhores Hiperparâmetros: {grid_search.best_params_}")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test_x)
print(f"Recall na base de teste: {recall_score(test_y, y_pred):.4f}")


Melhores Hiperparâmetros: {'learning_rate': 0.2, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.5}
Recall na base de teste: 0.9365


In [111]:
best_model = grid_search.best_estimator_

## Modelo final

In [112]:
model_gb = GradientBoostingClassifier(
    learning_rate = 0.3,
    max_depth = 3,
    min_samples_leaf = 6,
    min_samples_split = 2,
    n_estimators = 100,
    subsample = 0.8
)

model_gb.fit(train_x, train_y)

In [113]:
pred = model_gb.predict(test_x)

In [114]:
eval_model(pred=pred, real=test_y)

{'acc': 0.8610778443113772,
 'precision': 0.8812260536398467,
 'f1': 0.7986111111111112,
 'recall': 0.7301587301587301}