In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Load the raw passenger table; the path is relative to the working directory.
csv_path = "../data/raw/Titanic-Dataset.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preprocessing
# 1. Title extraction: take the word that precedes a period in each name and
#    fold rare or equivalent honorifics into a handful of groups.
#    The pattern is a raw string so that `\.` reaches the regex engine verbatim
#    instead of being flagged by Python as an invalid escape sequence.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
title_groups = {
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
    'Royalty': ['Lady', 'Countess', 'Dona'],
    'Officer': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Sir': ['Don', 'Sir', 'Jonkheer'],
}
# Invert the grouping into a flat {raw title -> group} lookup.
title_map = {raw: group for group, raws in title_groups.items() for raw in raws}
df['Title'] = df['Title'].replace(title_map)

# 2. Family size: SibSp + Parch + 1 (the passenger), so it is always >= 1
#    whenever both counts are non-negative.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1


def _fill_with_mode(values):
    """Fill missing entries with the most frequent value; leave `values` untouched when nothing is observed."""
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# 3. Missing values: impute from the passenger's group first, then fall back to
#    the column-wide statistic so that a group with no observed values cannot
#    leave NaN behind (the resampler and the estimators downstream reject NaN).
age_group_median = df.groupby(['Pclass', 'Sex', 'Title'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(age_group_median).fillna(df['Age'].median())

df['Embarked'] = df.groupby('Pclass')['Embarked'].transform(_fill_with_mode)
df['Embarked'] = _fill_with_mode(df['Embarked'])

fare_group_median = df.groupby(['Pclass', 'FamilySize'])['Fare'].transform('median')
df['Fare'] = df['Fare'].fillna(fare_group_median).fillna(df['Fare'].median())

# 4. Derived features
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)      # 1 when travelling without family
df['HasCabin'] = (~df['Cabin'].isna()).astype(int)       # 1 when a cabin value is recorded
df['FarePerPerson'] = df['Fare'] / df['FamilySize']

# Columns handed to the models.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
            'Title', 'FamilySize', 'IsAlone', 'HasCabin', 'FarePerPerson']

# Replace each string-valued column with integer codes.
# A separate fitted encoder is kept per column in `label_encoders`, so the
# code -> label mapping of every column stays available (for decoding, or for
# encoding new rows the same way). Refitting a single shared encoder would
# only retain the mapping of the last column processed.
categorical_features = ['Sex', 'Embarked', 'Title']
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le
# `le` is left bound to the encoder of the last column ('Title').

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class ratio.
X = df[features]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Balance the classes in the training split only (the test split is untouched).
# Plain SMOTE builds synthetic rows by interpolating every column, which turns
# integer category codes and 0/1 flags into fractional values with no meaning.
# SMOTENC interpolates only the continuous columns and assigns the declared
# categorical columns from the neighbouring samples instead.
nominal_columns = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'HasCabin']
smote = SMOTENC(
    categorical_features=[X_train.columns.get_loc(column) for column in nominal_columns],
    random_state=42,
)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Base learners, all seeded for repeatability.
# NOTE(review): three of them weight class 1 three times as heavily as class 0
# even though the training split has already been oversampled above -- confirm
# that this double emphasis on the positive class is intended.
rf = RandomForestClassifier(
    class_weight={0: 1, 1: 3},
    random_state=42,
)
gb = GradientBoostingClassifier(
    random_state=42,
)
lr = LogisticRegression(
    class_weight={0: 1, 1: 3},
    max_iter=1000,
    random_state=42,
)
# probability=True so the SVM exposes predict_proba.
svm = SVC(
    class_weight={0: 1, 1: 3},
    probability=True,
    random_state=42,
)

# Hyper-parameter search spaces, one per base learner.
random_forest_space = dict(
    n_estimators=[300, 400, 500],
    max_depth=[20, 25, 30],
    min_samples_split=[2, 3],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)
gradient_boosting_space = dict(
    n_estimators=[300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[4, 5, 6],
    subsample=[0.8, 0.9, 1.0],
)
logistic_regression_space = dict(
    C=[0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)
svm_space = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'linear'],
    gamma=['scale', 'auto'],
)

# Keyed by the display name used to look each grid up in the training loop.
param_grid = {
    'Random Forest': random_forest_space,
    'Gradient Boosting': gradient_boosting_space,
    'Logistic Regression': logistic_regression_space,
    'SVM': svm_space,
}

# Accumulators filled by the training loop:
#   results     -> per-model metrics plus the winning hyper-parameters
#   best_models -> per-model best estimator returned by the grid search
results = {}
best_models = {}

# One shuffled, seeded 5-fold stratified splitter shared by every CV call.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tune, evaluate and record each base learner in turn.
candidates = [
    ('Random Forest', rf),
    ('Gradient Boosting', gb),
    ('Logistic Regression', lr),
    ('SVM', svm),
]
for name, model in candidates:
    print(f"\nTreinando {name}...")

    # Exhaustive search over this model's grid, selecting on precision.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid[name],
        scoring='precision',
        cv=skf,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Keep the best configuration found by the search.
    best_model = grid_search.best_estimator_
    best_models[name] = best_model

    # Score on the held-out split.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # NOTE(review): this second cross-validation runs on the full X, y (which
    # contains the held-out rows and is not resampled) and uses the estimator's
    # default scorer rather than precision -- confirm this is the comparison
    # metric that is wanted.
    cv_scores = cross_val_score(best_model, X, y, cv=skf)
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    results[name] = dict(
        accuracy=accuracy,
        cv_mean=cv_mean,
        cv_std=cv_std,
        best_params=grid_search.best_params_,
    )

    # Per-class report on the held-out split.
    print(f"\nRelatório de Classificação para {name}:")
    print(classification_report(y_test, y_pred))

    # Cross-validation summary (mean and two standard deviations).
    print(f"\nResultados da Validação Cruzada para {name}:")
    print(f"Média: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    print(f"Melhores parâmetros: {grid_search.best_params_}")

# Stack the tuned base learners under a logistic-regression meta-learner,
# using the same stratified splitter for the internal folds.
estimators = list(best_models.items())
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=skf,
)

# Fit on the (resampled) training split.
stacking_clf.fit(X_train, y_train)

# Held-out accuracy, then cross-validation on the full X, y.
y_pred_ensemble = stacking_clf.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
cv_scores_ensemble = cross_val_score(stacking_clf, X, y, cv=skf)

# Record the ensemble next to the base learners (it has no 'best_params').
results['Stacking'] = dict(
    accuracy=accuracy_ensemble,
    cv_mean=cv_scores_ensemble.mean(),
    cv_std=cv_scores_ensemble.std(),
)

print("\nResultados do Stacking Ensemble:")
print(f"Acurácia: {accuracy_ensemble:.3f}")
print(f"Validação Cruzada: {cv_scores_ensemble.mean():.3f} (+/- {cv_scores_ensemble.std() * 2:.3f})")

# Pick the entry with the highest mean cross-validation score; on a tie the
# entry that was recorded first wins.
best_model_name = max(results, key=lambda model_name: results[model_name]['cv_mean'])
print(f"\nMelhor modelo: {best_model_name}")
print(f"Acurácia: {results[best_model_name]['accuracy']:.3f}")
print(f"Validação Cruzada: {results[best_model_name]['cv_mean']:.3f} (+/- {results[best_model_name]['cv_std'] * 2:.3f})")

# Persist the winner. The stacking ensemble is not stored in `best_models`,
# so it is looked up separately. `joblib` is imported with the other
# dependencies at the top of the notebook.
best_model_instance = stacking_clf if best_model_name == 'Stacking' else best_models[best_model_name]

# The file is written to the current working directory.
model_path = os.path.join(os.getcwd(), 'best_model.joblib')
joblib.dump(best_model_instance, model_path)
print(f"\nMelhor modelo salvo em: {model_path}")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Score the fitted stacking ensemble on the held-out split.
y_pred_ensemble = stacking_clf.predict(X_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)

# Reuse `cv_scores_ensemble` from the cell that trained the ensemble instead of
# cross-validating the whole stack a second time: the base estimators and the
# splitter use fixed seeds, so a rerun should reproduce the same scores at a
# considerable cost.
results['Stacking'] = {
    'accuracy': accuracy_ensemble,
    'cv_mean': cv_scores_ensemble.mean(),
    'cv_std': cv_scores_ensemble.std()
}

# Confusion matrix of the ensemble: rows are true labels, columns predictions.
cm_ensemble = confusion_matrix(y_test, y_pred_ensemble)
outcome_labels = ['Não Sobreviveu', 'Sobreviveu']
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm_ensemble, annot=True, fmt='d', cmap='Blues',
            xticklabels=outcome_labels,
            yticklabels=outcome_labels,
            ax=ax)
ax.set_title('Matriz de Confusão - Stacking Ensemble')
ax.set_ylabel('Verdadeiro')
ax.set_xlabel('Predito')
plt.show()

print("\nResultados do Stacking Ensemble:")
print(f"Acurácia: {accuracy_ensemble:.3f}")
print(f"Validação Cruzada: {cv_scores_ensemble.mean():.3f} (+/- {cv_scores_ensemble.std() * 2:.3f})")
