## Modelagem - Predição usando Regressão Logística e Random Forest

In [None]:
# Bibliotecas
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

In [None]:
# Report the number of missing values per column of the cleaned frame.
# (Printed once; the original cell emitted the identical report twice.)
print(df_limpo.isnull().sum())


In [None]:
# Logistic Regression: preprocessing + classifier in one pipeline, fitted on
# the training split (Pipeline.fit returns the pipeline itself).
# NOTE(review): the same `preprocessor` object is also handed to the Random
# Forest pipeline in a later cell — confirm that sharing it is intended.
lr_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
    ]
).fit(X_train, y_train)

# Test-set outputs: column 1 of predict_proba (used as the score for AUC)
# and the hard class predictions.
y_pred_proba_lr = lr_pipeline.predict_proba(X_test)[:, 1]
y_pred_lr = lr_pipeline.predict(X_test)

# Evaluation report printed to stdout
print("Regressão Logística - Métricas:")
print(classification_report(y_test, y_pred_lr))
print("Acurácia:", accuracy_score(y_test, y_pred_lr))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba_lr))

# Confusion matrix rendered as an annotated heatmap (integer cell format)
cm_lr = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de Confusão - Regressão Logística')
plt.show()

In [None]:
# Random Forest: preprocessing + classifier in one pipeline, fitted on the
# training split (Pipeline.fit returns the pipeline itself).
# NOTE(review): the same `preprocessor` object is also handed to the Logistic
# Regression pipeline in an earlier cell — confirm that sharing it is intended.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
).fit(X_train, y_train)

# Test-set outputs: column 1 of predict_proba (used as the score for AUC)
# and the hard class predictions.
y_pred_proba_rf = rf_pipeline.predict_proba(X_test)[:, 1]
y_pred_rf = rf_pipeline.predict(X_test)

# Evaluation report printed to stdout
print("Random Forest - Métricas:")
print(classification_report(y_test, y_pred_rf))
print("Acurácia:", accuracy_score(y_test, y_pred_rf))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba_rf))

# Confusion matrix rendered as an annotated heatmap (integer cell format)
cm_rf = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de Confusão - Random Forest')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

## Helper that draws one model's ROC curve
def plot_roc_curve(y_true, y_pred_prob, model_name):
    """Draw the ROC curve of one model on the current matplotlib axes.

    Plots the curve labelled with the model name and its AUC, adds the
    dashed diagonal reference line, and sets the axis labels, title, legend
    and grid. The function neither creates nor shows a figure, so several
    calls in a row overlay their curves on the same axes.

    Args:
        y_true: True labels, passed unchanged to ``roc_curve``.
        y_pred_prob: Scores for the positive class, passed unchanged to
            ``roc_curve``.
        model_name: Text used in the legend entry for this curve.

    Returns:
        None.
    """
    # roc_curve also returns the thresholds; they are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')
    # Unlabelled diagonal reference line; it is redrawn on every call.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('Taxa de Falsos Positivos (FPR)')
    plt.ylabel('Taxa de Verdadeiros Positivos (TPR)')
    plt.title('Curva ROC - Comparação de Modelos')
    plt.legend(loc='lower right')
    plt.grid(True)

## Column 1 of predict_proba on the test set, for each fitted pipeline
y_pred_prob_lr = lr_pipeline.predict_proba(X_test)[:, 1]
y_pred_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

# Legend/report label -> scores, in the order the models are drawn and printed
roc_inputs = {
    'Regressão Logística': y_pred_prob_lr,
    'Random Forest': y_pred_prob_rf,
}

## Overlay both ROC curves on a single figure
plt.figure(figsize=(10, 8))
for roc_model_name, roc_scores in roc_inputs.items():
    plot_roc_curve(y_test, roc_scores, roc_model_name)

plt.show()

## Print the ROC-AUC of each model to stdout (nothing is added to the plot)
print("\nMétricas ROC-AUC:")
for roc_model_name, roc_scores in roc_inputs.items():
    print(f"{roc_model_name}: {roc_auc_score(y_test, roc_scores):.4f}")

In [None]:
# Recover the column names produced by the one-hot encoder inside the fitted
# Random Forest pipeline.
onehot_columns = (
    rf_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_features)
)
# list(...) on both sides guarantees plain list concatenation: if
# `numeric_features` were a pandas Index or NumPy array, `+` would be applied
# element-wise instead of appending the names.
# NOTE(review): assumes the preprocessor emits the numeric columns first and
# the one-hot columns after them — confirm against the ColumnTransformer.
all_features = list(numeric_features) + list(onehot_columns)

# Pair each feature name with the importance reported by the forest and
# sort from most to least important.
importances = rf_pipeline.named_steps['classifier'].feature_importances_
feature_importance = pd.DataFrame({'Feature': all_features, 'Importance': importances})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Horizontal bar chart of the 20 most important features
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(20))
plt.title('Top 20 Variáveis Mais Importantes - Random Forest')
plt.show()