In [2]:
import pandas as pd
import numpy as np

# Carregar os dados
url = "../data/raw/Titanic-Dataset.csv"
data = pd.read_csv(url)


In [3]:

# Configurar pandas para mostrar 2 casas decimais
pd.set_option('display.precision', 2)

# =============================================
# 1. ANÁLISE EXPLORATÓRIA APRIMORADA
# =============================================
print("\n=== ANÁLISE EXPLORATÓRIA COMPLETA ===")

# 1.1. Análise de Cabines
print("\n--- Análise das Cabines por Classe ---")
data['Cabin_Letter'] = data['Cabin'].str[0].fillna('Unknown')
print("\nCabines registradas por classe:")
print(data.groupby('Pclass')['Cabin'].count())
print("\nLetras de cabine por classe:")
print(data.groupby(['Pclass', 'Cabin_Letter']).size().unstack(fill_value=0))

# 1.2. Análise de Títulos
print("\n--- Análise de Títulos ---")
data['Title'] = data['Name'].str.extract(r',\s([A-Za-z]+)\.', expand=False)
common_titles = ['Mr', 'Mrs', 'Miss', 'Master']
data['Title'] = data['Title'].apply(lambda x: x if x in common_titles else 'Rare')
print("\nDistribuição de títulos:")
print(data['Title'].value_counts())

# 1.3. Análise Familiar
print("\n--- Análise Familiar ---")
data['Family_Size'] = data['SibSp'] + data['Parch'] + 1
data['Is_Alone'] = (data['Family_Size'] == 1).astype(int)
print("\nSobrevivência por tamanho de família:")
print(data.groupby('Family_Size')['Survived'].mean().round(2))



=== ANÁLISE EXPLORATÓRIA COMPLETA ===

--- Análise das Cabines por Classe ---

Cabines registradas por classe:
Pclass
1    176
2     16
3     12
Name: Cabin, dtype: int64

Letras de cabine por classe:
Cabin_Letter   A   B   C   D   E  F  G  T  Unknown
Pclass                                            
1             15  47  59  29  25  0  0  1       40
2              0   0   0   4   4  8  0  0      168
3              0   0   0   0   3  5  4  0      479

--- Análise de Títulos ---

Distribuição de títulos:
Title
Mr        517
Miss      182
Mrs       125
Master     40
Rare       27
Name: count, dtype: int64

--- Análise Familiar ---

Sobrevivência por tamanho de família:
Family_Size
1     0.30
2     0.55
3     0.58
4     0.72
5     0.20
6     0.14
7     0.33
8     0.00
11    0.00
Name: Survived, dtype: float64


In [4]:
# =============================================
# 3. APLICAÇÃO E VERIFICAÇÃO
# =============================================
# Aplicar pré-processamento
X_enhanced, y_enhanced = prepare_enhanced_data(data.copy())

# Verificar dados finais
print("\nDados aprimorados para modelagem (5 primeiras linhas):")
print(X_enhanced.head())

# Mostrar taxa de sobrevivência por novas features
print("\nTaxa de sobrevivência por título:")
print(data.groupby('Title')['Survived'].mean().round(2))


Valores faltantes após limpeza:
Age                  0
SibSp                0
Parch                0
Fare                 0
Has_Cabin            0
Family_Size          0
Is_Alone             0
Sex_male             0
Embarked_Q           0
Embarked_S           0
Pclass_2             0
Pclass_3             0
Title_Miss           0
Title_Mr             0
Title_Mrs            0
Title_Rare           0
Age_Group_Teen       0
Age_Group_Adult      0
Age_Group_Elderly    0
Fare_Group_Q2        0
Fare_Group_Q3        0
Fare_Group_Q4        0
dtype: int64

Dados aprimorados para modelagem (5 primeiras linhas):
    Age  SibSp  Parch  ...  Fare_Group_Q2  Fare_Group_Q3  Fare_Group_Q4
0  22.0      1      0  ...          False          False          False
1  38.0      1      0  ...          False          False           True
2  26.0      0      0  ...           True          False          False
3  35.0      1      0  ...          False          False           True
4  35.0      0      0  ...      

In [None]:
#treinar o modelo  com dados reais 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt

def prepare_enhanced_data(df):
    # 2.1. Tratar dados faltantes
    df['Age'] = df.groupby('Pclass')['Age'].transform(lambda x: x.fillna(x.median()))
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    
    # 2.2. Feature Engineering
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
    df['Is_Alone'] = (df['Family_Size'] == 1).astype(int)
    
    # 2.3. Discretização
    df['Age_Group'] = pd.cut(df['Age'], bins=[0, 12, 18, 60, 100], 
                            labels=['Child', 'Teen', 'Adult', 'Elderly'])
    df['Fare_Group'] = pd.qcut(df['Fare'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    
    # 2.4. Tratar outliers
    fare_upper_limit = df['Fare'].quantile(0.95)
    df['Fare'] = np.where(df['Fare'] > fare_upper_limit, fare_upper_limit, df['Fare'])
    
    # 2.5. Converter variáveis categóricas
    df = pd.get_dummies(df, columns=[
        'Sex', 'Embarked', 'Pclass', 'Title', 'Age_Group', 'Fare_Group'
    ], drop_first=True)
    
    # 2.6. Selecionar features
    features = [
        'Age', 'SibSp', 'Parch', 'Fare', 'Has_Cabin', 'Family_Size', 'Is_Alone',
        'Sex_male', 'Embarked_Q', 'Embarked_S', 
        'Pclass_2', 'Pclass_3',
        'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare',
        'Age_Group_Teen', 'Age_Group_Adult', 'Age_Group_Elderly',
        'Fare_Group_Q2', 'Fare_Group_Q3', 'Fare_Group_Q4'
    ]
    
    # 2.7. Verificar dados faltantes
    print("\nValores faltantes após limpeza:")
    print(df[features].isnull().sum())
    
    return df[features], df['Survived']

# Dividimos os dados em [treino (70%), validação (15%) e teste (15%)]
X_train, X_temp, y_train, y_temp = train_test_split(
    X_enhanced, y_enhanced, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# --- Estratégias Anti-Overfitting ---
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
# Hiperparâmetros para RandomForest
params = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [2, 4],
    'max_features': ['sqrt', 'log2']
}

# Modelo com GridSearchCV
model = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=params,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

# Treinar com dados balanceados
model.fit(X_train_resampled, y_train_resampled)

# Exibir os melhores hiperparâmetros
print("\nMelhores hiperparâmetros:", model.best_params_)


# --- Avaliação no Conjunto de Validação ---
y_val_pred = model.predict(X_val)
print("\nRelatório no conjunto de VALIDAÇÃO:")
print(classification_report(y_val, y_val_pred))

# --- Avaliação Final no Teste ---
y_test_pred = model.predict(X_test)
print("\nRelatório no conjunto de TESTE:")
print(classification_report(y_test, y_test_pred))

# Matriz de Confusão
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Não Sobreviveu", "Sobreviveu"]
)# Aplicar SMOTE para balancear as classes no conjunto de treino
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

disp.plot(cmap="Blues", values_format="d")
plt.title("Matriz de Confusão (Teste) - Titanic")
plt.show()

# --- Curva de Aprendizado para Diagnóstico ---
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    model.best_estimator_,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10)
)
# Utilizando pld para gráficos para matriz de confusão e cuva de aprendizado
plt.figure()
plt.plot(train_sizes, np.mean(train_scores, axis=1), label="Treino")
plt.plot(train_sizes, np.mean(test_scores, axis=1), label="Validação")
plt.xlabel("Tamanho do Conjunto de Treino")
plt.ylabel("Acurácia")
plt.title("Curva de Aprendizado")
plt.legend()
plt.grid()
plt.show()

ValueError: 
All the 120 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\pepeh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\pepeh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\pepeh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\pepeh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 562, in _fit
    self._validate_steps()
  File "C:\Users\pepeh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 339, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=42)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't
