# Instalando bibliotecas

In [1]:
%pip install torch matplotlib pandas seaborn

Note: you may need to restart the kernel to use updated packages.


# Importando bibliotecas

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# --- 1. Limpeza e pré-processamento de dados ---

In [3]:
def preprocess_data(df):
    """Clean a raw passenger table and turn it into model features.

    Steps, in order:
      1. Drop the identifier/free-text columns and the ``Survived`` target.
      2. Coerce ``Age``, ``Fare``, ``Parch`` and ``SibSp`` to float
         (unparseable entries become NaN).
      3. Fill missing ``Age``/``Fare`` with the column median and missing
         ``Embarked`` with its most frequent value.
      4. Replace ``SibSp``/``Parch`` with ``FamilySize = SibSp + Parch + 1``.
      5. One-hot encode ``Sex`` and ``Embarked`` with ``drop_first=True``.

    Each step is skipped when the column it needs is absent, so the same
    function can be applied to frames with and without ``Survived``.
    All fill statistics are computed from the frame that is passed in.

    Args:
        df: Raw passenger DataFrame. It is not modified.

    Returns:
        A new DataFrame holding the engineered features.
    """
    df = df.copy()

    # Columns that are not used as features
    columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Force the numeric columns to float; bad values turn into NaN
    numerical_features = ['Age', 'Fare', 'Parch', 'SibSp']
    for col in numerical_features:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

    # Fill missing values
    if 'Age' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age'].median())
    if 'Fare' in df.columns:
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    if 'Embarked' in df.columns:
        embarked_modes = df['Embarked'].mode()
        # mode() is empty when every value is missing; indexing it would raise,
        # so in that case the column is left as-is.
        if not embarked_modes.empty:
            df['Embarked'] = df['Embarked'].fillna(embarked_modes.iloc[0])

    # Derived feature: the passenger plus siblings/spouses plus parents/children
    if 'SibSp' in df.columns and 'Parch' in df.columns:
        df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
        df = df.drop(columns=['SibSp', 'Parch'])

    # One-hot encode the categorical columns that are present
    cols_to_get_dummies = [col for col in ['Sex', 'Embarked'] if col in df.columns]
    df = pd.get_dummies(df, columns=cols_to_get_dummies, drop_first=True)

    return df

# Load the raw CSVs and run both through the same preprocessing
train_df_raw = pd.read_csv('train.csv')
test_df_raw = pd.read_csv('test.csv')

# preprocess_data works on its own copy, so the raw frames stay untouched.
# NOTE(review): the test file is imputed with its own medians/mode here, not
# the training ones — revisit if strict train/test consistency is required.
train_df = preprocess_data(train_df_raw)
test_df = preprocess_data(test_df_raw)

X_train_full = train_df
y_train_full = train_df_raw['Survived']

# Give the test set exactly the training feature columns: columns missing from
# the test set are added as 0, extra ones are dropped, and the order is matched.
missing_cols_in_test = set(X_train_full.columns) - set(test_df.columns)
X_test = test_df.reindex(columns=X_train_full.columns, fill_value=0)

# Reproducible 80/20 train/validation split over a shuffled row order
np.random.seed(42)
indices = np.random.permutation(len(X_train_full))
split_idx = int(0.8 * len(indices))
train_indices = indices[:split_idx]
val_indices = indices[split_idx:]

X_train = X_train_full.iloc[train_indices].copy()
y_train = y_train_full.iloc[train_indices].copy()
X_val = X_train_full.iloc[val_indices].copy()
y_val = y_train_full.iloc[val_indices].copy()

# Standardise the continuous features using training-split statistics only.
# Only columns that actually exist are scaled, so a missing feature cannot
# raise a KeyError further down.
numerical_features_to_scale = [
    col for col in ['Age', 'Fare', 'FamilySize'] if col in X_train.columns
]

train_mean = X_train[numerical_features_to_scale].astype(float).mean()
train_std = X_train[numerical_features_to_scale].astype(float).std()
# A constant column has std 0; dividing by it would fill the column with NaN/inf
train_std = train_std.replace(0, 1.0)

scaled_frames = []
for frame in (X_train, X_val, X_test):
    frame = frame.copy()
    if numerical_features_to_scale:
        frame[numerical_features_to_scale] = (
            frame[numerical_features_to_scale].astype(float) - train_mean
        ) / train_std
    # The remaining columns (e.g. one-hot dummies) are cast to float as well
    scaled_frames.append(frame.astype(float))

X_train, X_val, X_test = scaled_frames


# --- 2. PyTorch Dataset e DataLoader ---

In [4]:
class TitanicDataset(Dataset):
    """In-memory dataset of float32 feature rows with optional labels.

    ``features`` (and ``labels``, when given) must expose ``.values``; the
    underlying arrays are converted to float32 tensors once, at construction.
    Labels receive a trailing dimension of size 1.

    Indexing returns ``(features, label)`` when labels were supplied and just
    ``features`` otherwise.
    """

    def __init__(self, features, labels=None):
        self.features = torch.tensor(features.values, dtype=torch.float32)
        self.labels = None
        if labels is not None:
            label_tensor = torch.tensor(labels.values, dtype=torch.float32)
            self.labels = label_tensor.unsqueeze(1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        if self.labels is None:
            return sample
        return sample, self.labels[idx]

# Wrap the training and validation splits as datasets
train_dataset = TitanicDataset(features=X_train, labels=y_train)
val_dataset = TitanicDataset(features=X_val, labels=y_val)

# Mini-batches of 32; only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)


# --- 3. Implementação da MLP ---

In [5]:
class MLP(nn.Module):
    """Feed-forward network: input -> 128 -> 64 -> 1.

    ReLU follows each hidden layer and dropout (p=0.2) follows the first one.
    The final layer has no activation, so ``forward`` returns one raw value
    per input row.

    Args:
        input_size: Number of input features per row.
    """

    def __init__(self, input_size):
        super().__init__()
        layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),  # single output unit, no activation
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.network(x)
        return logits


input_size = X_train.shape[1]

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation, dropout masks and the DataLoader shuffle order are the same
# on every Restart & Run All.
torch.manual_seed(42)

model = MLP(input_size)
criterion = nn.BCEWithLogitsLoss()  # expects raw logits; applies the sigmoid itself
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training configuration
num_epochs = 50
log_every = 50  # print progress every `log_every` epochs (and on the last one)
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    running_loss = 0.0
    train_samples = 0
    for features, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # smaller final batch does not count as much as a full one.
        batch_size = features.size(0)
        running_loss += loss.item() * batch_size
        train_samples += batch_size

    train_losses.append(running_loss / max(train_samples, 1))

    # --- validation pass (dropout off, no gradients) ---
    model.eval()
    val_running_loss = 0.0
    val_samples = 0
    with torch.no_grad():
        for features, labels in val_loader:
            outputs = model(features)
            loss = criterion(outputs, labels)
            batch_size = features.size(0)
            val_running_loss += loss.item() * batch_size
            val_samples += batch_size

    val_losses.append(val_running_loss / max(val_samples, 1))

    if (epoch + 1) % log_every == 0 or (epoch + 1) == num_epochs:
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_losses[-1]:.4f}, Validation Loss: {val_losses[-1]:.4f}')

Epoch [50/50], Training Loss: 0.3835, Validation Loss: 0.3955


# --- 4. Avaliação e Visualização ---

In [6]:


# Loss curves: one line per split, written to disk and then closed
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_losses, label='Erro de Treinamento')
ax.plot(val_losses, label='Erro de Validação')
ax.set_title('Curvas de Erro por Época')
ax.set_xlabel('Época')
ax.set_ylabel('Erro (Perda)')
ax.legend()
ax.grid(True)
fig.savefig('loss_curve_pytorch.png')
plt.close(fig)

# Predict on the validation split.
# eval() turns dropout off explicitly instead of relying on whatever mode the
# training loop left the model in, and no_grad() avoids building an autograd
# graph that is never used.
model.eval()
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
with torch.no_grad():
    val_logits = model(torch.tensor(X_val.values, dtype=torch.float32))
    y_pred_val_tensor = torch.sigmoid(val_logits)
# squeeze(1) drops only the output dimension, so a single-row validation set
# still yields a 1-D tensor. Probabilities above 0.5 count as class 1.
y_pred_val_binary = (y_pred_val_tensor.squeeze(1) > 0.5).int()

# Manual confusion matrix and accuracy
def _as_flat(values):
    """Return ``values`` as a 1-D array/tensor."""
    if hasattr(values, 'reshape'):
        return values.reshape(-1)
    return np.asarray(values).reshape(-1)


def calculate_metrics(y_true, y_pred):
    """Compute the binary confusion matrix and the accuracy.

    Both inputs are flattened first, so a column vector of shape ``(N, 1)``
    is compared element-wise with a vector of shape ``(N,)`` instead of being
    broadcast into an N x N comparison.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1), same number of elements as ``y_true``.

    Returns:
        ``(cm, accuracy)`` where ``cm`` is the 2x2 NumPy array
        ``[[TN, FP], [FN, TP]]`` and ``accuracy`` is ``(TP + TN) / total``.
        For empty input the matrix is all zeros and the accuracy is NaN.

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    y_true = _as_flat(y_true)
    y_pred = _as_flat(y_pred)
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    true_positive = ((y_true == 1) & (y_pred == 1)).sum().item()
    true_negative = ((y_true == 0) & (y_pred == 0)).sum().item()
    false_positive = ((y_true == 0) & (y_pred == 1)).sum().item()
    false_negative = ((y_true == 1) & (y_pred == 0)).sum().item()

    cm = np.array([[true_negative, false_positive], [false_negative, true_positive]])
    total = true_positive + true_negative + false_positive + false_negative
    # Accuracy is undefined without samples; report NaN rather than dividing by zero
    accuracy = (true_positive + true_negative) / total if total else float('nan')
    return cm, accuracy

# Score the validation predictions
cm, accuracy = calculate_metrics(y_val_tensor, y_pred_val_binary)
print(f"\nAcurácia do modelo no conjunto de validação: {accuracy:.4f}")

# Confusion-matrix heatmap (rows: actual class, columns: predicted class),
# written to disk and then closed
class_names = ['Não Sobreviveu', 'Sobreviveu']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Matriz de Confusão (Conjunto de Validação)')
ax.set_xlabel('Predito')
ax.set_ylabel('Real')
fig.savefig('confusion_matrix_pytorch.png')
plt.close(fig)




Acurácia do modelo no conjunto de validação: 0.8268
