# Titanic

In [46]:
import numpy as np
import pandas as pd
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Load the three input CSV files.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')

# (header, frame) pairs: each header is the exact text printed before the
# corresponding report, so both reports below share a single loop shape.
labelled_frames = [
    ("train:\n", train),
    ("\ntest:\n", test),
    ("\ngender_submission:\n", gender_submission),
]

# Report the number of missing values per column.
print("\nNombre de NaN par colonne pour chaque DataFrame:")
for header, frame in labelled_frames:
    print(header, frame.isna().sum())

# Report the dtype of every column.
print("\nTypes de chaque colonne pour chaque DataFrame:")
for header, frame in labelled_frames:
    print(header, frame.dtypes)



Nombre de NaN par colonne pour chaque DataFrame:
train:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

test:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

gender_submission:
 PassengerId    0
Survived       0
dtype: int64

Nombre de lignes dans chaque DataFrame:
train: 891
test: 418
gender_submission: 418

Types de chaque colonne pour chaque DataFrame:
train:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked 

In [48]:
# Preview the first 20 rows of the training set.
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Premier test : je supprime juste les colonnes où je n'ai pas toutes les valeurs et celles qui ne sont pas des entiers faciles à manipuler

In [49]:
# Feature engineering, applied identically to the train and test sets:
#   - Age: missing values are imputed with the median age of the passenger's
#     (Sex, Pclass) group, then summarised by two indicator columns
#     (Jeune: Age < 14, Vieux: Age >= 60) and dropped.
#   - Fare: missing values are imputed with the median fare.
#   - Pclass: one-hot encoded as Classe_1..Classe_3, then dropped.
#   - Sex: encoded as -1 (male) / +1 (female).
#   - Cabin, Embarked, Ticket and Name are dropped.
# Every imputation statistic is computed on the training set and reused for
# the test set, so both sets go through exactly the same transformation.
AGE_GROUP_KEYS = ["Sex", "Pclass"]
DROPPED_COLUMNS = ['Cabin', 'Embarked', 'Ticket', 'Name', 'Age', 'Pclass']

age_group_medians = train.groupby(AGE_GROUP_KEYS)["Age"].median().rename("AgeGroupMedian")
age_overall_median = train["Age"].median()
fare_median = train["Fare"].median()


def _engineer_features(frame):
    """Return a copy of ``frame`` with imputed and encoded model features."""
    frame = frame.copy()

    # Look up the training-set median age of each row's (Sex, Pclass) group;
    # the join keeps the frame's own index, so fillna aligns row by row.
    group_median = frame.join(age_group_medians, on=AGE_GROUP_KEYS)["AgeGroupMedian"]
    # Fall back to the overall median for a group that has no training median.
    age = frame["Age"].fillna(group_median).fillna(age_overall_median)
    frame['Jeune'] = (age < 14).astype(int)
    frame['Vieux'] = (age >= 60).astype(int)

    frame['Fare'] = frame['Fare'].fillna(fare_median)

    # Indicator columns for Pclass.
    for i in [1, 2, 3]:
        frame[f'Classe_{i}'] = (frame['Pclass'] == i).astype(int)

    # No errors='ignore' here: a misspelled column name must raise instead of
    # being silently skipped.
    frame = frame.drop(columns=DROPPED_COLUMNS)

    frame['Sex'] = frame['Sex'].map({'male': -1, 'female': 1})
    return frame


# Printed before the encoding so the counts show the original labels.
print(train['Sex'].value_counts())

train = _engineer_features(train)
test = _engineer_features(test)

print("\nNombre de NaN par colonne pour chaque DataFrame:")
print("train:\n", train.isna().sum())
print("\ntest:\n", test.isna().sum())

print("\nTypes de chaque colonne pour chaque DataFrame:")
print("train:\n", train.dtypes)
print("\ntest:\n", test.dtypes)

Sex
male      577
female    314
Name: count, dtype: int64

Nombre de NaN par colonne pour chaque DataFrame:
train:
 PassengerId    0
Survived       0
Sex            0
SibSp          0
Parch          0
Fare           0
Jeune          0
Vieux          0
Classe_1       0
Classe_2       0
Classe_3       0
dtype: int64

test:
 PassengerId    0
Sex            0
SibSp          0
Parch          0
Fare           0
Jeune          0
Vieux          0
Classe_1       0
Classe_2       0
Classe_3       0
dtype: int64

Types de chaque colonne pour chaque DataFrame:
train:
 PassengerId      int64
Survived         int64
Sex              int64
SibSp            int64
Parch            int64
Fare           float64
Jeune            int64
Vieux            int64
Classe_1         int64
Classe_2         int64
Classe_3         int64
dtype: object

test:
 PassengerId      int64
Sex              int64
SibSp            int64
Parch            int64
Fare           float64
Jeune            int64
Vieux            int64
C

In [50]:
# Build the model inputs.
# PassengerId is a row identifier, not a predictor, so it is excluded from the
# feature matrices; it stays in `test` because the submission cell reads it.
NON_FEATURE_COLUMNS = ['PassengerId', 'Survived']
feature_columns = [col for col in train.columns if col not in NON_FEATURE_COLUMNS]

y_train = train['Survived'].values
X_train = train[feature_columns].values
# Selecting by name guarantees the test matrix has the same columns, in the
# same order, as the training matrix (and fails loudly if one is missing).
X_test = test[feature_columns].values

# Standardise with statistics learned on the training set only.
# NOTE(review): the scaler is fit before the train/validation split done in a
# later cell, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)  # class indices for classification
X_test = torch.tensor(X_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



In [51]:


class Classifier(nn.Module):
    """Feed-forward classifier with two hidden ReLU layers.

    Dropout (p=0.5) is applied to the activations of the first hidden layer
    only. ``forward`` returns raw logits; no softmax is applied.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation under a fixed seed and state_dict keys are unchanged.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim1)
        self.fc2 = nn.Linear(in_features=hidden_dim1, out_features=hidden_dim2)
        self.dropout = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(in_features=hidden_dim2, out_features=output_dim)

    def forward(self, x):
        """Map a batch of feature vectors to a batch of class logits."""
        first_hidden = self.dropout(F.relu(self.fc1(x)))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)
    

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=100, patience=10):
    """Train ``model`` with early stopping on the validation loss.

    Args:
        model: network to optimise; its parameters are updated in place.
        train_loader: sized iterable of ``(inputs, targets)`` batches used for
            the gradient steps.
        val_loader: sized iterable of ``(inputs, targets)`` batches used for
            validation only.
        criterion: callable ``criterion(outputs, targets)`` returning a scalar
            loss tensor.
        optimizer: optimizer bound to the parameters of ``model``.
        epochs: maximum number of epochs to run.
        patience: number of consecutive epochs without a new best validation
            loss after which training stops.

    Returns:
        The same ``model`` object, loaded with the weights of the epoch that
        reached the lowest validation loss.

    The reported losses are the mean of the per-batch losses; the accuracies
    are computed over all samples seen in the epoch.
    """
    best_val_loss = np.inf
    patience_counter = 0
    best_model_state = None

    for epoch in range(epochs):
        # --- Training ---
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            # Accuracy: the predicted class is the index of the largest logit.
            _, preds = torch.max(outputs, 1)
            train_correct += (preds == y_batch).sum().item()
            train_total += y_batch.size(0)

        train_loss /= len(train_loader)
        train_acc = train_correct / train_total

        # --- Validation ---
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

                _, preds = torch.max(outputs, 1)
                val_correct += (preds == y_batch).sum().item()
                val_total += y_batch.size(0)

        val_loss /= len(val_loader)
        val_acc = val_correct / val_total

        print(f"Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # --- Early stopping check ---
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors,
            # which later optimizer steps modify in place. Clone them so the
            # snapshot really holds the weights of this epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Restore the weights of the best epoch (None only if no epoch improved).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model



In [None]:
# Seed torch so that weight initialisation, batch shuffling and dropout are
# reproducible from one run to the next (the split below is already seeded).
torch.manual_seed(42)

# Hold out 20% of the training data for validation.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

train_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=32, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val, y_val), batch_size=32)

# Model, loss and optimizer; the output size is the number of distinct labels.
model = Classifier(input_dim=X_train.shape[1], hidden_dim1=128, hidden_dim2=64, output_dim=len(set(y_train.numpy())))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

# Train with early stopping on the validation loss.
model = train_model(model, train_loader, val_loader, criterion, optimizer, epochs=100, patience=15)

Epoch 1/100000 | Train Loss: 0.7047, Train Acc: 0.4719 | Val Loss: 0.7007, Val Acc: 0.5084
Epoch 2/100000 | Train Loss: 0.7015, Train Acc: 0.4761 | Val Loss: 0.7005, Val Acc: 0.5140
Epoch 3/100000 | Train Loss: 0.7033, Train Acc: 0.4607 | Val Loss: 0.7003, Val Acc: 0.5251
Epoch 4/100000 | Train Loss: 0.7033, Train Acc: 0.4621 | Val Loss: 0.7001, Val Acc: 0.5307
Epoch 5/100000 | Train Loss: 0.7073, Train Acc: 0.4396 | Val Loss: 0.6999, Val Acc: 0.5307
Epoch 6/100000 | Train Loss: 0.7068, Train Acc: 0.4284 | Val Loss: 0.6997, Val Acc: 0.5363
Epoch 7/100000 | Train Loss: 0.7056, Train Acc: 0.4508 | Val Loss: 0.6995, Val Acc: 0.5419
Epoch 8/100000 | Train Loss: 0.7032, Train Acc: 0.4635 | Val Loss: 0.6993, Val Acc: 0.5419
Epoch 9/100000 | Train Loss: 0.7076, Train Acc: 0.4551 | Val Loss: 0.6992, Val Acc: 0.5419
Epoch 10/100000 | Train Loss: 0.7062, Train Acc: 0.4677 | Val Loss: 0.6990, Val Acc: 0.5475
Epoch 11/100000 | Train Loss: 0.7031, Train Acc: 0.4494 | Val Loss: 0.6988, Val Acc: 0.54

In [53]:
# Inference on the test set: eval mode disables dropout and no_grad skips
# gradient tracking.
model.eval()
with torch.no_grad():
    predictions = model(X_test)
    # The predicted class is the index of the largest logit in each row.
    predicted_classes = torch.argmax(predictions, dim=1)

In [54]:
# Build the submission table: one row per test passenger with its predicted class.
submission = test[['PassengerId']].assign(Survived=predicted_classes.numpy())

# Write it to a CSV file without the index column.
submission.to_csv('submission8.csv', index=False)