# Je suis tous les conseils de GPT

In [43]:
import numpy as np
import pandas as pd
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
from ydata_profiling import ProfileReport

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer



# Read the three competition CSV files from the current working directory:
# the labelled training set, the unlabelled test set and the sample submission.
train, test, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)


In [44]:
# Preview the first two rows of each frame. A notebook cell only renders its
# final expression automatically, so the training preview is shown explicitly
# with display() (available in an IPython/Jupyter kernel without an import);
# as a bare expression its result would be computed and silently discarded.
display(train.head(2))
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [45]:
# Manipulation of data

# Age thresholds (in years) for the two age-band indicator features.
CHILD_MAX_AGE = 14   # 'Jeune' is 1 when Age is strictly below this value
SENIOR_MIN_AGE = 60  # 'Vieux' is 1 when Age is at or above this value

# Median age per (Sex, Pclass) group, learned from the training set only.
# The same medians are applied to both frames so that a passenger with a
# missing age receives the same imputed value whichever file the row is in;
# computing separate medians on the test set makes train and test
# preprocessing inconsistent.
age_medians = train.groupby(["Sex", "Pclass"])["Age"].median().rename("AgeMedian")

# Fallback for rows whose (Sex, Pclass) group is absent from the training set
# or has no known age at all; without it such rows would keep a NaN age.
age_fallback = train["Age"].median()

for frame in (train, test):
    # Look up each row's group median; the join keeps the frame's own index,
    # so the result lines up row-for-row with frame["Age"].
    group_median = frame.join(age_medians, on=["Sex", "Pclass"])["AgeMedian"]
    frame["Age"] = frame["Age"].fillna(group_median).fillna(age_fallback)

    # Binary age-band flags, computed after imputation so no row compares NaN.
    frame["Jeune"] = (frame["Age"] < CHILD_MAX_AGE).astype(int)
    frame["Vieux"] = (frame["Age"] >= SENIOR_MIN_AGE).astype(int)

# Extraction of titles from names
def extract_title(name):
    """Return the title bucket parsed from a passenger name.

    The title is the text that follows the first comma, up to the next
    period (e.g. ``"Kelly, Mr. James"`` -> ``"Mr"``), with surrounding
    whitespace removed.

    Args:
        name: The passenger name. Non-string values (``None``, ``NaN``) are
            accepted and treated as having no recognisable title.

    Returns:
        One of ``'Mr'``, ``'Mrs'``, ``'Miss'`` or ``'Master'`` when the parsed
        title is exactly one of those; ``'Rare'`` for any other title, for a
        name without the ``", Title."`` pattern, and for non-string input.
    """
    # A missing name read from CSV arrives as float NaN; re.search would raise
    # TypeError on it, so bucket it with the unrecognised titles instead.
    if not isinstance(name, str):
        return 'Rare'

    match = re.search(r',\s*([^\.]*)\.', name)
    if match is None:
        return 'Rare'

    title = match.group(1).strip()
    return title if title in ('Mr', 'Mrs', 'Miss', 'Master') else 'Rare'

# Integer code assigned to each title bucket returned by extract_title.
mapping = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4}

for frame in (train, test):
    # Title: parse the bucket from the name, then replace it with its code.
    frame['Title'] = frame['Name'].apply(extract_title).map(mapping)

    # Family size and alone status: FamilySize is SibSp + Parch + 1, and
    # isAlone is 1 exactly when FamilySize equals 1.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['isAlone'] = (frame['FamilySize'] == 1).astype(int)

# Median fare learned from the training set only; it is used to fill missing
# fares in both frames so that train and test are preprocessed identically
# (filling the test set with its own median would give the two frames
# different imputation values).
fare_median = train['Fare'].median()

for frame in (train, test):
    frame['Fare'] = frame['Fare'].fillna(fare_median)

    # Fare log transformation: log1p(x) = log(1 + x), so a fare of 0 maps to 0
    # instead of -inf.
    frame['Farelog'] = np.log1p(frame['Fare'])
    
# Signed numeric code for sex: male -> -1, female -> 1.
sex_codes = {'male': -1, 'female': 1}

for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    # Keep only the first character of the cabin value; missing cabins are
    # replaced by 'U' first, whose first character is 'U' itself.
    frame['cabinInitial'] = frame['Cabin'].fillna('U').apply(lambda cabin: cabin[0])

# One-hot encoding of categorical features: each key is a column to expand,
# each value is the prefix given to the indicator columns it produces.
dummy_prefixes = {'cabinInitial': 'Cabin', 'Embarked': 'Embarked', 'Pclass': 'Class', 'Title': 'Title'}

# Raw columns removed once the engineered features exist; errors='ignore'
# skips any of them that is not present in the frame.
unused_columns = ['Fare', 'Ticket', 'Name', 'SibSp', 'Parch', 'Cabin']

train = (
    pd.get_dummies(train, columns=list(dummy_prefixes), prefix=dummy_prefixes)
    .drop(columns=unused_columns, errors='ignore')
)
test = (
    pd.get_dummies(test, columns=list(dummy_prefixes), prefix=dummy_prefixes)
    .drop(columns=unused_columns, errors='ignore')
)


In [46]:
# Gradient boosting classifier; random_state is fixed so repeated runs use the
# same seed.
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "subsample": 0.9,
    "random_state": 42,
}
gb = GradientBoostingClassifier(**gb_params)

# Target vector, cast to integers.
y = train["Survived"].astype(int)

# Feature frames: the target and the identifier column are removed before
# aligning (errors="ignore" tolerates a column that is already absent).
train_features = train.drop(columns=["Survived", "PassengerId"], errors="ignore")
test_features = test.drop(columns=["PassengerId"], errors="ignore")

# Put the test features on the training columns: join="left" keeps exactly the
# training columns, and any of them missing from the test frame is filled with 0.
X, X_test = train_features.align(test_features, join="left", axis=1, fill_value=0)

# Generate one HTML profiling report each for X, y and X_test.
profiling_jobs = (
    (X, "Profiling Report for X", "profiling_X.html"),
    (pd.DataFrame(y), "Profiling Report for y", "profiling_y.html"),
    (X_test, "Profiling Report for X_test", "profiling_X_test.html"),
)
for profiled_frame, report_title, report_path in profiling_jobs:
    profile = ProfileReport(profiled_frame, title=report_title, explorative=True)
    profile.to_file(report_path)

# 5-fold stratified cross-validation (shuffled, seeded), scored by accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(gb, X, y, cv=cv, scoring="accuracy")
print("CV mean accuracy:", scores.mean(), "+/-", scores.std())

# Fit the final model on all training rows, then predict the test rows.
gb.fit(X, y)
predicted_classes = gb.predict(X_test)

# Build the submission table (test PassengerId + predicted class) and write it
# to disk without the index column.
submission = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": predicted_classes,
    }
)
submission.to_csv("submission.csv", index=False)
print("submission.csv created ✅")



100%|██████████| 27/27 [00:00<?, ?it/s]32 [00:00<00:00, 163.22it/s, Describe variable: Title_4] 
Summarize dataset: 100%|██████████| 45/45 [00:02<00:00, 20.45it/s, Completed]                      
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  6.86it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 194.24it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]/6 [00:00<?, ?it/s, Describe variable: Survived]
Summarize dataset: 100%|██████████| 10/10 [00:00<00:00, 118.13it/s, Completed]               
Generate report structure: 100%|██████████| 1/1 [00:00<00:00, 12.17it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 79.04it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 990.62it/s]
100%|██████████| 27/27 [00:00<?, ?it/s]32 [00:00<00:00, 215.78it/s, Describe variable: Title_4] 
Summarize dataset: 100%|██████████| 45/45 [00:01<00:00, 25.24it/s, Completed]                      
Generate report struct

CV mean accuracy: 0.8450881928315862 +/- 0.017113235789162625
submission.csv created ✅
