# Je suis tous les conseils de GPT

In [107]:
import numpy as np
import pandas as pd
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
from ydata_profiling import ProfileReport

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer



# Read the three CSV inputs; the relative paths are resolved against the
# kernel's current working directory.
train, test, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)


In [108]:
# A notebook cell renders only its last bare expression, so both previews are
# displayed explicitly (otherwise the train preview is silently discarded).
display(train.head(2), test.head(2))

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [109]:
# Manipulation of data

# Impute missing ages with the median age of the passenger's (Sex, Pclass)
# group. The medians are learned on the training set only and reused for the
# test set, so a given group is filled with the same value in both frames.
AGE_GROUP_KEYS = ["Sex", "Pclass"]
age_group_medians = train.groupby(AGE_GROUP_KEYS)["Age"].median().rename("AgeMedian")
# Fallback for a group that has no known age at all in the training set;
# computed before any filling so it reflects observed ages only.
age_overall_median = train["Age"].median()

for frame in (train, test):
    # Look up each row's group median; the left join keeps the frame's own index,
    # so the result aligns row-for-row with frame["Age"].
    row_median = frame[AGE_GROUP_KEYS].join(age_group_medians, on=AGE_GROUP_KEYS)["AgeMedian"]
    frame["Age"] = frame["Age"].fillna(row_median).fillna(age_overall_median)

    # Age-bucket indicator columns ("Jeune" = young, "Vieux" = old)
    frame["Jeune"] = (frame["Age"] < 14).astype(int)   # 1 when younger than 14
    frame["Vieux"] = (frame["Age"] >= 60).astype(int)  # 1 when 60 or older

# Extraction of titles from names
def extract_title(name, common_titles=('Mr', 'Mrs', 'Miss', 'Master')):
    """Return the honorific found in a passenger name, bucketing uncommon ones.

    The title is the text between the first comma and the next period,
    e.g. ``"Kelly, Mr. James"`` gives ``"Mr"``.

    Parameters
    ----------
    name : str
        Raw name value. Anything that is not a string (for example the NaN
        pandas uses for a missing cell) is treated as having no title.
    common_titles : collection of str, optional
        Titles that are returned unchanged; every other title is bucketed.

    Returns
    -------
    str
        The matched title when it is one of ``common_titles``; ``'Rare'`` for
        any other title, for a name without the ", Title." pattern, and for
        non-string input.
    """
    # Guard first: re.search raises TypeError on non-string input.
    if not isinstance(name, str):
        return 'Rare'

    match = re.search(r',\s*([^.]*)\.', name)
    if match is None:
        return 'Rare'

    title = match.group(1).strip()
    return title if title in common_titles else 'Rare'

# Categorical columns to one-hot encode, mapped to the prefix of their dummy columns
CATEGORICAL_PREFIXES = {'cabinInitial': 'Cabin', 'Embarked': 'Embarked', 'Pclass': 'Class', 'Title': 'Title'}
# Raw columns removed once the derived features exist
RAW_COLUMNS_TO_DROP = ['Fare', 'Ticket', 'Name', 'SibSp', 'Parch', 'Cabin']


def _cabin_initial(cabin):
    """Return the first character of a cabin string, or 'U' for the 'U'
    placeholder and for cabins starting with 'T'."""
    return 'U' if cabin == 'U' or cabin[0] == 'T' else cabin[0]


def add_engineered_features(df, fare_fill):
    """Return a copy of ``df`` with the derived model features added.

    The same function is applied to the train and the test frame so both go
    through exactly the same transformations.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns Name, SibSp, Parch, Fare, Sex, Cabin,
        Embarked and Pclass.
    fare_fill : float
        Value used to replace missing fares.

    Returns
    -------
    pandas.DataFrame
        New frame with Title, FamilySize, isAlone, Farelog and a numeric Sex,
        the categorical columns one-hot encoded, and the raw columns listed in
        ``RAW_COLUMNS_TO_DROP`` removed. ``df`` itself is not modified.
    """
    out = df.copy()

    out['Title'] = out['Name'].apply(extract_title)

    # Family size (siblings/spouses + parents/children + the passenger) and alone status
    out['FamilySize'] = out['SibSp'] + out['Parch'] + 1
    out['isAlone'] = (out['FamilySize'] == 1).astype(int)

    # Fare log transformation, after filling the gaps
    out['Fare'] = out['Fare'].fillna(fare_fill)
    out['Farelog'] = np.log1p(out['Fare'])

    # Any value other than 'male' / 'female' becomes NaN
    out['Sex'] = out['Sex'].map({'male': 0, 'female': 1})

    # Missing cabins get the 'U' placeholder before the initial is taken
    out['cabinInitial'] = out['Cabin'].fillna('U').apply(_cabin_initial)

    # One-hot encoding of categorical features
    out = pd.get_dummies(out, columns=list(CATEGORICAL_PREFIXES), prefix=CATEGORICAL_PREFIXES)

    return out.drop(columns=RAW_COLUMNS_TO_DROP, errors='ignore')


# The fare filler is learned on the training set and reused for the test set,
# so both frames are imputed with the same value.
train_fare_median = train['Fare'].median()
train = add_engineered_features(train, train_fare_median)
test = add_engineered_features(test, train_fare_median)


In [110]:
# --- Target and feature matrices ---
# Survival flag as an integer target
y = train["Survived"].astype(int)

# Everything except the target and the passenger identifier is a feature;
# errors="ignore" tolerates a column that is absent from one of the frames
train_features = train.drop(columns=["Survived", "PassengerId"], errors="ignore")
test_features = test.drop(columns=["PassengerId","Survived"], errors="ignore")

# Give the test matrix exactly the training columns: a column missing from the
# test set is created and filled with 0
X, X_test = train_features.align(test_features, join="left", axis=1, fill_value=0)

# Diagnostics: shapes and column mismatches before and after the alignment
print(train.shape, test.shape)
print(train_features.columns.difference(test_features.columns))
print('---')
print(X.shape, X_test.shape)
for left, right in ((X, X_test), (X_test, X)):
    print(left.columns.difference(right.columns))

# --- Model ---
gb = GradientBoostingClassifier(
    n_estimators=500, learning_rate=0.02,
    max_depth=3, subsample=0.8,
    random_state=42,
)

# --- 10-fold stratified cross-validation (shuffled, fixed seed) ---
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(estimator=gb, X=X, y=y, scoring="accuracy", cv=cv)
print("CV mean accuracy:", scores.mean(), "+/-", scores.std())

# --- Train the final model on all training rows, then predict on test ---
predicted_classes = gb.fit(X, y).predict(X_test)

# --- Create submission DataFrame and write it without the index column ---
submission_columns = {
    "PassengerId": test["PassengerId"],
    "Survived": predicted_classes,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission_ia_v2.csv", index=False)
print("submission_ia_v2.csv created successfully.")



(891, 28) (418, 27)
Index([], dtype='object')
---
(891, 26) (418, 26)
Index([], dtype='object')
Index([], dtype='object')
CV mean accuracy: 0.833832709113608 +/- 0.029871115178412608
submission_ia_v2.csv created successfully.


In [None]:
# NOTE(review): this import lives in the model cell rather than in the import
# cell at the top of the notebook; consider moving it there.
from xgboost import XGBClassifier

# XGBoost run on the same X, y and X_test that the gradient-boosting cell
# built; those names must already exist in the kernel.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = XGBClassifier(**xgb_params)

# --- 10-fold stratified cross-validation (shuffled, fixed seed) ---
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=cv)
print("CV mean accuracy:", scores.mean(), "+/-", scores.std())

# --- Train the final model on all training rows ---
model.fit(X, y)

# --- Predict on test ---
predicted_classes = model.predict(X_test)

# --- Create submission DataFrame and write it without the index column ---
submission_columns = {
    "PassengerId": test["PassengerId"],
    "Survived": predicted_classes,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission_ia_XGB_v1.csv", index=False)
print("submission_ia_XGB_v1.csv created successfully.")


CV mean accuracy: 0.83270911360799 +/- 0.026589122620672426
submission_ia_XGB_v1.csv created successfully.
