In [88]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
from ydata_profiling import ProfileReport
from usefull_fct import FeatureBuilder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')


In [89]:
num_cols = ["Age", "Farelog", "FamilySize", "isAlone"]
cat_cols = ["Sex", "Embarked", "Pclass"]

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot categorical features
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        # Pass-through numerical features
        ("num", "passthrough", num_cols),
    ],
    remainder="drop"  # drop unused raw columns like Name, Ticket, Cabin, etc.
)

In [None]:

gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.9,
    random_state=42
)

rf = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=300,
    max_depth=6,
    random_state=42
)

lr = LogisticRegression(max_iter=2000, random_state=42)

ensemble = VotingClassifier(
    estimators=[('gb', gb), ('rf', rf), ('lr', lr)],
    voting='soft'
)

clf2 = Pipeline(steps=[
    ("build", FeatureBuilder()),   # custom feature engineering
    ("prep", preprocessor),        # encode + scale
    ("model", ensemble)            # ensemble as final estimator
])

clf = Pipeline(steps=[
    ("build", FeatureBuilder()),   # builds features (Age impute by group, etc.)
    ("prep", preprocessor),        # encodes categoricals and passes nums
    ("model", gb)                  # classifier
])

In [91]:
# Split features/target directly from RAW train DataFrame
y = train["Survived"].astype(int)
X_raw = train.drop(columns=["Survived"])

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(clf, X_raw, y, cv=cv, scoring="accuracy")
print("CV mean accuracy:", scores.mean(), "+/-", scores.std())


CV mean accuracy: 0.8361423220973782 +/- 0.014341439699285231


In [92]:
# Fit on full training data
clf.fit(X_raw, y)

# Predict on RAW test DataFrame (Pipeline will build features + encode)
X_test_raw = test.copy()
pred = clf.predict(X_test_raw)

# Build submission
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": pred
})

submission.to_csv("submission_pipeline.csv", index=False)
print("submission_pipeline.csv created")

submission_pipeline.csv created


In [93]:
# --- Cross-validation ---
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(clf2, train, y, cv=cv, scoring="accuracy")
scores2 = cross_val_score(clf, train, y, cv=cv, scoring="balanced_accuracy")
print("CV mean accuracy:", scores.mean(), "+/-", scores.std())
print("CV mean balanced accuracy:", scores2.mean(), "+/-", scores2.std())

# --- Train final model ---
clf2.fit(train, y)

# --- Predict on test ---
predicted_classes = clf2.predict(X_test_raw)

# --- Create submission ---
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": predicted_classes
})
submission.to_csv("submission_voting.csv", index=False)
print("submission_voting.csv created")

CV mean accuracy: 0.8282521847690386 +/- 0.027701408576904177
CV mean balanced accuracy: 0.8163207707325355 +/- 0.02030263222306436
submission_voting.csv created
