In [40]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
from ydata_profiling import ProfileReport
from usefull_fct import FeatureBuilder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier


# Read the three CSV inputs from the current working directory in one pass.
# `gender_submission` is loaded for reference only; none of the cells shown
# in this notebook read it (presumably it is the example submission file —
# TODO confirm).
train, test, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)


In [41]:
# Column groups consumed by the ColumnTransformer below. Some of these names
# (e.g. "Farelog", "FamilySize", "isAlone", "Title", "AgeBin") are not raw CSV
# columns referenced anywhere else in this notebook, so they are expected to be
# produced by the upstream feature-building step — verify against FeatureBuilder.
cat_cols = ["Sex", "Embarked", "Pclass", "Title", "AgeBin"]
num_cols = ["Age", "Farelog", "FamilySize", "isAlone"]

# Encoding stage of the pipeline:
#   * "cat": one-hot encode the categorical columns; categories unseen during
#            fit are ignored at transform time instead of raising.
#   * "num": forward the numeric columns unchanged.
# Any column not listed in either group is dropped.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
)

In [42]:
# Gradient-boosted trees: many shallow trees with a small learning rate,
# each tree fit on an 80% row subsample. The seed is fixed so results repeat.
gb = GradientBoostingClassifier(
    random_state=42,
    n_estimators=500,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.8,
)

# End-to-end estimator that accepts the raw DataFrame:
#   1. "build" – project-local FeatureBuilder (from usefull_fct); per the
#                original author's note it derives features such as a
#                group-wise Age imputation. Its behavior is not visible here.
#   2. "prep"  – one-hot encodes categoricals and passes numerics through.
#   3. "model" – the gradient boosting classifier defined above.
clf = Pipeline(
    [
        ("build", FeatureBuilder()),
        ("prep", preprocessor),
        ("model", gb),
    ]
)

In [43]:
# Separate the target from the predictors, working straight from the raw
# training frame (feature engineering happens inside the pipeline).
X_raw = train.drop("Survived", axis=1)
y = train["Survived"].astype(int)

# 10-fold stratified CV with a fixed shuffle seed. Because the feature builder
# and encoder are pipeline steps, the whole pipeline is re-fit on each
# training fold rather than on the full data.
cv = StratifiedKFold(10, shuffle=True, random_state=42)
scores = cross_val_score(clf, X_raw, y, scoring="accuracy", cv=cv)
print("CV mean accuracy:", scores.mean(), "+/-", scores.std())


CV mean accuracy: 0.8350062421972535 +/- 0.016024361092099523


In [44]:
# Train the full pipeline on every labelled row.
clf.fit(X_raw, y)

# Score the raw test frame; the pipeline performs feature building and
# encoding itself. A copy is passed so `test` is not handed to the pipeline
# directly.
X_test_raw = test.copy()
pred = clf.predict(X_test_raw)

# Two-column submission: the passenger identifier from the test set next to
# the predicted label, written without the DataFrame index.
submission = test[["PassengerId"]].assign(Survived=pred)
submission.to_csv("submission_pipeline.csv", index=False)
print("submission_pipeline.csv created")

submission_pipeline.csv created
