In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder

  from pandas.core.computation.check import NUMEXPR_INSTALLED


# Load & Clean Data

This section handles the initial preparation of the Titanic dataset, including loading the data and performing essential cleaning and feature engineering. Each step is chosen with reasoning to improve data quality and model performance:

1. **Load Data**  

2. **Clean Data & Feature Engineering**  
   - **Extract Titles**: Titles (e.g., Mr, Mrs, Miss) are extracted from passenger names to capture social status, which correlates with survival. Rare titles are grouped as "Rare" to avoid sparse categories.  
   
   - **Impute Missing Ages**: Age is strongly correlated with `Sex`, `Pclass`, and `Title`. We fill missing values using the median within these groups, with additional fallbacks for robustness. This preserves meaningful patterns rather than using a global median.  
   
   - **Fill Missing Embarked Values**: Only two missing values exist. They are filled with the mode to maintain consistency without biasing the dataset.  
   
   - **Cabin Features**: The first letter of the cabin is used to capture deck-level information. Missing cabins are labeled `"NoCabin"` to retain the distinction between passengers with and without cabin assignments.  
   
   - **Fill Fare Values**: Fares recorded as 0 are replaced with the median fare grouped by `CabinDeck` and `Embarked`, because fare is correlated with location and cabin level. This avoids distorting the data with invalid 0 values.  
   
   - **Family Size**: Created as `SibSp + Parch + 1`. Family presence can influence survival probability, capturing the effect of traveling alone vs. with family.

This preprocessing ensures a clean, meaningful dataset ready for modeling while retaining important patterns and relationships in the data.


In [3]:
def load_data(path):
    return pd.read_csv(path)


def clean_data(df):
    df = df.copy()

    # Extract title
    df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)
    rare_titles = df["Title"].value_counts()[df["Title"].value_counts() < 10].index
    df["Title"] = df["Title"].replace(rare_titles, "Rare")

    # Fill missing Age using median by Sex, Pclass, Title + 2 fallbacks - 20% of missing, age correalated w/ sex/class/title
    df["Age"] = df.groupby(["Sex", "Pclass","Title"])["Age"].transform(lambda x: x.fillna(x.median()))
    df["Age"] = df.groupby(["Sex","Pclass"])["Age"].transform(lambda x: x.fillna(x.median()))
    df["Age"] = df.groupby(["Title"])["Age"].transform(lambda x: x.fillna(x.median()))
    
    # Fill Embarked - only 2 missing value
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
    
    # Cabin â†’ HasCabin
    df["HasCabin"] = df["Cabin"].notna().astype(int)
    df["CabinDeck"] = df["Cabin"].str[0].fillna("NoCabin")
    
    # Fill Fare
    df["Fare"] = df.groupby(["CabinDeck", "Embarked"])["Fare"].transform(lambda x: x.mask(x==0, x.median()))

    # FamilySize
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["FarePerPerson"] = df["Fare"] / df["FamilySize"]
    
    # Age group
    bins = [0, 12, 18, 35, 60, 120]
    labels = ["Child", "Teen", "Adult", "MidAge", "Senior"]
    df["AgeGroup"] = pd.cut(df["Age"], bins=bins, labels=labels)
    
    # Interaction
    df["Sex_Pclass"] = df["Sex"] + "_" + df["Pclass"].astype(str)
    
    return df

# Base Model Builder

In [4]:
def build_base_models():
    lr = LogisticRegression(max_iter=500, C=1.0, class_weight='balanced', solver='liblinear', random_state=42)
    rf = RandomForestClassifier(random_state=42, class_weight='balanced')
    xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)
    voting = VotingClassifier(
        estimators=[("lr", lr), ("rf", rf), ("xgb", xgb)],
        voting="soft"
    )
    return lr, rf, xgb, voting

# Build Preprocessing Pipeline

In [5]:
def get_pipeline(numeric_features, categorical_features, model):
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    return pipeline

def optimize_voting(df):
    numeric_features = ['Age', 'Fare', 'FamilySize', 'FarePerPerson']
    categorical_features = ['Sex', 'Pclass', 'Embarked', 'Title', 'CabinDeck', 'AgeGroup', 'Sex_Pclass']
    
    _, xgb, lgbm, voting = build_base_models()
    pipeline = get_pipeline(numeric_features, categorical_features, voting)
    
    param_grid = {
        'classifier__xgb__n_estimators': [200, 300, 400],
        'classifier__xgb__learning_rate': [0.05, 0.1],
        'classifier__xgb__max_depth': [3,4,5],
        'classifier__lgbm__n_estimators': [300, 400, 500],
        'classifier__lgbm__learning_rate': [0.05, 0.1],
        'classifier__lgbm__max_depth': [3,4,5]
    }
    
    grid = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    
    X = df[numeric_features + categorical_features]
    y = df['Survived']
    
    grid.fit(X, y)
    print("Best CV Accuracy:", grid.best_score_)
    print("Best Params:", grid.best_params_)
    
    return grid.best_estimator_

# Train & Evaluate & Predict

In [6]:
def train_model(X, y, model):
    pipeline = build_pipeline(model)
    pipeline.fit(X, y)
    return pipeline


def evaluate(model, X, y):
    preds = model.predict(X)
    acc = accuracy_score(y, preds)
    return acc

def predict(model, test_df):
    X_test, _ = get_feature_target(test_df, target=None)
    return model.predict(X_test)

# ---------
# Output
# ---------

In [17]:
df = load_data("train.csv")
df = clean_data(df)

numeric_features = ['Age', 'Fare', 'FamilySize', 'FarePerPerson']
# numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Sex', 'Pclass', 'Embarked', 'Title', 'CabinDeck', 'AgeGroup', 'Sex_Pclass']
# categorical_features = ['Sex', 'Pclass', 'Embarked', 'CabinDeck']

X = df[numeric_features + categorical_features]
y = df['Survived']

lr, xgb, lgbm, voting = build_base_models()  
pipeline = get_pipeline(numeric_features, categorical_features, voting)

pipeline.fit(X, y)

def evaluate(model, X, y):
    return model.score(X, y)

print("Train accuracy:", evaluate(pipeline, X, y))

Train accuracy: 0.9629629629629629


Parameters: { "use_label_encoder" } are not used.



In [20]:
test_df = load_data("test.csv")
test_df = clean_data(test_df)

X_test = test_df[numeric_features + categorical_features]

y_pred = pipeline.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": y_pred
})

submission.to_csv("submission.csv", index=False)
print("Saved predictions to submission.csv")

Saved predictions to submission.csv
