In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Data Overview

## Data Cleaning

### Drop Columns
- `Name`
- `Ticket`

### Convert Columns
- `Sex`: change to boolean (`male=True`, `female=False`)

### Feature Engineering
- `Cabin`: create `has_cabin` column (0 if missing, 1 if present)
- Drop original `Cabin` column

### One-Hot Encoding
- `Embarked`: create dummy variables for `C`, `Q`, `S`

### Fill Missing Float Values
- `Age`: fill missing values with median
- `Fare`: fill missing values with median


In [2]:
# -------------------------------
# 1. Load & Preprocess Data
# -------------------------------

def load_data(path):
    return pd.read_csv(path)


def clean_data(df):
    """Basic cleaning used in Titanic Kaggle baseline."""
    df = df.copy()

    # Extract Title
    df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)
    rare_titles = df["Title"].value_counts()[df["Title"].value_counts() < 10].index
    df["Title"] = df["Title"].replace(rare_titles, "Rare")

    # Fill missing Age using median by Title
    df["Age"] = df.groupby("Title")["Age"].transform(lambda x: x.fillna(x.median()))

    # Fill Embarked
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

    # Fill Fare
    if "Fare" in df.columns:
        df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    # Cabin â†’ HasCabin
    df["HasCabin"] = df["Cabin"].notna().astype(int)

    # FamilySize
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    return df

## Correlation Check

In [3]:
# -------------------------------
# 2. Feature Selection
# -------------------------------

def get_feature_target(df, target="Survived"):
    features = [
        "Pclass", "Sex", "Age", "Fare", "Embarked",
        "FamilySize", "HasCabin"
    ]
    X = df[features]
    y = df[target] if target in df.columns else None
    return X, y

## Model Introduction

We will use the following models:  
- **Logistic Regression**  
- **Random Forest**  
- **XGBoost**

### Hyperparameter Tuning
We will perform **parameter grid search** to find the best model settings for optimal performance.


In [4]:
# -------------------------------
# 3. Model Builder
# -------------------------------

def build_models():
    lr = LogisticRegression(max_iter=200)
    rf = RandomForestClassifier(n_estimators=400, random_state=42)
    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric='logloss',
        random_state=42,
    )

    voting = VotingClassifier(
        estimators=[("lr", lr), ("rf", rf), ("xgb", xgb)],
        voting="soft"
    )
    return lr, rf, xgb, voting

In [5]:
# -------------------------------
# 4. Build Preprocessing Pipeline
# -------------------------------

def build_pipeline(model):
    """Automatically handles numeric + categorical preprocessing."""
    numeric = ["Age", "Fare", "FamilySize"]
    categorical = ["Pclass", "Sex", "Embarked", "HasCabin"]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), numeric),
            ("cat", Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot",  __import__("sklearn.preprocessing").preprocessing.OneHotEncoder(handle_unknown="ignore"))
            ]), categorical),
        ]
    )

    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", model)
    ])

    return pipeline

In [6]:
# -------------------------------
# 5. Train Model
# -------------------------------

def train_model(X, y, model):
    pipeline = build_pipeline(model)
    pipeline.fit(X, y)
    return pipeline


# -------------------------------
# 6. Evaluate Model
# -------------------------------

def evaluate(model, X, y):
    preds = model.predict(X)
    acc = accuracy_score(y, preds)
    return acc


# -------------------------------
# 7. Predict Test Data
# -------------------------------

def predict(model, test_df):
    X_test, _ = get_feature_target(test_df, target=None)
    return model.predict(X_test)

# Output

In [8]:
df = load_data("train.csv")
df = clean_data(df)

X, y = get_feature_target(df)

lr, rf, xgb, voting = build_models()
model = train_model(X, y, voting)

print("Train accuracy:", evaluate(model, X, y))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train accuracy: 0.9259259259259259


In [9]:
test = load_data("test.csv")
test = clean_data(test)
preds = predict(model, test)

pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": preds}).to_csv("submission.csv", index=False)