In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV

In [None]:
# Load datasets

In [2]:
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# Feature Engineering

In [3]:
train_df['Family_Size'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['Family_Size'] = test_df['SibSp'] + test_df['Parch'] + 1

In [4]:
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Medium', 6: 'Medium', 7: 'Large', 8: 'Large', 11: 'Large'}
train_df['Family_Size_Grouped'] = train_df['Family_Size'].map(family_map)
test_df['Family_Size_Grouped'] = test_df['Family_Size'].map(family_map)

In [5]:
train_df['Title'] = train_df['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0].str.strip()
test_df['Title'] = test_df['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0].str.strip()

In [6]:
title_mapping = {
    'Capt': 'Military', 'Col': 'Military', 'Major': 'Military',
    'Jonkheer': 'Noble', 'the Countess': 'Noble', 'Don': 'Noble',
    'Lady': 'Noble', 'Sir': 'Noble', 'Mlle': 'Noble', 'Ms': 'Noble', 'Mme': 'Noble'
}

In [7]:
train_df['Title'] = train_df['Title'].replace(title_mapping)
test_df['Title'] = test_df['Title'].replace(title_mapping)

In [8]:
train_df['Cabin'] = train_df['Cabin'].fillna('U').apply(lambda x: x[0])
test_df['Cabin'] = test_df['Cabin'].fillna('U').apply(lambda x: x[0])

In [None]:
# Handling missing values

In [30]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [None]:
# Defining feature columns

In [31]:
ode_cols = ['Family_Size_Grouped', 'Title']
ohe_cols = ['Sex', 'Embarked', 'Cabin']

In [None]:
# Data Preprocessing Pipeline

In [32]:
ordinal_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [12]:
ohe_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [33]:
preprocessor = ColumnTransformer([
    ('ordinal', ordinal_pipeline, ode_cols),
    ('onehot', ohe_pipeline, ohe_cols),
    ('impute_age', SimpleImputer(strategy='mean'), ['Age']),
    ('impute_fare', SimpleImputer(strategy='mean'), ['Fare'])
], remainder='passthrough')

In [None]:
# Model and hyperparameter tuning setup

In [34]:
models = {
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [35]:
param_grids = {
    'RandomForest': {
        'classifier__n_estimators': [150, 200, 300],
        'classifier__max_depth': [10, 15, 20],
        'classifier__min_samples_split': [5, 10, 15],
        'classifier__min_samples_leaf': [2, 4, 6],
        'classifier__criterion': ['gini', 'entropy']
    },
    'GradientBoosting': {
        'classifier__n_estimators': [300, 400, 500],
        'classifier__learning_rate': [0.1, 0.3, 0.6],
        'classifier__max_depth': [8, 10, 12],
        'classifier__min_samples_leaf': [50, 100],
        'classifier__max_features': [0.1, 0.3, 0.5]
    }
}

In [None]:
# Prepare the data

In [36]:
X = train_df.drop(['Survived', 'Name', 'Ticket', 'SibSp', 'Parch'], axis=1)
y = train_df['Survived']
X_test = test_df.drop(['Name', 'Ticket', 'SibSp', 'Parch'], axis=1)

In [40]:
best_estimators = {}
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    param_grid = param_grids[name]
    grid_search = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(n_splits=5), n_jobs=-1)
    grid_search.fit(X, y)
    best_estimators[name] = grid_search.best_estimator_

In [41]:
print(f"Best params for {name}: {grid_search.best_params_}")
print(f"Best score for {name}: {grid_search.best_score_}")

Best params for GradientBoosting: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 12, 'classifier__max_features': 0.1, 'classifier__min_samples_leaf': 50, 'classifier__n_estimators': 400}
Best score for GradientBoosting: 0.8372544096415793


In [None]:
# Ensemble Voting Classifier

In [42]:
voting_clf = VotingClassifier(estimators=[
    ('RandomForest', best_estimators['RandomForest']),
    ('GradientBoosting', best_estimators['GradientBoosting'])
], voting='soft')

voting_clf.fit(X, y)
y_pred = voting_clf.predict(X_test)

In [None]:
# Create submission

In [43]:
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred
})

In [46]:
submission.to_csv('/kaggle/working/submission_final.csv', index=False)