In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


In [2]:
# 1- Load the datasets
# Read the Kaggle Titanic training and test CSVs with pandas defaults,
# training file first.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [3]:
# 2- Data preprocessing
# Fill missing values by assigning the filled column back to its frame.
# Calling `fillna(..., inplace=True)` on a selected column is chained
# assignment: recent pandas 2.x releases warn about it, and with Copy-on-Write
# (the default from pandas 3.0) it leaves the frame unchanged, so the NaNs
# would silently survive.
# Each frame is imputed from its own statistics: median for Age and Fare,
# most frequent value for Embarked.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())


In [4]:
# Convert categorical variables to numerical
# Sex is recoded as male -> 0, female -> 1 in both frames. Any value that is
# not a key of the mapping becomes NaN.
sex_codes = {'male': 0, 'female': 1}
for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].map(sex_codes)


In [5]:
# 3- Feature engineering
# FamilySize is SibSp + Parch + 1 (the extra 1 presumably counts the
# passenger themselves).
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)
test_data['FamilySize'] = test_data['SibSp'].add(test_data['Parch']).add(1)

In [6]:
# New features
# IsAlone is an integer flag: 1 when FamilySize equals 1, otherwise 0.
train_data['IsAlone'] = train_data['FamilySize'].eq(1).astype(int)
test_data['IsAlone'] = test_data['FamilySize'].eq(1).astype(int)

In [7]:
# Bucketize Age
# pd.cut's default right-closed intervals give (0, 12], (12, 18], (18, 30],
# (30, 50] and (50, 80]. An Age outside (0, 80] receives no label (NaN).
bins = [0, 12, 18, 30, 50, 80]
labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
cut_options = {'bins': bins, 'labels': labels}
train_data['AgeGroup'] = pd.cut(train_data['Age'], **cut_options)
test_data['AgeGroup'] = pd.cut(test_data['Age'], **cut_options)

In [8]:
# Select features
# `features` lists the model inputs. get_dummies one-hot encodes the
# categorical AgeGroup column and passes the other columns through.
features = [
    'Pclass', 'Sex', 'AgeGroup',
    'SibSp', 'Parch', 'FamilySize', 'IsAlone',
]
y = train_data['Survived']
selected_train = train_data[features]
X = pd.get_dummies(selected_train)

In [9]:
# 4- Train-test split
# Hold out 20% of the rows for validation. The fixed random_state makes the
# shuffle repeatable across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [10]:
# 5- Model training using XGBoost with hyperparameter tuning
# Exhaustive grid search: 2 * 3 * 3 * 2 * 2 * 2 = 144 parameter combinations,
# each scored by 5-fold cross-validated accuracy on the training split.
xgb_model = XGBClassifier()
params = dict(
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 1],
)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

In [11]:
# Best parameters
# Snapshot of the hyperparameter combination the grid search selected.
best_params = dict(grid_search.best_params_)


In [12]:
#  Final model training
# Build a fresh classifier from the selected hyperparameters and fit it on the
# training split; `fit` returns the estimator itself, so the two steps chain.
final_model = XGBClassifier(**best_params).fit(X_train, y_train)
final_model


In [13]:
# 6- Model evaluation on the validation set
# X_val is a row subset of the already one-hot-encoded X, so it goes to the
# model as-is. Running get_dummies on it again was redundant; the model must
# see exactly the columns it was fitted on.
predictions = final_model.predict(X_val)
accuracy = accuracy_score(y_val, predictions)
print(f'Accuracy on validation set: {accuracy}')

Accuracy on validation set: 0.8156424581005587


In [14]:
# 7- Making predictions on the test set
# Encode the test features, then align them to the training design matrix.
# The test frame is encoded independently of the training frame, so reindexing
# on X.columns guarantees the same column set and order the model was fitted
# on. An indicator column absent from the test encoding is added as all zeros,
# and any column the model never saw is dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)
test_predictions = final_model.predict(X_test)

In [15]:
# 8- Creating submission file
# Two columns, PassengerId and the predicted Survived label, written to CSV
# without the index column.
submission = test_data[['PassengerId']].assign(Survived=test_predictions)
submission.to_csv('submission_xgb_advanced.csv', index=False)