# In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


# Load the data
titanic_data = pd.read_csv('./titanic_train.csv')
test_data = pd.read_csv('./titanic_test.csv')

# Imputation values are computed once, from the training set, and reused for
# the test set so that both frames are filled with the same values and no
# test-set statistics feed into preprocessing.
imputation_values = {
    'Age': titanic_data['Age'].mean(),
    'Embarked': titanic_data['Embarked'].mode()[0],
    'Fare': titanic_data['Fare'].mean(),
}


def _prepare_features(frame, fill_values):
    """Return a cleaned, one-hot-encoded copy of a raw passenger frame.

    Args:
        frame: DataFrame as read from the CSV. Must contain the 'SibSp',
            'Parch', 'Cabin', 'Name', 'Ticket', 'Sex' and 'Embarked' columns.
        fill_values: Mapping of column name -> value used to replace missing
            entries in that column.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # Feature Engineering: "FamilySize" = "SibSp" + "Parch" + 1
    frame = frame.assign(FamilySize=frame['SibSp'] + frame['Parch'] + 1)

    # Drop irrelevant columns
    frame = frame.drop(columns=['Cabin', 'Name', 'Ticket'])

    # Handle missing values. The result is assigned back rather than calling
    # `frame[col].fillna(..., inplace=True)`: that form is chained assignment,
    # which pandas warns about and which leaves the frame untouched when
    # Copy-on-Write is enabled.
    frame = frame.fillna(value=fill_values)

    # Convert categorical columns using one-hot encoding
    return pd.get_dummies(frame, columns=['Sex', 'Embarked'])


titanic_data = _prepare_features(titanic_data, imputation_values)
test_data = _prepare_features(test_data, imputation_values)

# Separate the target column from the model inputs; the passenger identifier
# is excluded from the inputs along with the target itself.
Y = titanic_data['Survived']
X = titanic_data.drop(['PassengerId', 'Survived'], axis='columns')

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

def _run_grid_search(estimator, param_grid, features, target):
    """Fit and return a 5-fold, accuracy-scored grid search.

    Args:
        estimator: Unfitted scikit-learn compatible classifier (or pipeline).
        param_grid: Parameter grid handed to ``GridSearchCV``.
        features: Training feature matrix.
        target: Training labels aligned with ``features``.

    Returns:
        The fitted ``GridSearchCV``; its ``best_estimator_`` holds the winning
        configuration.
    """
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    search.fit(features, target)
    return search


# Hyperparameter tuning for XGBoost using GridSearchCV
xgb_params = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [5, 7, 10]
}

xgb_model = XGBClassifier(random_state=2)
grid_search_xgb = _run_grid_search(xgb_model, xgb_params, X_train, Y_train)
best_xgb_model = grid_search_xgb.best_estimator_

# Hyperparameter tuning for Random Forest using GridSearchCV
rf_params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 7, 10],
    'min_samples_split': [2, 5, 10]
}

rf_model = RandomForestClassifier(random_state=2)
grid_search_rf = _run_grid_search(rf_model, rf_params, X_train, Y_train)
best_rf_model = grid_search_rf.best_estimator_

# Hyperparameter tuning for Gradient Boosting using GridSearchCV
gb_params = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [5, 7, 10]
}

gb_model = GradientBoostingClassifier(random_state=2)
grid_search_gb = _run_grid_search(gb_model, gb_params, X_train, Y_train)
best_gb_model = grid_search_gb.best_estimator_

# Hyperparameter tuning for Support Vector Machine using GridSearchCV.
# SVMs are not scale invariant, so the features are standardized before the
# classifier sees them. Keeping the scaler inside the pipeline means it is
# fitted on the training portion of each CV fold only.
# The 'svc__' prefix routes each parameter to the pipeline step named 'svc'.
svm_params = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [0.01, 0.1, 1],
    'svc__kernel': ['linear', 'rbf', 'poly']
}

svm_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(random_state=2)),
])
grid_search_svm = _run_grid_search(svm_model, svm_params, X_train, Y_train)
best_svm_model = grid_search_svm.best_estimator_

def _majority_vote(stacked_predictions):
    """Return the most frequent label in each column of ``stacked_predictions``.

    Args:
        stacked_predictions: 2-D array of non-negative integer labels with one
            row per model and one column per sample.

    Returns:
        1-D array with one label per sample. When several labels are tied for
        a sample, the smallest label wins (``argmax`` returns the first
        maximum of the ``bincount`` result).
    """
    return np.apply_along_axis(
        lambda votes: np.bincount(votes).argmax(),
        axis=0,
        arr=stacked_predictions,
    )


# Create an ensemble of models
# NOTE: with four voters a 2-2 split is possible; such ties resolve to class 0.
ensemble_models = [best_xgb_model, best_rf_model, best_gb_model, best_svm_model]

# Make predictions for the held-out split (X_test) using each model
test_predictions_ensemble = np.array([model.predict(X_test) for model in ensemble_models])
test_predictions_majority = _majority_vote(test_predictions_ensemble)

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(Y_test, test_predictions_majority)
print('Accuracy score of the ensemble model: ', ensemble_accuracy)

# Make predictions for the submission data using each model.
# The submission frame was one-hot encoded independently of the training
# frame, so it is aligned to the training columns explicitly: same columns,
# same order, and any column absent from the submission data is filled with 0.
# This also leaves out 'PassengerId', which is not a training column.
submission_features = test_data.reindex(columns=X.columns, fill_value=0)
test_predictions = np.array([model.predict(submission_features) for model in ensemble_models])
test_predictions_majority = _majority_vote(test_predictions)

# Create a submission dataframe
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions_majority})

# Save the submission to a CSV file
submission.to_csv('titanic_submission.csv', index=False)
