In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# models and algorithms
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Load the Dataset
# Read the training and test splits from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train/train.csv', 'data/test/test.csv')
)
# Keep both frames in one list so later steps can apply the same
# transformation to each of them in a single loop.
combine_df = [train_df, test_df]

# Step 2: Feature Engineering
# Extract titles from names
def extract_title(name):
    """Return the title embedded in a passenger name.

    The name is split on ``', '`` and the second piece is kept; the title
    is everything in that piece before its first ``'.'``. For example,
    ``'Braund, Mr. Owen Harris'`` yields ``'Mr'``.

    Raises:
        IndexError: if ``name`` contains no ``', '`` separator.
    """
    after_surname = name.split(', ')[1]
    title, _, _ = after_surname.partition('.')
    return title

# Derive the engineered columns on both the training and the test frame.
for dataset in combine_df:
    # Title parsed out of the Name column by extract_title.
    dataset['Title'] = dataset['Name'].apply(extract_title)
    # SibSp + Parch, plus one for the passenger themselves.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # IsAlone is 0 when FamilySize exceeds 1 and 1 otherwise.
    has_company = dataset['FamilySize'] > 1
    dataset['IsAlone'] = (~has_company).astype('int64')
    # CabinKnown is 1 when a Cabin value is present and 0 when it is missing.
    dataset['CabinKnown'] = dataset['Cabin'].notna().astype('int64')

# Fill missing Age values with the median Age of passengers sharing the same
# Title. The medians are computed from the training set only and then applied
# to both frames.
age_fill_values = train_df.groupby('Title')['Age'].median()
# Fallback for titles that never occur in the training set, or whose training
# ages are all missing (their group median is NaN). Without it, an unseen
# title would leave the Age unfilled.
overall_age_median = train_df['Age'].median()
for dataset in combine_df:
    # Vectorized lookup: one median per row, keyed by that row's Title.
    title_age = dataset['Title'].map(age_fill_values).fillna(overall_age_median)
    # Only rows whose Age is missing take the looked-up value.
    dataset['Age'] = dataset['Age'].fillna(title_age)

# Fill missing Embarked values with the most common port and missing Fare
# values with the median Fare. Both statistics come from the training set.
most_common_port = train_df['Embarked'].mode()[0]
train_fare_median = train_df['Fare'].median()
for column, fill_value in (('Embarked', most_common_port), ('Fare', train_fare_median)):
    for dataset in combine_df:
        dataset[column] = dataset[column].fillna(fill_value)

# Step 3: Encoding categorical features
# One-hot encode the categorical columns into a dense array
# (sparse_output=False); drop='first' omits one level per feature.
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_features = ['Sex', 'Embarked', 'Title']

# Fit encoder on combined data to handle any category differences
encoder.fit(pd.concat([train_df[categorical_features], test_df[categorical_features]], axis=0))

# The encoded frames are later joined column-wise to train_df / test_df, and
# pd.concat(axis=1) aligns on the index. Reusing each source frame's index
# keeps the rows paired even if that index is not the default 0..n-1 range.
encoded_columns = encoder.get_feature_names_out(categorical_features)
encoded_train = pd.DataFrame(encoder.transform(train_df[categorical_features]),
                             columns=encoded_columns,
                             index=train_df.index)
encoded_test = pd.DataFrame(encoder.transform(test_df[categorical_features]),
                            columns=encoded_columns,
                            index=test_df.index)

# Drop unused features and concatenate encoded features
# The raw categorical columns are removed because their one-hot encoded
# versions are appended in their place.
drop_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Embarked', 'Title']
train_df = pd.concat([train_df.drop(columns=drop_columns), encoded_train], axis=1)
test_df = pd.concat([test_df.drop(columns=drop_columns), encoded_test], axis=1)

# Step 4: Model Training and Hyperparameter Tuning
# Define features and labels
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']

# Define the models
# NOTE: logreg is defined here but is not one of the estimators passed to the
# voting ensemble below.
logreg = LogisticRegression(max_iter=200)
random_forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# probability=True makes SVC fit an internal, randomized calibration step;
# a fixed random_state keeps its probability estimates repeatable.
svc = SVC(probability=True, random_state=42)

# X_train mixes raw numeric columns (Age, Fare, ...) with 0/1 indicator
# columns, and SVMs are not scale invariant. Standardizing inside a pipeline
# means the scaler is re-fit on the training part of every CV fold, so no
# statistics leak from the held-out part.
svc_pipeline = Pipeline([('scaler', StandardScaler()), ('svc', svc)])

# Hyperparameter tuning for SVM
# Keys carry the 'svc__' prefix because the SVC is now a named pipeline step.
param_grid = {'svc__C': [0.1, 1, 10], 'svc__kernel': ['rbf', 'poly', 'linear']}
grid_search_svc = GridSearchCV(svc_pipeline, param_grid, cv=5)
grid_search_svc.fit(X_train, y_train)
# Best scaler+SVC pipeline; it exposes predict/predict_proba like a bare SVC.
svc_best = grid_search_svc.best_estimator_

# Ensemble Voting Classifier
# Soft voting averages the predicted class probabilities of the estimators.
voting_clf = VotingClassifier(estimators=[('rf', random_forest), ('svc', svc_best), ('xgb', xgb_clf)], voting='soft')

# Cross-validation
kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
scores = cross_val_score(voting_clf, X_train, y_train, cv=kfold)
print(f'Cross-Validation Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})')

# Train the voting classifier
voting_clf.fit(X_train, y_train)

# Step 5: Predictions
X_test = test_df
predictions = voting_clf.predict(X_test)

# Save predictions to submission file
# PassengerId was dropped from test_df during feature preparation, so the ids
# are read back from the test CSV instead of assuming they start at a fixed
# offset. No rows are added to or removed from test_df along the way, so the
# file order still matches the order of the predictions.
passenger_ids = pd.read_csv('data/test/test.csv', usecols=['PassengerId'])['PassengerId']
submission = pd.DataFrame({'PassengerId': passenger_ids.to_numpy(), 'Survived': predictions})
submission.to_csv('/mnt/data/submission.csv', index=False)

print("Model training and predictions complete. Submission file created.")
