In [25]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [26]:
train = pd.read_csv('Data/titanic/train.csv')
test = pd.read_csv('Data/titanic/test.csv')

Feature engineering

In [27]:
def extract_title(name):
    """Return the honorific embedded in a passenger name.

    Names are expected to look like ``"Surname, Title. Given names"``. The
    title is the text after the first comma, cut at the first period, with
    surrounding whitespace removed.

    Parameters
    ----------
    name : str or missing value
        Raw passenger name.

    Returns
    -------
    str
        The extracted title, or ``'Unknown'`` when the name is missing,
        contains no comma, or yields an empty title.
    """
    if pd.isnull(name):
        return 'Unknown'
    parts = name.split(',')
    if len(parts) < 2:
        # No "Surname, ..." separator: indexing parts[1] would raise
        # IndexError and abort the whole .apply(), so fall back instead.
        return 'Unknown'
    title = parts[1].split('.')[0].strip()
    # An empty string would become its own meaningless category downstream.
    return title or 'Unknown'

In [28]:
# Derive a 'Title' column from the raw 'Name' column in both datasets.
for frame in (train, test):
    frame['Title'] = frame['Name'].apply(extract_title)

In [29]:
# Consolidate titles into a small set of categories.
# NOTE: extract_title() keeps everything between the comma and the period, so
# a name such as "Rothes, the Countess. of ..." produces the title
# 'the Countess' (article included). The bare 'Countess' entry never matches
# that value, so 'the Countess' is listed explicitly as well.
rare_titles = ['Lady', 'Countess', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings are folded into their common equivalents.
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# Apply the identical mapping to both frames so train and test end up with
# the same set of title categories.
for frame in (train, test):
    titles = frame['Title'].replace(title_synonyms)
    frame['Title'] = titles.where(~titles.isin(rare_titles), 'Rare')

In [30]:
# Two engineered features, added to both datasets:
#   FamilySize - SibSp + Parch + 1 (the extra 1 presumably counts the
#                passenger themself)
#   IsAlone    - 1 when FamilySize equals 1, otherwise 0
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (frame['FamilySize'] == 1) * 1

Data cleaning and encoding

In [31]:
# Columns excluded from the model inputs because they are expected to carry
# little predictive value. The target 'Survived' is additionally removed from
# the training features (the test frame is dropped without it).
drop_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_features = train.drop(columns=[*drop_columns, 'Survived'])
test_features = test.drop(columns=drop_columns)

In [32]:
# Columns that are label-encoded further down: the raw 'Sex' and 'Embarked'
# fields plus the engineered 'Title'.
categorical_features = [
    'Sex',
    'Embarked',
    'Title',
]

In [33]:
# Numeric columns whose missing values are filled with the training-set
# median in the next step.
numeric_features = [
    'Age',
    'Fare',
    'FamilySize',
    'SibSp',
    'Parch',
]

In [34]:
# Median imputation for the numeric columns. Every median is taken from the
# training data and applied to both datasets, so no statistic is derived from
# the test set.
train_medians = train_features[numeric_features].median()
for col, median_val in train_medians.items():
    train_features[col] = train_features[col].fillna(median_val)
    if col in test_features.columns:
        # Filling NaNs with the median leaves the median unchanged, so the
        # value computed before the fill is the one to reuse here.
        test_features[col] = test_features[col].fillna(median_val)

In [35]:
# 'Embarked' gaps are filled with the most frequent value observed in the
# training set; the same value is applied to the test set.
mode_val = train_features['Embarked'].mode()[0]
train_features['Embarked'] = train_features['Embarked'].fillna(mode_val)
if 'Embarked' in test_features.columns:
    test_features['Embarked'] = test_features['Embarked'].fillna(mode_val)


In [36]:
# Label encoding for categorical variables.
# Each column gets its own LabelEncoder, fitted on the training data only and
# kept in `label_encoders`, so the integer codes can later be mapped back to
# the original categories. (A single shared encoder is overwritten on every
# refit and only remembers the last column.)
label_encoders = {}
for col in categorical_features:
    encoder = LabelEncoder()
    encoder.fit(train_features[col])

    # transform() rejects categories that were not present during fit. Check
    # before modifying either frame so the error names the offending column
    # and values, and nothing is left half-encoded.
    unseen = set(test_features[col].unique()) - set(encoder.classes_)
    if unseen:
        raise ValueError(
            "Column {!r} has categories in test that are absent from train: {}".format(
                col, sorted(map(str, unseen))
            )
        )

    train_features[col] = encoder.transform(train_features[col])
    test_features[col] = encoder.transform(test_features[col])
    label_encoders[col] = encoder

Prepare training data

In [37]:
# Feature matrices and target vector handed to scikit-learn. X_train / X_test
# are plain references to the prepared feature frames (no copy is made).
X_train, X_test = train_features, test_features
y_train = train['Survived']

In [43]:
# Base estimator and the hyperparameter grid for the search below
# (2 x 3 x 2 = 12 combinations). random_state is fixed at 42.
rf = RandomForestClassifier(random_state=42)
param_grid = dict(
    n_estimators=[100, 400],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
)


In [44]:
# Search over param_grid, scoring every combination by 5-fold CV accuracy.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)
print("Best parameters found:", grid_search.best_params_)
best_rf = grid_search.best_estimator_

# Per-fold accuracy of the selected configuration (for our own reference).
# NOTE: these scores are computed on the same data that was used to pick the
# hyperparameters, so they are an optimistic estimate of performance on
# unseen passengers.
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5)
print("CV accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Best parameters found: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 400}
CV accuracy scores: [0.84357542 0.80898876 0.87078652 0.80898876 0.84269663]
Mean CV accuracy: 0.8350072186303434


In [45]:
# Predict the target for the test features with the best estimator found by
# the grid search.
predictions = best_rf.predict(X_test)

In [46]:
# Start from the test set's PassengerId column so the original row order is
# kept, then attach the predictions positionally as 'Survived'.
submission = test[["PassengerId"]].assign(Survived=predictions)

In [47]:
# Write the submission to disk; index=False keeps the DataFrame index out of
# the file so it contains only the PassengerId and Survived columns.
submission.to_csv(path_or_buf="submission4.csv", index=False)
print("Submission file 'submission4.csv' created successfully!")

Submission file 'submission4.csv' created successfully!
