## Initialize data preparation

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
train = pd.read_csv('train.csv')

Setting index to PassengerID for ease

In [3]:
df = train.set_index('PassengerId')
#Custom feature #1 - total family size
df['FamilySize'] = df['SibSp'] + df['Parch']
#Custom feature #2 - deck identifier
df['Deck'] = df['Cabin'].str.extract(r'([A-Z])').fillna('None')
#Custom feature #3 - encoded sex (pseudo, may not be required)
df['SexEncode'] = np.nan
df.loc[df['Sex'].str.contains('fe'), 'SexEncode'] = 2

Extracting title from name

In [4]:
title = df['Name'].str.extract(r',\s+([a-zA-Z]\w*\s*\w*.)')
#Custom feature #4 - title (can be combined and restricted to Mr, Mrs, Master, Miss as per [this](https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/) feature engineering guide)
df['Title'] = title
titles = df['Title'].unique()

Replacing missing ages with approximate values based on title (this one is applicable for most males, need to think of something similar for females)

In [5]:
df.loc[df['Title'].str.contains('Master') & df['Age'].isna(), 'Age'] = random.randint(1,12)
df.loc[df['Title'].str.contains('Mr') & df['Age'].isna(), 'Age'] = random.randint(13,70)
# df.loc[df['Title'].str.contains('Miss') & (df['FamilySize']==0), 'Age'] = random.randint(13,70)

## Initialize model

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

In [7]:
cols_to_use = ['Pclass', 'SexEncode', 'Age', 'FamilySize', 'Deck', 'Title', 'SibSp', 'Parch']
cols_to_encode = ['Deck', 'Title']

In [8]:
x_tr = df[cols_to_use]
y_tr = df['Survived']

In [9]:
encoder = ce.OneHotEncoder(cols=cols_to_encode)
df_encoded = encoder.fit_transform(x_tr, y_tr)

In [10]:
x_tr_n, x_val, y_tr_n, y_val = train_test_split(df_encoded, y_tr, test_size=0.25, random_state=42)

In [11]:
model = xgb.XGBClassifier()
model.set_params(n_estimators=100, max_depth=5)
model.fit(x_tr_n, y_tr_n)

In [12]:
df_score = model.score(x_tr_n, y_tr_n)

In [13]:
df_score

0.9251497005988024

## Preparing test data for running obtained model

In [14]:
test = pd.read_csv('test.csv')

Performing same feature engineering and encoding as done for train data

In [15]:
df_t = test.set_index('PassengerId')
#Custom feature #1 - total family size
df_t['FamilySize'] = df_t['SibSp'] + df_t['Parch']
#Custom feature #2 - deck identifier
df_t['Deck'] = df_t['Cabin'].str.extract(r'([A-Z])').fillna('None')
#Custom feature #3 - encoded sex (pseudo, may not be required)
df_t['SexEncode'] = np.nan
df_t.loc[df_t['Sex'].str.contains('fe'), 'SexEncode'] = 2

In [16]:
title = df_t['Name'].str.extract(r',\s+([a-zA-Z]\w*\s*\w*.)')
#Custom feature #4 - title (can be combined and restricted to Mr, Mrs, Master, Miss as per [this](https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/) feature engineering guide)
df_t['Title'] = title
titles = df_t['Title'].unique()

In [17]:
df_t.loc[df_t['Title'].str.contains('Master') & df_t['Age'].isna(), 'Age'] = random.randint(1,12)
df_t.loc[df_t['Title'].str.contains('Mr') & df_t['Age'].isna(), 'Age'] = random.randint(13,70)
# df_t.loc[df_t['Title'].str.contains('Miss') & (df_t['FamilySize']==0), 'Age'] = random.randint(13,70)

In [18]:
x_test = df_t[cols_to_use]

In [19]:
df_t_encoded = encoder.transform(x_test)

In [20]:
prediction = model.predict(df_t_encoded)

In [21]:
survived = prediction[prediction>0]

In [22]:
len(survived)

153

Preparing the output in the required format for submission

In [23]:
results_df = pd.DataFrame({
    'PassengerID': test['PassengerId'],
    'Survived': prediction
})

In [24]:
results_df.to_csv('results.csv', index=False)