# Titanic - Machine Learning from Disaster

Predict survival on the Titanic  
Dataset : https://www.kaggle.com/competitions/titanic/data  
submission code : https://www.kaggle.com/code/rajatshah/scikit-learn-ml-from-start-to-finish/notebook

| Variable  | Definition                   | Key                              |
|-----------|------------------------------|----------------------------------|
| survival  | Survival                     | 0 = No, 1 = Yes                  |
| pclass    | Ticket class                 | 1 = 1st, 2 = 2nd, 3 = 3rd        |
| sex       | Sex                          |                                  |
| Age       | Age in years                 |                                  |
| sibsp     | # of siblings / spouses aboard the Titanic |                  |
| parch     | # of parents / children aboard the Titanic |                  |
| ticket    | Ticket number                |                                  |
| fare      | Passenger fare               |                                  |
| cabin     | Cabin number                 |                                  |
| embarked  | Port of Embarkation          | C = Cherbourg, Q = Queenstown, S = Southampton |

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Importing and Visualizing data

In [None]:
# Load the Titanic training and test splits from the local data folder.
data_train = pd.read_csv('data/titanic/train.csv')
data_test = pd.read_csv('data/titanic/test.csv')

# Peek at five randomly chosen training rows.
display(data_train.sample(n=5))

# Two side-by-side panels, each comparing survival between the sexes.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
embarked_ax, pclass_ax = axes

# Left panel: survival by port of embarkation.
sns.barplot(data=data_train, x="Embarked", y="Survived", hue="Sex", ax=embarked_ax)
embarked_ax.set_title('Survival Rate by Embarkation and Sex')

# Right panel: survival by ticket class, with a distinct marker and line
# style per sex.
sex_palette = {"male": "blue", "female": "pink"}
sns.pointplot(data=data_train, x="Pclass", y="Survived", hue="Sex",
              palette=sex_palette, markers=["*", "o"],
              linestyles=["-", "--"], ax=pclass_ax)
pclass_ax.set_title('Survival Rate by Class and Sex')

plt.tight_layout()
plt.show()

## Transforming Features
* Aside from 'Sex', the 'Age' feature is second in importance. To avoid overfitting, group people into logical human age groups.  
* Each Cabin value starts with a letter. Slice that letter off and keep it as the feature.  
* Fare is another continuous value that should be simplified. Run data_train.Fare.describe() to get the distribution of the feature, then place the fares into quartile bins accordingly.
* Extract information from the 'Name' feature. Rather than use the full name, extract the last name and name prefix (Mr., Mrs., etc.), then append them as their own features.
* Lastly, drop the features that are not used (Ticket, Name and Embarked).

In [None]:
# Summary statistics of the raw fares; used to pick the fare bin edges.
data_train['Fare'].describe()

In [None]:
def simplify_ages(df):
    """Replace the numeric 'Age' column with labelled age groups.

    Missing ages become 'Unknown'. Recorded ages are cut into right-closed
    intervals: (0, 5] Baby, (5, 12] Child, (12, 18] Teenager, (18, 25]
    Student, (25, 35] Young Adult, (35, 60] Adult, (60, 120] Senior.
    An age of exactly 0 is a recorded value and is counted as 'Baby'.
    Ages above 120 (or at or below -1) fall outside every bin and become NaN.

    Args:
        df: DataFrame with a numeric 'Age' column. It is modified in place.

    Returns:
        The same DataFrame, with 'Age' converted to a categorical column.
    """
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']

    # Remember genuine zeros before the sentinel is introduced: pd.cut
    # intervals are right-closed, so the (-1, 0] 'Unknown' bin would
    # otherwise swallow a real age of 0 together with the missing values.
    recorded_zero = df.Age == 0

    # -0.5 is a sentinel for "missing" that lands in the (-1, 0] bin.
    categories = pd.cut(df.Age.fillna(-0.5), bins, labels=group_names)
    categories[recorded_zero] = group_names[1]

    df['Age'] = categories
    return df

def simplify_cabins(df):
    """Reduce every 'Cabin' entry to its first character.

    Missing cabins are replaced by 'N' first, so they reduce to 'N'.

    Args:
        df: DataFrame with a 'Cabin' column of strings. Modified in place.

    Returns:
        The same DataFrame with the shortened 'Cabin' column.
    """
    filled = df.Cabin.fillna('N')
    df.Cabin = filled.map(lambda cabin: cabin[0])
    return df

def simplify_fares(df):
    """Replace the numeric 'Fare' column with labelled fare bands.

    Missing fares become 'Unknown'. Recorded fares are cut into right-closed
    intervals: (0, 8] 1_quartile, (8, 15] 2_quartile, (15, 31] 3_quartile,
    (31, 1000] 4_quartile. A fare of exactly 0 is a recorded value and is
    counted in '1_quartile'. Fares above 1000 (or at or below -1) fall
    outside every bin and become NaN.

    Args:
        df: DataFrame with a numeric 'Fare' column. It is modified in place.

    Returns:
        The same DataFrame, with 'Fare' converted to a categorical column.
    """
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']

    # Remember genuine zeros before the sentinel is introduced: pd.cut
    # intervals are right-closed, so the (-1, 0] 'Unknown' bin would
    # otherwise swallow a real fare of 0 together with the missing values.
    recorded_zero = df.Fare == 0

    # -0.5 is a sentinel for "missing" that lands in the (-1, 0] bin.
    categories = pd.cut(df.Fare.fillna(-0.5), bins, labels=group_names)
    categories[recorded_zero] = group_names[1]

    df['Fare'] = categories
    return df

def format_name(df):
    """Add 'Lname' and 'NamePrefix' columns parsed from the 'Name' column.

    Names are expected to look like 'Surname, Title. Given names'. 'Lname' is
    everything before the first comma, so multi-word surnames stay intact and
    no trailing comma is kept. 'NamePrefix' is the title that follows,
    including its period (e.g. 'Mr.').

    Names that do not follow that layout fall back to whitespace splitting:
    the first token is the last name and the second token, if there is one,
    is the prefix (otherwise the prefix is an empty string).

    Args:
        df: DataFrame with a string 'Name' column. It is modified in place.

    Returns:
        The same DataFrame with the two new columns added.
    """
    def _split_name(full_name):
        # Return (last name, prefix) for a single passenger name.
        surname, comma, remainder = full_name.partition(',')
        title, period, _ = remainder.partition('.')
        if comma and period:
            return surname.strip(), title.strip() + '.'
        # Unexpected layout: degrade gracefully instead of raising IndexError.
        tokens = full_name.split()
        last = tokens[0].rstrip(',') if tokens else ''
        prefix = tokens[1] if len(tokens) > 1 else ''
        return last, prefix

    parsed = [_split_name(name) for name in df.Name]
    df['Lname'] = [last for last, _ in parsed]
    df['NamePrefix'] = [prefix for _, prefix in parsed]
    return df
    
def drop_features(df):
    """Return a new DataFrame without the 'Ticket', 'Name' and 'Embarked' columns.

    Args:
        df: DataFrame containing all three columns; it is not modified.

    Returns:
        A DataFrame with the remaining columns.
    """
    unused_columns = ['Ticket', 'Name', 'Embarked']
    return df.drop(columns=unused_columns)

def transform_features(df):
    """Run the complete feature-simplification pipeline on one DataFrame.

    The steps run in a fixed order: age binning, cabin shortening, fare
    binning, name parsing, and finally dropping the unused columns (name
    parsing has to happen before 'Name' is dropped).

    Args:
        df: Raw passenger DataFrame.

    Returns:
        The transformed DataFrame returned by the last step.
    """
    pipeline = (
        simplify_ages,
        simplify_cabins,
        simplify_fares,
        format_name,
        drop_features,
    )
    for step in pipeline:
        df = step(df)
    return df

# Run the feature pipeline on both splits, then preview the training result.
data_train, data_test = (
    transform_features(data_train),
    transform_features(data_test),
)
data_train.head(5)

In [None]:
# One bar chart per simplified feature, each split by sex.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

panels = (
    ("Age", 'Survival Rate by Age and Sex'),
    ("Cabin", 'Survival Rate by Cabin and Sex'),
    ("Fare", 'Survival Rate by Fare and Sex'),
)
for panel_ax, (column, title) in zip(axes, panels):
    sns.barplot(data=data_train, x=column, y="Survived", hue="Sex", ax=panel_ax)
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

## Feature Encoding

In [None]:
from sklearn import preprocessing
def encode_features(df_train, df_test):
    """Label-encode the categorical columns of both splits consistently.

    For every encoded column a LabelEncoder is fitted on the training and
    test values stacked together, so both frames are transformed by the same
    encoder and a label that occurs in only one split is still known to it.

    Args:
        df_train: Training DataFrame; its encoded columns are overwritten.
        df_test: Test DataFrame; its encoded columns are overwritten.

    Returns:
        The tuple (df_train, df_test) with the columns encoded.
    """
    features = ['Fare', 'Cabin', 'Age', 'Sex', 'Lname', 'NamePrefix']
    stacked = pd.concat([df_train[features], df_test[features]])

    for column in features:
        encoder = preprocessing.LabelEncoder().fit(stacked[column])
        df_train[column] = encoder.transform(df_train[column])
        df_test[column] = encoder.transform(df_test[column])
    return df_train, df_test
    
# Encode both splits with shared label encoders and preview the result.
(data_train, data_test) = encode_features(df_train=data_train, df_test=data_test)
data_train.head(5)

## Dataset splitting
Split the labelled data into training and test sets in an 80:20 ratio.  

In [24]:
from sklearn.model_selection import train_test_split

# Features: every column except the target and the passenger identifier.
X_all = data_train.drop(columns=['Survived', 'PassengerId'])
y_all = data_train.Survived

# Hold out 20% of the labelled rows; random_state fixes the shuffle.
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, random_state=23, test_size=num_test)

## Random Forest Classifier with grid search 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. The fixed seed makes the forests, and
# therefore the selected parameter combination, repeatable between runs.
clf = RandomForestClassifier(random_state=23)

# Parameter grid: every combination of the values below is evaluated.
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
             }

# Parameter combinations are compared by classification accuracy.
acc_scorer = make_scorer(accuracy_score)

# Run the cross-validated grid search on the training split.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# GridSearchCV refits the best combination on all of X_train by default
# (refit=True), so best_estimator_ is already trained; fitting it a second
# time would only repeat that work.
clf = grid_obj.best_estimator_
clf

In [None]:
# Score the tuned model on the held-out part of the training data.
predictions = clf.predict(X_test)
holdout_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(holdout_accuracy)

## Validate model with KFold 

In [None]:
from sklearn.model_selection import KFold

def run_kfold(clf):
    """Print the 10-fold cross-validation accuracy of clf on X_all / y_all.

    The accuracy of every fold is printed, followed by the mean over all
    folds. Each fold trains an unfitted copy of clf with the same
    hyper-parameters, so the estimator passed in is left exactly as the
    caller fitted it instead of ending up as the last fold's model.

    Args:
        clf: scikit-learn classifier whose hyper-parameters are evaluated.
    """
    # Local import: keeps the new dependency inside this function.
    from sklearn.base import clone

    # The module-level feature matrix and target, as plain arrays so they can
    # be indexed with the positional indices KFold yields.
    features, target = X_all.values, y_all.values

    kf = KFold(n_splits=10)
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        fold_clf = clone(clf)
        fold_clf.fit(features[train_index], target[train_index])
        predictions = fold_clf.predict(features[test_index])
        accuracy = accuracy_score(target[test_index], predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))

# Cross-validate the tuned classifier on the full labelled data set.
run_kfold(clf=clf)

## Run predictions

In [None]:
# Predict survival for the passengers of the test file.
ids = data_test['PassengerId']
test_features = data_test.drop(columns='PassengerId')
predictions = clf.predict(test_features)

# Pair every passenger id with its predicted label and preview both ends.
output = pd.DataFrame({'PassengerId': ids, 'Survived': predictions})
for preview in (output.head(5), output.tail(5)):
    display(preview)

# Swap the 0/1 labels for readable names before charting.
survival_names = {0: 'No', 1: 'Yes'}
output['Survived'] = output['Survived'].map(survival_names)

# Pie chart of the predicted outcomes.
plt.figure(figsize=(5, 5))
survival_counts = output['Survived'].value_counts()
survival_counts.plot.pie(autopct='%1.1f%%', startangle=90,
                         colors=['lightblue', 'lightgreen'])
plt.title('Survival Distribution')
plt.ylabel('')
plt.show()