# Titanic - Machine Learning from Disaster

<a href="#general_inspection">1. General inspection</a>

<a href="#visualizations">2. Visualizations</a>

<a href="#missing_data">3. Missing data</a>

<a href="#categorical_features">4. Categorical features</a>

<a href="#creating_features">5. Creating/modifying features</a>

<a href="#models">6. Models and predictions (Logistic Regression, Random Forest, Support Vector, XGBoost, Neural Network, Voting Classifier)</a>

<a href="#results">7. Submit results</a>

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
import tensorflow as tf
from keras.models import Sequential 
from keras import layers

In [None]:
# Render matplotlib figures directly in the notebook output
%matplotlib inline
# Apply seaborn's 'darkgrid' style to the plots drawn below
sns.set_style('darkgrid')

In [None]:
# Directory that holds the competition files (relative to the notebook)
data_path = '../input/titanic'

# List the directory contents to see which files are available
print(os.listdir(data_path))


- Train and Test sets.
- gender_submission.csv is an example of a submission file for the competition.

In [None]:
# Load both splits the same way, indexed by PassengerId
train, test = (
    pd.read_csv(os.path.join(data_path, csv_name), index_col='PassengerId')
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Keep untouched snapshots of both datasets before any preprocessing
train_original, test_original = train.copy(), test.copy()

## <a id="general_inspection">1. General inspection</a>

### Target

- **Survived** (*int*): whether the passenger survived or not --> 0 = No, 1 = Yes.

### Features

- **Pclass** (*int*): ticket class --> 1 = first class, 2 = second class, 3 = third class.

- **Name** (*str*).

- **Sex** (*str*): male or female.

- **Age** (*float*): age in years (fractional if less than 1; if the age is estimated, it is in the form of xx.5).

- **SibSp** (*int*): number of siblings and spouses that travelled with the passenger.

- **Parch** (*int*): number of children and parents that travelled with the passenger.

- **Ticket** (*str*): ticket number (**NOT USED**).

- **Fare** (*float*): passenger fare.

- **Cabin** (*str*): cabin number (**NOT USED**).

- **Embarked** (*str*): port of embarkation --> C = Cherbourg, Q = Queenstown, S = Southampton.

In [None]:
# Name of the label column; used below to separate features from the target
target = 'Survived'

In [None]:
# Remove the Ticket and Cabin columns, which are not used as features
# (Name is kept for now: the Title feature is derived from it later)
train = train.drop(columns=['Ticket', 'Cabin'])
test  = test.drop(columns=['Ticket', 'Cabin'])

In [None]:
# The test set has no target column, so its columns are exactly the features
features = test.columns.tolist()

# Report dataset sizes and the feature list
print(f'- Number of rows (train) = {len(train)}')
print(f'- Number of rows (test) = {len(test)}')
print(f'- Number of used features = {len(features)} --> {features}')

## <a id="visualizations">2. Visualizations</a>

In [None]:
def annotate_barplot_with_percentages(ax, fontsize=12, total=None):
    """Annotate every bar of a bar plot with its share of a total, as a percentage.

    Args:
        ax: Axes whose ``patches`` are the bars to annotate.
        fontsize: Font size of the annotation text.
        total: Denominator used to turn a bar height into a percentage.
            Defaults to ``len(train)`` (the global training set), which is
            what every call in this notebook relies on.
    """

    # Fall back to the training-set size so existing calls keep their behaviour
    if total is None:
        total = len(train)

    for p in ax.patches:
        height = p.get_height()
        percentage = '{:.1f}%'.format(100 * height / total)
        # Centre the label horizontally and place it at half the bar height
        x = p.get_x() + p.get_width() / 2
        y = height / 2
        ax.annotate(percentage, (x, y), ha='center', fontsize=fontsize, fontweight='bold')

### Simple plots

In [None]:
f, axes = plt.subplots(2, 3, figsize=(16, 10))

# Count plots: (column, title, panel). Only the target needs readable tick labels.
count_panels = [
    (target,     'Survival', axes[0, 0]),
    ('Pclass',   'Pclass',   axes[0, 1]),
    ('Sex',      'Sex',      axes[0, 2]),
    ('Embarked', 'Embarked', axes[1, 2]),
]
for column, title, panel in count_panels:
    count_ax = sns.countplot(x=column, data=train, palette='Blues', ax=panel)
    count_ax.set_title(title, fontsize=14, fontweight='bold')
    if column == target:
        count_ax.set_xticklabels(['No', 'Yes'])
    annotate_barplot_with_percentages(count_ax)

# Distribution plots for the continuous features, one bin per `bin_size` units
bin_size = 10
for column, panel in (('Age', axes[1, 0]), ('Fare', axes[1, 1])):
    dist_ax = sns.distplot(train[column], bins=int(train[column].max() / bin_size), ax=panel)
    dist_ax.set_title(f'{column} distribution', fontsize=14, fontweight='bold')

plt.show()

### Survival plots

In [None]:
f, axes = plt.subplots(1, 3, figsize=(16, 5))

# One panel per grouping feature: survival counts split by that feature
for panel, hue_column in zip(axes, ('Sex', 'Pclass', 'Embarked')):
    hue_ax = sns.countplot(x=target, data=train, hue=hue_column, palette='Blues', ax=panel)
    hue_ax.set_title(f'Survival by {hue_column}', fontsize=14, fontweight='bold')
    hue_ax.set_xticklabels(['No', 'Yes'])
    annotate_barplot_with_percentages(hue_ax, fontsize=10)

plt.show()

## <a id="missing_data">3. Missing data</a>

In [None]:
# Report, for each dataset, the columns that contain NaNs and how many
for header, dataset in (('===== TRAIN =====', train), ('\n===== TEST =====', test)):
    print(header)
    nan_counts = dataset.isna().sum()
    for feature_with_nans, number_of_nans in nan_counts[nan_counts > 0].items():
        print(f'- {feature_with_nans} --> {number_of_nans} ({round(100 * number_of_nans / len(dataset), 2)} %)')

In [None]:
# Impute Age with the median learned from the TRAINING set only, and reuse
# that same value for the test set so both splits are preprocessed identically.
# .ravel() flattens the (n, 1) imputer output into a plain column.
simple_imputer = SimpleImputer(strategy='median')
train['Age']   = simple_imputer.fit_transform(train[['Age']]).ravel()
test['Age']    = simple_imputer.transform(test[['Age']]).ravel()

# Impute Fare with the mean fare of the training set
# (only the test set is imputed here, as in the original cell)
simple_imputer = SimpleImputer(strategy='mean')
simple_imputer.fit(train[['Fare']])
test['Fare']   = simple_imputer.transform(test[['Fare']]).ravel()

# Impute Embarked with the most frequent port of the training set
simple_imputer    = SimpleImputer(strategy='most_frequent')
train['Embarked'] = simple_imputer.fit_transform(train[['Embarked']]).ravel()

In [None]:
# Sanity check: neither dataset may contain a NaN after imputation
if train.isna().any().any() or test.isna().any().any():
    print('Ooops! There are still some missing values.')
else:
    print('Great! Now there are no missing values.')

## <a id="categorical_features">4. Categorical features</a>

In [None]:
# Encode Sex and Embarked features with one hot encoder.
# The encoder is fitted on the training set only and then applied to the test
# set, so both splits share the same category -> column mapping and the
# hard-coded column names below mean the same thing in both frames.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Sex -> Female / Male indicator columns
train_encoded_df = pd.DataFrame(one_hot_encoder.fit_transform(train[['Sex']]).toarray(), columns=['Female', 'Male'], index=train.index)
test_encoded_df  = pd.DataFrame(one_hot_encoder.transform(test[['Sex']]).toarray(), columns=['Female', 'Male'], index=test.index)

train = train.join(train_encoded_df)
test  = test.join(test_encoded_df)

# Embarked -> C / Q / S indicator columns
train_encoded_df = pd.DataFrame(one_hot_encoder.fit_transform(train[['Embarked']]).toarray(), columns=['C', 'Q', 'S'], index=train.index)
test_encoded_df  = pd.DataFrame(one_hot_encoder.transform(test[['Embarked']]).toarray(), columns=['C', 'Q', 'S'], index=test.index)

train = train.join(train_encoded_df)
test  = test.join(test_encoded_df)

# Drop the raw columns; 'Male' is dropped too because it is the complement of 'Female'
train = train.drop(['Sex', 'Male', 'Embarked'], axis=1)
test  = test.drop(['Sex', 'Male', 'Embarked'], axis=1)

## <a id="creating_features">5. Creating/modifying features</a>

### Name feature

In [None]:
# Lookup table that collapses each raw honorific into one of five groups
# (Officer, Royalty, Mrs, Mr, Master)
normalized_titles = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Mrs",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Mrs",
    "Master": "Master",
    "Lady": "Royalty",
}

# Build the Title feature: take the text between the first comma and the
# following period of Name, then replace it with its normalized group
for dataset in (train, test):
    raw_titles = dataset['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
    dataset['Title'] = raw_titles.map(normalized_titles)

In [None]:
# Encode Title feature with one hot encoder.
# Fit on the training set only and apply the same mapping to the test set, so
# that Title_k refers to the same title group in both frames.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# One generic column name per normalized title group
columns = ['Title_1', 'Title_2', 'Title_3', 'Title_4', 'Title_5']

train_encoded_df = pd.DataFrame(one_hot_encoder.fit_transform(train[['Title']]).toarray(), columns=columns, index=train.index)
test_encoded_df  = pd.DataFrame(one_hot_encoder.transform(test[['Title']]).toarray(), columns=columns, index=test.index)

train = train.join(train_encoded_df)
test  = test.join(test_encoded_df)

# Name and the raw Title are no longer needed once the indicators exist
train = train.drop(['Name', 'Title'], axis=1)
test  = test.drop(['Name', 'Title'], axis=1)

### SibSp and Parch features

In [None]:
# Create the Alone feature from SibSp and Parch: 1.0 when the passenger
# travelled without any relative, 0.0 otherwise.
# This is the column the previous WithFamily -> one-hot -> drop('Family')
# round trip ended up with, computed directly so that no encoder has to be
# re-fitted on the test set. It is kept as float to match the other
# one-hot indicator columns.
train['Alone'] = (train['SibSp'] + train['Parch'] == 0).astype(float)
test['Alone']  = (test['SibSp'] + test['Parch'] == 0).astype(float)

# The raw counts are not used as features once Alone exists
train = train.drop(['SibSp', 'Parch'], axis=1)
test  = test.drop(['SibSp', 'Parch'], axis=1)

# Set target column as the last column
train = pd.concat([train.loc[:, train.columns != target], train[target]], axis=1)

### Age feature

In [None]:
def categorize_age(age):
    """Map an age in years to an ordinal age-band code.

    Bands are half-open: <16 -> 1, [16, 30) -> 2, [30, 50) -> 3,
    [50, 80) -> 4 and everything else -> 5.
    """

    upper_bounds = (16, 30, 50, 80)

    # The first band whose upper bound exceeds the age wins
    for category, upper_bound in enumerate(upper_bounds, start=1):
        if age < upper_bound:
            return category

    # Not below any bound: last category
    return len(upper_bounds) + 1

In [None]:
# Replace the numeric Age with its age-band code (<16 -> 1, [16, 30) -> 2, ...)
train['Age'] = train['Age'].map(categorize_age)
test['Age']  = test['Age'].map(categorize_age)

### Fare feature

In [None]:
def categorize_fare(fare):
    """Map a fare to an ordinal fare-band code.

    Bands are half-open: <10 -> 1, [10, 40) -> 2, [40, 100) -> 3 and
    everything else -> 4.
    """

    upper_bounds = (10, 40, 100)

    # The first band whose upper bound exceeds the fare wins
    for category, upper_bound in enumerate(upper_bounds, start=1):
        if fare < upper_bound:
            return category

    # Not below any bound: last category
    return len(upper_bounds) + 1

In [None]:
# Replace the numeric Fare with its fare-band code
train['Fare'] = train['Fare'].map(categorize_fare)
test['Fare']  = test['Fare'].map(categorize_fare)

## <a id="models">6. Models and predictions</a>

In [None]:
# Full training data (features / label), used for the final fit
X_complete_train = train.loc[:, train.columns != target]
y_complete_train = train[target]

# Divide train dataset into train and validation.
# random_state is fixed so the split, and therefore every accuracy reported
# below, is the same on each Restart & Run All.
X_train, X_valid, y_train, y_valid = train_test_split(X_complete_train,
                                                      y_complete_train,
                                                      test_size=0.33,
                                                      random_state=42)
# The test set has no target column, so it is used as-is
X_test = test

### Logistic Regression Classifier

In [None]:
# Baseline: logistic regression with default hyper-parameters
lr_classifier = LogisticRegression()

# Fit on the training split only; the validation split is held out for scoring
lr_classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict both splits, then score each against its labels
train_pred, valid_pred = (lr_classifier.predict(split) for split in (X_train, X_valid))

train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_accuracy = accuracy_score(y_true=y_valid, y_pred=valid_pred)

print(f'Accuracy with Logistic Regression Classifier (train) = {train_accuracy}')
print(f'Accuracy with Logistic Regression Classifier (valid) = {valid_accuracy}')

### Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters; random_state is fixed so the
# fitted forest (and its accuracy) is reproducible between runs
rf_classifier = RandomForestClassifier(random_state=42)

# Fit on the training split only
rf_classifier.fit(X_train, y_train)

In [None]:
# Predict both splits, then score each against its labels
train_pred, valid_pred = (rf_classifier.predict(split) for split in (X_train, X_valid))

train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_accuracy = accuracy_score(y_true=y_valid, y_pred=valid_pred)

print(f'Accuracy with Random Forest Classifier (train) = {train_accuracy}')
print(f'Accuracy with Random Forest Classifier (valid) = {valid_accuracy}')

### SV Classifier

In [None]:
# Support vector classifier. probability=True enables predict_proba, which the
# soft-voting ensemble below needs; random_state is fixed so the probability
# estimates are reproducible between runs
sv_classifier = SVC(probability=True, random_state=42)

# Fit on the training split only
sv_classifier.fit(X_train, y_train)

In [None]:
# Predict both splits, then score each against its labels
train_pred, valid_pred = (sv_classifier.predict(split) for split in (X_train, X_valid))

train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_accuracy = accuracy_score(y_true=y_valid, y_pred=valid_pred)

print(f'Accuracy with Supported Vector Classifier (train) = {train_accuracy}')
print(f'Accuracy with Supported Vector Classifier (valid) = {valid_accuracy}')

### XGBoost Classifier

In [None]:
# Gradient-boosted trees with default hyper-parameters
xgb_classifier = XGBClassifier()

# Fit on the training split only
xgb_classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict both splits, then score each against its labels
train_pred, valid_pred = (xgb_classifier.predict(split) for split in (X_train, X_valid))

train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_accuracy = accuracy_score(y_true=y_valid, y_pred=valid_pred)

print(f'Accuracy with XGBoost Classifier (train) = {train_accuracy}')
print(f'Accuracy with XGBoost Classifier (valid) = {valid_accuracy}')

### Neural Network Classifier

In [None]:
def build_nn(input_dim=13):
    """Build and compile the neural network classifier.

    The network has three hidden Dense layers of 9 ReLU units each and a
    single sigmoid output unit, and is compiled with the Adam optimizer and
    binary cross-entropy loss.

    Args:
        input_dim: Number of input features. Defaults to 13, the value that
            was previously hard-coded, so calling ``build_nn()`` without
            arguments is unchanged.

    Returns:
        The compiled (untrained) Sequential model.
    """

    nn_classifier = Sequential([
        # Only the first layer needs to know the input width
        layers.Dense(units=9, activation='relu', kernel_initializer='uniform', input_shape=[input_dim]),
        layers.Dense(units=9, activation='relu', kernel_initializer='uniform'),
        layers.Dense(units=9, activation='relu', kernel_initializer='uniform'),
        # Single sigmoid unit: the output is a score in (0, 1)
        layers.Dense(units=1, activation='sigmoid', kernel_initializer='uniform')
    ])

    nn_classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return nn_classifier

In [None]:
# Build a fresh network and train it silently on the training split
nn_classifier = build_nn()
nn_classifier.fit(x=X_train, y=y_train, epochs=200, batch_size=32, verbose=False)

In [None]:
# The network outputs one sigmoid score per row
train_scores, valid_scores = (nn_classifier.predict(split) for split in (X_train, X_valid))

# Threshold the scores at 0.5 and flatten them into one 0/1 label per row
train_pred = (train_scores > 0.5).astype(int).reshape(len(X_train))
valid_pred = (valid_scores > 0.5).astype(int).reshape(len(X_valid))

train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_accuracy = accuracy_score(y_true=y_valid, y_pred=valid_pred)

print(f'Accuracy with Neural Network (train) = {train_accuracy}')
print(f'Accuracy with Neural Network (valid) = {valid_accuracy}')

In [None]:
# Wrap build_nn in a scikit-learn style estimator so the network can be used
# as one of the VotingClassifier estimators below.
# NOTE(review): tf.keras.wrappers.scikit_learn may not exist in recent
# TensorFlow releases — confirm against the installed version.
keras_nn_classifier = tf.keras.wrappers.scikit_learn.KerasClassifier(build_nn, epochs=200, verbose=False)
# Mark the wrapper explicitly as a classifier; presumably required for
# VotingClassifier to accept it — TODO confirm.
keras_nn_classifier._estimator_type = "classifier"

### Generate a voting model with the previous classifiers (lr, rf, svc, xgb, nn)

We create a voting model with all of the previous classifiers.
In the voting process, the predicted probabilities of each model are taken into account (voting='soft').

In [None]:
# Ensemble of all previous models. With voting='soft' the predicted class
# probabilities of the estimators are combined instead of their hard labels.
voting_classifier = VotingClassifier(
    estimators=[
        ('lr', lr_classifier),
        ('rf', rf_classifier),
        ('svc', sv_classifier),
        ('xgb', xgb_classifier),
        ('nn', keras_nn_classifier),
    ],
    voting='soft',
)

In [None]:
# Fit the ensemble on the training split; the validation split is scored in the next cell
voting_classifier.fit(X_train, y_train)

In [None]:
# Predict both splits, then score each against its labels
train_pred, valid_pred = (voting_classifier.predict(split) for split in (X_train, X_valid))

train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
valid_accuracy = accuracy_score(y_true=y_valid, y_pred=valid_pred)

print(f'Accuracy with Voting Classifier (train) = {train_accuracy}')
print(f'Accuracy with Voting Classifier (valid) = {valid_accuracy}')

In [None]:
# Train with all the available data (train + validation rows) before
# predicting on the test set
voting_classifier.fit(X_complete_train, y_complete_train)

In [None]:
# Generate the predictions for the test dataset with the ensemble that was
# re-fitted on the complete training data
test_pred = voting_classifier.predict(X_test)

## <a id="results">7. Submit results</a>

In [None]:
# Build the submission table: one row per test passenger, with the
# passenger id (the index of X_test) and the predicted label
output_df = pd.DataFrame(
    data={'PassengerId': X_test.index, 'Survived': test_pred},
)

# Write it without the DataFrame's own row index
output_df.to_csv(path_or_buf='rfc_50.csv', index=False)