In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(seed=42)

print("Setup complete!")

Setup complete!


## 1. Load Data

### **1.1 Overview**
* `PassengerId` is the unique id of the row and it doesn't have any effect on target
* `Survived` is the target variable we are trying to predict (**0** or **1**):
    - **1 = Survived**
    - **0 = Not Survived**
* `Pclass` (Passenger Class) is the socio-economic status of the passenger and it is a categorical ordinal feature which has **3** unique values (**1**, **2** or **3**):
    - **1 = Upper Class**
    - **2 = Middle Class**
    - **3 = Lower Class**
* `Name`, `Sex` and `Age` are self-explanatory
* `SibSp` is the total number of the passengers' siblings and spouse
* `Parch` is the total number of the passengers' parents and children
* `Ticket` is the ticket number of the passenger
* `Fare` is the passenger fare
* `Cabin` is the cabin number of the passenger
* `Embarked` is port of embarkation and it is a categorical feature which has **3** unique values (**C**, **Q** or **S**):
    - **C = Cherbourg**
    - **Q = Queenstown**
    - **S = Southampton**

In [42]:
# Load data from Titanic dataset

In [43]:
# Print shapes of the train and test dataframes

Get more information about dataset columns.

Analyse Pclass Feature.

In [44]:
# Get the number of people per class and percentage of how many people survived for each class

In [45]:
# Plot relationship between class and survival

Analyse Sex Feature.

In [46]:
# Get the number of people per gender and percentage of how many people survived for each gender

In [47]:
# Plot relationship between gender and survival

Analyse Age Feature.

In [48]:
# Plot the distribution of the Age column

In [49]:
# Plot the distribution of the Age column for each class

In [50]:
# Get the number of people per SibSp and percentage of how many people survived for each SibSp value

In [51]:
# Get the number of people per Parch and percentage of how many people survived for each Parch value

In [52]:
# Plot distribution of Fare column

In [53]:
# Plot distribution of Fare column for each class (survived vs not survived)

In [54]:
# Value count for Cabin column

In [55]:
# Get the number of people per embarked place and percentage of how many people survived for each embarked place

In [56]:
# Plot relationship between embarked place and survival

## 2. Preprocessing

In this step we select the features that we will use in our model. Besides that, we need to impute missing values and encode categorical features. Some additional feature engineering can also be done (merging columns, creating new features from existing ones, etc.).

In [57]:
# Drop columns PassengerId, Name and Ticket

In [58]:
# Show dataset info

In [59]:
# Get FamilySize by adding SibSp, Parch and 1 (for the person itself)

# Add the feature if the person is alone or not

# Drop SibSp and Parch columns

# Create CabinBool feature (1 if not null, 0 otherwise)

# Drop Cabin column

# Check the new columns

In [60]:
# Plot relationship between Alone column and survival

In [61]:
# Plot relationship between CabinBool column and survival

Convert categorical features to numerical features

In [62]:
# Convert categorical features to numerical features

# One-hot encode Embarked feature

# Check the new columns

In [63]:
# Check columns with null values

In [64]:
# Check columns with null values

In [65]:
# Create new column AgeMissing (1 if Age is null, 0 otherwise)

# Impute Age column with mean value

# Impute Fare column with mean value

In [66]:
# Normalize Age and Fare columns

# Check the new columns

In [67]:
# Calculate correlation matrix

# Plot heatmap of correlation matrix

In [68]:
# Define function to calculate MI scores

# Calculate MI scores

In [69]:
# Plot MI scores

## 3. Train Model

First we need to do the following:
1. Dividing the data into features and labels.
2. Splitting the data into training and testing sets. Look at [this](https://illustrated-machine-learning.github.io/#/machine-learning-engineering/introduction#training-and-holdout-sets) for more information.

In [70]:
# Split data into features and target

# Split data into train and validation sets

# Check shapes


Define the training function

In [71]:
# Define training method
def train_model(model, x, y, model_name="", conf="", verbose=True):
    """Fit ``model`` on ``(x, y)`` and measure its accuracy on that same data.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(x, y)`` and ``predict(x)``.
    x : array-like
        Training features.
    y : array-like
        Training labels.
    model_name : str, optional
        Accepted for interface compatibility; not used in the printed output.
    conf : str, optional
        Free-form description printed in the ``Model - ...`` header line.
    verbose : bool, optional
        When true, print the header and the training accuracy.

    Returns
    -------
    float
        Accuracy of the fitted model on the data it was trained on.
    """
    # fit the model (mutates ``model`` in place)
    model.fit(x, y)

    # make predictions on the training data itself
    y_pred = model.predict(x)

    # get accuracy of predictions
    accuracy = accuracy_score(y, y_pred)
    if verbose:
        print(f"Model - {conf}")
        print(f"Train Accuracy: {accuracy}")

    return accuracy

Define methods for test and visualization of confusion matrix.

In [72]:
# Visualize confusion matrix
def visualize_confusion_martix(y, y_pred, confusion_matrix):
    cm = pd.DataFrame(confusion_matrix, index=y.unique(), columns=y.unique())
    # plot using seaborn
    sns.heatmap(cm, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Define testing method
    def test_model(model, x, y, model_name="", conf="", verbose=True):
        # make predictions
    
        # get accuracy of predictions
        accuracy = None
        # get confusion matrix
        
        # get precision, recall and f1 scores
        precision = None
        recall = None
        f1 = None
        if verbose:
            print(f"Model - {conf}")
            print(f"Test Accuracy: {accuracy}")
            visualize_confusion_martix(y, y_pred, cm)
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 Score: {f1}")

        return accuracy, precision, recall, f1

Train Logistic Regression model.

In [73]:
# Create Logistic regression model

# Train Logistic regression model

# Test Logistic regression model


Train KNN model.

In [74]:
# Create KNN model

# Train KNN model

# Test KNN model

Train Decision Tree model.

In [75]:
# Create Decision Tree model

# Train Decision Tree model

# Test Decision Tree model


Train Random Forest model.

In [76]:
# Create Random Forest model

# Train Random Forest model

# Test Random Forest model

## 4. K-fold Cross Validation

In order to have more stable testing results we can use K-fold cross validation. Look at [this](https://illustrated-machine-learning.github.io/#/machine-learning/ml-sampling-resampling#resampling) for more information.

In [77]:
# Create K fold cross validation method
def k_fold_cross_validation(model, X, y, k_folds=10, verbose=True):
    """Estimate model quality with shuffled K-fold cross validation.

    For every fold the model is refit on the training part, then scored on
    both the training part (accuracy) and the held-out part (accuracy,
    precision, recall, F1).

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place for each fold, so it ends up fitted on the last fold's training
        part. This relies on ``fit`` discarding earlier state, as scikit-learn
        estimators do; confirm for custom models.
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : pandas.Series or array-like
        Target labels aligned row-wise with ``X``.
    k_folds : int, optional
        Number of folds.
    verbose : bool, optional
        When true, print the mean metrics and the per-fold test table.

    Returns
    -------
    tuple
        ``(mean_train_acc, mean_test_acc, mean_test_precision,
        mean_test_recall, mean_test_f1)``.
    """
    train_scores = []
    test_scores = []

    def _take(data, idx):
        # Positional row selection that works for pandas objects and arrays.
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    # Shuffle before splitting so folds do not depend on the row order;
    # the fixed random_state keeps the folds reproducible.
    splitter = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    for train_idx, test_idx in splitter.split(X):
        X_fold_train, y_fold_train = _take(X, train_idx), _take(y, train_idx)
        X_fold_test, y_fold_test = _take(X, test_idx), _take(y, test_idx)

        model.fit(X_fold_train, y_fold_train)
        train_scores.append(
            accuracy_score(y_fold_train, model.predict(X_fold_train))
        )

        y_fold_pred = model.predict(X_fold_test)
        test_scores.append((
            accuracy_score(y_fold_test, y_fold_pred),
            precision_score(y_fold_test, y_fold_pred),
            recall_score(y_fold_test, y_fold_pred),
            f1_score(y_fold_test, y_fold_pred),
        ))

    mean_train_acc = np.mean(train_scores)
    mean_test_acc = np.mean([score[0] for score in test_scores])
    mean_test_precision = np.mean([score[1] for score in test_scores])
    mean_test_recall = np.mean([score[2] for score in test_scores])
    mean_test_f1 = np.mean([score[3] for score in test_scores])
    if verbose:
        print(f"Mean Train Accuracy: {mean_train_acc}")
        test_scores_df = pd.DataFrame(test_scores, columns=["Accuracy", "Precision", "Recall", "F1 Score"])
        print(f"Mean Test Accuracy: {mean_test_acc}")
        print(f"Mean Test Precision: {mean_test_precision}")
        print(f"Mean Test Recall: {mean_test_recall}")
        print(f"Mean Test F1 Score: {mean_test_f1}")
        print(test_scores_df)

    return mean_train_acc, mean_test_acc, mean_test_precision, mean_test_recall, mean_test_f1


In [78]:
# Test K fold cross validation method on Random Forest model


## 5. Hyperparameter Tuning

Choose the best hyperparameters for the model. For information about hyperparameter tuning look at [this](https://illustrated-machine-learning.github.io/#/machine-learning/tuning).

In [79]:
# Fine-tune random forest model
# Candidate values to try for each hyperparameter
# (presumably RandomForestClassifier's max_depth, n_estimators and
# min_samples_split; confirm when the search loop is written).
max_depth = [5, 10, 30]
num_estimators = [50, 200, 500]
min_samples_split = [2, 5, 10]

# Best-so-far trackers, reset before the search starts
best_model = best_params = None
best_score = 0

# YOUR CODE HERE



In [80]:
# Report the tuning outcome: chosen parameters first, then their score
print(f"Best Model - {best_params}", f"Best Score - {best_score}", sep="\n")

Best Model - None
Best Score - 0


## 6. Inference

Run the best trained model on the test data.