# Titanic Survivor Prediction

This notebook demonstrates encoding and scaling of features on the Titanic dataset from Stanford University.

## Imports

In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from sklearn import tree

## Download the Dataset

Run the following cell to download the CSV file containing the data. Note that in this example we're again downloading the data the "Pythonic" way, rather than using the Terminal command wget as in some of the earlier notebooks.

You can read more about the dataset on its official homepage from [Stanford University](https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/problem12.html).

In [None]:
data_url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"

# requests applies no timeout unless one is passed, so a stalled connection would
# leave this cell hanging forever. Give up after 30 seconds instead.
response = requests.get(data_url, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:

    # Save the raw bytes of the response to the working directory
    with open("titanic.csv", "wb") as file:
        file.write(response.content)
    print("File downloaded successfully.")

else:
    # Nothing is written in this case, so the load cell further down will not
    # find a freshly downloaded titanic.csv.
    print(f"Failed to download the file. Status code: {response.status_code}")


## Load the Dataset

In [None]:
# Read the CSV saved by the download cell into a DataFrame
df = pd.read_csv(filepath_or_buffer="titanic.csv")

## Exercise: Categorical Variable Encoding

Define which variables should be handled as categorical.

In [None]:
# Exercise: list the names (strings) of the df columns that should be one-hot encoded.
# While this list is empty, the encoding loop below creates no dummy columns.
categorical_features = []

In [None]:
# Names of every one-hot column created below; the feature-selection cell reads this list.
all_dummy_variables = []
for categorical_feature in categorical_features:

    # Perform one-hot encoding using pd.get_dummies: one indicator column per
    # distinct value, each named with the feature name as prefix
    dummy_variables = pd.get_dummies(df[categorical_feature], prefix=categorical_feature)
    all_dummy_variables.extend(dummy_variables.columns)

    # Append the new one-hot encoded variables to the original DataFrame.
    # Columns with the same names are dropped first, so running this cell a second
    # time replaces them instead of adding duplicate columns to df.
    df = pd.concat(
        [df.drop(columns=dummy_variables.columns, errors="ignore"), dummy_variables],
        axis=1,
    )


## Feature Selection

Define which features to use and what the target variable is.

In [None]:
# Columns taken from the dataset as-is, in addition to the one-hot columns
passthrough_columns = ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']

# Model inputs: one-hot columns first, then the pass-through columns
features = df[[*all_dummy_variables, *passthrough_columns]]

# Column the classifiers are trained to predict
target = df['Survived']

## Training & Test Split

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every score below) identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

## Model Training & Evaluation

In [None]:
# Decision Tree:
# - max_depth: maximum depth of decision nodes (default: None)
# - random_state: fixed seed so repeated runs build the same tree
decision_tree = DecisionTreeClassifier(max_depth=None, random_state=42)

# Random Forest
# - n_estimators: number of individual decision trees used internally by the model (default: 100)
# - random_state: fixed seed so repeated runs build the same forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Logistic Regression:
# - max_iter: maximum number of iterations (default: 100)
logistic_regression = LogisticRegression(max_iter=100)

# K-Nearest Neighbors
# - n_neighbors: number of neighbors to consider (default: 5)
# - weights: weighting of distance to neighbors: 'uniform' or 'distance' (default: 'uniform')
knn = KNeighborsClassifier()

# Support Vector Machine:
# - created with the library's default hyperparameters
support_vector_machine = SVC()

# Every model in this list is trained and scored by the evaluation loops
classifiers = [
    decision_tree,
    random_forest,
    logistic_regression,
    knn,
    support_vector_machine
]

# One record per classifier: name, training time, prediction time (seconds) and test score
model_metrics = []
for classifier in tqdm(classifiers):

    # Train the classifier.
    # time.perf_counter() is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() follows the wall clock, which can be coarse on
    # some platforms and can be adjusted while the cell is running.
    start_time = time.perf_counter()
    trained_model = classifier.fit(X_train, y_train)
    training_time_elapsed = time.perf_counter() - start_time

    # Apply trained classifier to test set
    start_time = time.perf_counter()
    predictions = trained_model.predict(X_test)
    prediction_time_elapsed = time.perf_counter() - start_time

    # Measure model performance on the held-out test set
    score = classifier.score(X_test, y_test)

    # Record model metrics
    model_metrics.append({
        "model": trained_model.__class__.__name__,
        "training_time": training_time_elapsed,
        "prediction_time": prediction_time_elapsed,
        "score": score,
    })

# Print model metrics table (the bare last expression is rendered as the cell output)
scores_df = pd.DataFrame(model_metrics)
scores_df

## Exercise: Feature Scaling

Determine which features should be scaled.

In [None]:
# Exercise: list the names of the columns that should be standardized
columns_to_scale = []

# Work on copies so the column assignments below never write into a view of
# `features` (avoids pandas' SettingWithCopyWarning on versions that emit it)
X_train = X_train.copy()
X_test = X_test.copy()

for column_to_scale in columns_to_scale:

    scaler = StandardScaler()

    # Learn the scaling statistics from the training data only...
    X_train[column_to_scale] = scaler.fit_transform(X_train[[column_to_scale]])

    # ...and apply those same statistics to the test data. Refitting on the test
    # set would scale the two sets differently and let test-set information
    # influence the preprocessing.
    X_test[column_to_scale] = scaler.transform(X_test[[column_to_scale]])

### Visualize Density of Scaled Values

In [None]:
# TODO: use .plot.density() to plot scaled values

## Model Training & Evaluation on Encoded + Scaled Data

In [None]:
# Re-run the evaluation with X_train / X_test as left by the scaling cell,
# collecting one {model, score} record per classifier
model_metrics = []
for classifier in tqdm(classifiers):

    # Fit on the training split, then predict the test split
    trained_model = classifier.fit(X_train, y_train)
    predictions = trained_model.predict(X_test)

    # Score on the test split
    score = classifier.score(X_test, y_test)

    # Keyed by class name so the table can be merged with the earlier results
    record = {
        "model": type(trained_model).__name__,
        "score (with scaling)": score,
    }
    model_metrics.append(record)

scaled_scores_df = pd.DataFrame(model_metrics)

## Score Comparison

In [None]:
# Relabel the baseline score column so it is distinguishable after the merge
scores_df = scores_df.rename(columns={'score': 'score (without scaling)'})

# Join both result tables on the classifier name; as the last expression of the
# cell, the merged table is displayed
scores_df.merge(scaled_scores_df, on='model')

## Decision Tree Interpretation

In [None]:
# Fit a shallow tree (max_depth=2) on the full dataset for plotting
decision_tree = DecisionTreeClassifier(max_depth=2)
decision_tree = decision_tree.fit(features, target)

# Label the nodes with exactly the columns the tree was fitted on. Slicing off the
# last column name would leave fewer names than features, so a split on the last
# feature could not be labelled.
tree.plot_tree(decision_tree, feature_names=list(features.columns), filled=True)
plt.show()