# Titanic Model
<hr>

## Imports

#### Import `pandas` and `numpy`  for Data Manipulation

In [None]:
import pandas as pd
import numpy as np

<br>

#### Import `matplotlib` and `seaborn` for Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed, so use whichever spelling this install offers.
_muted_style = 'seaborn-v0_8-muted'
plt.style.use(_muted_style if _muted_style in plt.style.available else 'seaborn-muted')

<br>

#### Import `scikit-learn` functions to model Data

In [None]:
# To Split Dataset
from sklearn.model_selection import train_test_split


# Import Model
from sklearn.ensemble import GradientBoostingClassifier


# Model Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

<br>

#### Import `joblib` to save finished model

In [None]:
import joblib

<br>
<br>
<br>
<br>
<hr>

## Load Data

In [None]:
# Read the Titanic passenger table from a CSV file (path is relative to the working directory).
df = pd.read_csv('./data/titanic.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# df.iloc[:1].drop(['Survived'], axis=1)

<br>
<br>
<br>
<br>

# 1. Exploratory Data Analysis
<hr>

### ASIDE: matplotlib.pyplot

#### Built-in Docs: `?`

In [None]:
# np.random.randint?

#### Generating random data

In [None]:
# Draw 10 random integers from 0 up to (but not including) 10.
a = np.random.randint(low=0, high=10, size=10)
print(a)

#### Generating structured data

In [None]:
# Data
# x holds the integers 0..9; y holds the square of each x.
x = np.arange(10)
y = x**2

# print(x)
# print(y)

#### Generic plot

In [None]:
# Create the figure and a single set of axes in one step
fig, ax = plt.subplots()

# Draw y against x as a line on those axes
ax.plot(x, y)

# Render the figure
plt.show()

<br>

## Women and Children?

### `Gender` vs. `Survived`

In [None]:
# Place the table
fig = plt.figure(figsize=(8,6))

# Put sheet on table
ax = fig.gca()

# Draw plot on sheet
# Columns are named via keywords with `data=df`: recent seaborn releases treat
# the first positional argument as `data`, so a positional Series is not read as `x`.
sns.countplot(x='Gender', hue='Survived', data=df, ax=ax)

# Output the plot to the screen
plt.show()

<br>

### `Age` and `Gender` VS. `Survived`

In [None]:
# Split violins: Age distribution per Gender, one half per Survived value.
fig = plt.figure(figsize=(8,6))
ax = fig.gca()
# x/y are passed by keyword (positional use is rejected by recent seaborn) and
# the plot is bound explicitly to the axes created above.
sns.violinplot(x="Gender", y="Age", hue="Survived", data=df, split=True, ax=ax)

plt.show()

<br>
<br>
<br>
<br>

# 2. Feature Engineering and Preprocessing
<hr>

## Were You Travelling Alone?

#### `FamilySize`

In [None]:
# Sum of the SibSp and ParCh columns plus 1 (presumably counting the passenger themselves).
df['FamilySize'] = df['SibSp'] + df['ParCh'] + 1

#### How does `FamilySize` impact `Survived`

In [None]:
# Place the table
fig = plt.figure(figsize=(8,6))

# Put sheet on table
ax = fig.gca()

# Draw plot on sheet
# `catplot` is a figure-level function that builds its own figure and does not
# draw on a supplied `ax`; the axes-level `pointplot` draws the same point plot
# directly on `ax`, so no stray second figure has to be closed afterwards.
# `ci=None` (no error bars) is kept from the original; newer seaborn releases
# spell this `errorbar=None`.
sns.pointplot(x='FamilySize', y='Survived', data=df, ci=None, ax=ax)

# Output the plot to the screen
plt.show()

<br>

#### `IsAlone`

In [None]:
# Start with every passenger flagged as "not alone" (0); solo travellers are set to 1 in a later cell.
df['IsAlone'] = 0

In [None]:
# df.head()

In [None]:
# Set IsAlone to 1 on the rows whose FamilySize is exactly 1.
df.loc[df['FamilySize'] == 1, "IsAlone"] = 1

#### How does `IsAlone` impact `Survived`

In [None]:
# Place the table
fig = plt.figure(figsize=(8,6))

# Put sheet on table
ax = fig.gca()

# Draw plot on sheet
# Columns are named via keywords with `data=df`: recent seaborn releases treat
# the first positional argument as `data`, so a positional Series is not read as `x`.
sns.countplot(x='IsAlone', hue='Survived', data=df, ax=ax)

# Output the plot to the screen
plt.show()

<br>

## Drop All Columns except for `Survived`, `Age`, `Gender`, and `IsAlone`

In [None]:
# Discard the listed columns (axis=1 selects columns); the model inputs below are built from what remains.
df = df.drop(['PassengerId', 'Pclass', 'Name', 'SibSp',
       'ParCh', 'Ticket', 'Fare', 'Cabin', 'Title','FamilySize', 'Embarked'], axis=1)

<br>

## Label Encoding
Most machine learning models cannot interpret string values directly, so we must encode them into numerical values!

### Convert `Gender` into a binary column: `IsFemale`

In [None]:
# Encode Gender as 0 (male) / 1 (female) with an explicit mapping.
# `map` avoids the implicit object-to-int downcast that `replace` relied on
# (deprecated in recent pandas). Any value other than 'male'/'female' becomes
# NaN instead of being carried through as text.
df['IsFemale'] = df['Gender'].map({'male': 0, 'female': 1})

In [None]:
# Remove the text Gender column now that its numeric encoding (IsFemale) exists.
df = df.drop(['Gender'],axis=1)

In [None]:
# Preview the table after feature engineering and encoding.
df.head()

<br>
<br>
<br>
<br>

# 3. Create Model
<hr>

## A. Split Dataset into `train` and `test`
**We split the dataset into two sets:**
* `X_train` and `y_train`: Will be passed into the model to learn the patterns in the data
* `X_test` and `y_test`: Will be used to test the validity of the model's predictions.

<img src='./assets/train_test_split.webp' style="width:100px;height:100px">

In [None]:
# Model inputs: every remaining column except the target.
features = df.drop(['Survived'], axis=1)
# Target: the Survived column.
labels = df['Survived']

In [None]:
# Use 90% of the rows for training and hold out the rest for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size=0.90, random_state=100)

### Check Dimensions

In [None]:
# Report the dimensions of the training inputs and training targets.
print(f"X_train: {X_train.shape} y_train: {y_train.shape}")

In [None]:
# Report the dimensions of the held-out inputs and held-out targets.
print(f"X_test: {X_test.shape} y_test: {y_test.shape}")

<br>

## B. Train Model

The parameters passed in below are called `hyperparameters`. These were "discovered" through a process called cross-validation, which can be run with scikit-learn's `GridSearchCV` class.

In [None]:
# Gradient-boosted trees: 200 boosting stages with a small learning rate;
# max_features=None lets every split consider all features.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same fitted, exported model.
model = GradientBoostingClassifier(learning_rate=0.02, n_estimators=200, max_features=None, random_state=100)

In [None]:
# Fit on the underlying NumPy arrays (.values), i.e. without column names attached.
model.fit(X_train.values, y_train.values)

## C. Make Predictions on X_test

In [None]:
# The model was fitted on plain arrays (X_train.values), so predict on a plain
# array too. Passing the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
predictions = model.predict(X_test.values)

<br>
<br>
<br>
<br>

# 4. Evaluate Model Performance 

<hr>

### A. Check `accuracy` of `predictions` by comparing to  `y_test`

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_test, predictions)

### B. Check `confusion_matrix` of `predictions`

In [None]:
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated heatmap and return the figure.

    Parameters
    ----------
    confusion_matrix : array-like
        Square table of counts. Cells are annotated with the integer ``"d"``
        format, so the values must be integers. Note that inside this function
        the parameter name shadows any module-level ``confusion_matrix``.
    class_names : list of str
        Names used for both the row index and the column headers, in the same
        order as the rows/columns of ``confusion_matrix``.
    figsize : tuple, default (10, 7)
        Size passed to ``plt.figure``.
    fontsize : int, default 14
        Font size of the tick labels; the axis labels use 1.5x this size.

    Returns
    -------
    The matplotlib figure that holds the heatmap.
    """
    # Wrap the raw matrix so the heatmap picks up class names as tick labels
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    fig = plt.figure(figsize=figsize)
    ax = fig.gca()

    # Draw explicitly on the axes created above instead of relying on
    # pyplot's notion of the "current" axes
    heatmap = sns.heatmap(df_cm, annot=True, fmt="d", cmap='Blues', ax=ax)

    # Re-apply the tick labels with custom rotation, alignment and size
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)

    # Rows are the true classes, columns the predicted classes
    ax.set_ylabel('True label', fontsize=fontsize*1.5)
    ax.set_xlabel('Predicted label',fontsize=fontsize*1.5)

    return fig

In [None]:
cm = confusion_matrix(y_test,predictions)
labels = ["Perished","Survived"]

In [None]:
_ = print_confusion_matrix(confusion_matrix = cm, class_names=labels)

<br>
<br>
<br>
<br>

# 5. Export Model
<hr>

In [None]:
import os

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing the fitted model to disk.
os.makedirs('./models', exist_ok=True)
joblib.dump(model, './models/titanic_grad_boost.joblib') 