In [None]:
# Algorithm: Mathematical technique or equation, a framework for a model
# Model: Equation that is formed by using data to find the parameters in the equation of an algorithm

# Machine Learning Model Steps
# 1. Explore and clean the dataset
# 2. Split the dataset into training, validation and testing datasets
# 3. Fit an initial model and evaluate it
# 4. Tune hyperparameters
# 5. Evaluate on the validation dataset
# 6. Select and evaluate the final model on the testing dataset

In [None]:
# Importing Packages
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

# Support Vector Classifier
from sklearn.svm import SVC

from time import time

In [None]:
# Render matplotlib figures inside the notebook's output cells
%matplotlib inline

# Silence FutureWarning messages for the rest of the session to keep cell output readable
# NOTE(review): this also hides notices about upcoming behaviour changes in the libraries used below
warnings.filterwarnings('ignore', category = FutureWarning)
# The equivalent DeprecationWarning filter is currently disabled
#warnings.filterwarnings('ignore', category = DeprecationWarning)

In [None]:
# Reading In The Data
# The path is relative to the directory the notebook is run from
titanic = pd.read_csv('../../Data/TitanicTrainingData.csv')
# Preview of the first rows as a quick check that the file loaded as expected
titanic.head()

In [None]:
# Cleaning Continuous Variables
# Missing ages are replaced with the mean age
# The filled column is assigned back instead of calling fillna(inplace = True) on the titanic['Age'] selection:
# that in-place form is chained assignment, which pandas reports with a FutureWarning (silenced in this notebook)
# and which stops updating the parent DataFrame once Copy-on-Write is enabled
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

# SibSp and Parch are summed into a single FamilyCount feature
titanic['FamilyCount'] = titanic['SibSp'] + titanic['Parch']

# PassengerId is removed together with the two columns now represented by FamilyCount
titanic.drop(['PassengerId', 'SibSp', 'Parch'], axis = 1, inplace = True)

In [None]:
# Cleaning Categorical Variables
# Lookup used to turn the text values of the Sex column into numbers
gender_number = {'male': 0, 'female': 1}

# CabinIndicator is 0 where the Cabin value is missing and 1 where it is present
cabin_is_missing = titanic['Cabin'].isna()
titanic['CabinIndicator'] = np.where(cabin_is_missing, 0, 1)

titanic['Sex'] = titanic['Sex'].map(gender_number)

# Columns that are not kept as model features are removed in place
unused_columns = ['Cabin', 'Embarked', 'Name', 'Ticket']
titanic.drop(columns = unused_columns, inplace = True)

In [None]:
# Splitting The Dataset
# x: Features
# y: Labels
labels = titanic['Survived']
features = titanic.drop(columns = 'Survived')

# First pass: 60% of the rows for training, the remaining 40% held back
x_train, x_holdout, y_train, y_holdout = train_test_split(
    features, labels, test_size = 0.4, random_state = 42
)

# Second pass: the held-back rows are halved into the validation and testing sets
x_validation, x_test, y_validation, y_test = train_test_split(
    x_holdout, y_holdout, test_size = 0.5, random_state = 42
)

In [None]:
# Checking that the Titanic dataset was appropriately split
# Prints each split's share of all labelled rows, rounded to two decimals
total_rows = len(labels)
for dataset in (y_train, y_validation, y_test):
    share = len(dataset) / total_rows
    print(round(share, 2))

In [None]:
# Every split is written to its own CSV file, without the index column
split_outputs = [
    (x_train, '../../Data/TitanicTrainingFeatures.csv'),
    (x_validation, '../../Data/TitanicValidationFeatures.csv'),
    (x_test, '../../Data/TitanicTestingFeatures.csv'),
    (y_train, '../../Data/TitanicTrainingLabels.csv'),
    (y_validation, '../../Data/TitanicValidationLabels.csv'),
    (y_test, '../../Data/TitanicTestingLabels.csv'),
]

for split, csv_path in split_outputs:
    split.to_csv(csv_path, index = False)

In [None]:
# Regression: A statistical process for estimating the relationships among variables, 
# often to make a prediction about some outcome

# Logistic Regression: A form of regression where the target variable is binary
# General Guidelines:
# Use When:
# 1. Binary target variable
# 2. Transparency is important or interested in significance of predictors
# 3. Fairly well behaved data
# 4. Need a quick initial benchmark
# Avoid When:
# 1. Continuous target variable
# 2. Massive data (rows or columns)
# 3. Unwieldy data
# 4. Performance is the only thing that matters

# It is unnecessary to tweak every possible hyperparameter, only those that are most important or have the 
# greatest impact upon the model

# C Hyperparameter: Regularization parameter in logistic regression that controls how closely the model fits
# to the training data
# C = 1 / lambda, where lambda is the regularization parameter
# lambda -> 0 = C -> Infinity, Low regularization, high complexity, more likely to overfit
# lambda -> Infinity = C -> 0, High regularization, low complexity, more likely to underfit

# Regularization: Technique used to reduce overfitting by discouraging overly complex models in some way
LogisticRegression()

In [None]:
# Methods available within LogisticRegression
# All sklearn models have the same APIs
dir(LogisticRegression)

In [None]:
# For easier to understand results
def print_results(results):
    """Print the best parameters of a fitted search, then one summary line per candidate.

    results: object exposing `best_params_` and a `cv_results_` mapping that holds the
    'mean_test_score', 'std_test_score' and 'params' entries.

    Each summary line shows the mean test score and twice its standard deviation,
    both rounded to three decimals, followed by the parameter combination.
    """
    print('Best Parameters: {}\n'.format(results.best_params_))

    cv_results = results.cv_results_
    rows = zip(cv_results['mean_test_score'], cv_results['std_test_score'], cv_results['params'])

    for avg_score, score_std, candidate in rows:
        line = '{} (+/-{}) for {}'.format(round(avg_score, 3), round(score_std * 2, 3), candidate)
        print(line)

In [None]:
# Dictionary of C values to test with the logistic regression model
lr_parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# The model is wrapped in a grid search that carries out k-fold cross validation,
# the 'cv' value being the number of folds the training data is split into
lr = LogisticRegression()
lr_cv = GridSearchCV(estimator = lr, param_grid = lr_parameters, cv = 5)

# Labels are flattened to a 1-D array before fitting
y_train_flat = y_train.values.ravel()
lr_cv.fit(x_train, y_train_flat)

print_results(lr_cv)

In [None]:
lr_cv.best_estimator_

In [None]:
# Writing Out A Pickled Model
joblib.dump(lr_cv.best_estimator_, '../../Models/TitanicLogisticRegressionModel.pkl')

In [None]:
# Support Vector Machine (SVM): 
# Classifier that finds an optimal hyperplane that maximizes the margin between two classes
# In 2D, the hyperplane is the line that separates squares and circles on a flat grid, whilst the 
# support vector / margin is a perpendicular line from the hyperplane which has the maximum possible distance
# between the two nearest objects of different classes, the 'longer' this line the better
# In 3D these become planes with the same goal

# Kernel Trick (Method):
# Transforms data that is not linearly separable in n-dimensional space to a higher dimension where it is 
# linearly separable, a straight line or flat hyperplane is the only tool allowed to be used, no circles
# or non-straight lines
# If you needed a circle to separate classes in 2D, in 3D a flat plane could be used

# General Guidelines:
# Use When:
# 1. Best for classification with binary target variable, though can be used on categorical and continuous outputs,
# not as useful for regression
# 2. Feature-to-row ratio is high, so many columns but relatively few rows, distinguishing features of SVM
# 3. Very complex relationships
# 4. Many outliers

# Avoid When:
# 1. Feature-to-row ratio is low, many rows but few columns, SVM is slow
# 2. Transparency is important or interested in significance of predictors
# 3. Looking for a quick benchmark model, SVM takes longer to train and output model results
SVC()

In [None]:
dir(SVC)

In [None]:
# C Hyperparameter:
# C -> Infinity, large penalty for misclassification of data, small margin, overfitting
# C -> 0, low penalty for misclassification of data, large margin, underfitting

# Kernels and C values compared by the grid search
svc_parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10]
}

# Model object, wrapped in a 5-fold cross-validated grid search
svc = SVC()
svc_cv = GridSearchCV(estimator = svc, param_grid = svc_parameters, cv = 5)

# Labels are flattened to a 1-D array before fitting
y_train_flat = y_train.values.ravel()
svc_cv.fit(x_train, y_train_flat)

# 'linear' kernel doing much better than rbf kernel, suggesting the data is linearly separable
# 'C' value does not seem to impact linear kernel utilizing models at all
print_results(svc_cv)

In [None]:
svc_cv.best_estimator_

In [None]:
joblib.dump(svc_cv.best_estimator_, '../../Models/TitanicSupportVectorMachineModel.pkl')

In [None]:
# Multilayer Perceptron:
# Classic feed-forward artificial neural network, the core component of deep learning
# A connected series of nodes in the form of an directed acyclic graph where each node represents 
# a function or a model 

# Directed Acyclic Graph: Directionality between the nodes and no node will ever be revisited

# Every node is connected to every other node of the layer directly next to it in one direction, neural network
# Input Layer: 4 Nodes, Age, Class, Cabin, Sex
# Hidden Layer: As many nodes as desired
# Output Layer: 2 Nodes, Survived, Perished

# General Guidelines:
# Use When:
# 1. Categorical or continuous target variables, great for classification and regression
# 2. Very complex relationships or performance is the only thing that matters
# 3. When control over the training process is very important, many hyperparameters available for tuning

# Avoid When:
# 1. Image recognition, time series, multi layer perceptrons is not the same as deep learning
# 2. Transparency is important or interested in significance of predictors
# Like blackboxes, these take in inputs and output predictions, but what happens in between is a mystery
# 3. Need a quick benchmark model, lots of hyperparameters to tune, slow to train (faster than SVM)
# 4. Limited data available, these models thrive when lots of data is available

# Important Hyperparameters
# Hidden Layer: Determines how many hidden layers there will be and how many nodes in each layer

# Activation Function: Dictates the type of nonlinearity that is introduced in the model
# Sigmoid Curve: 
# Hyperbolic Tangent Curve (TanH):
# Rectified Linear Unit (ReLU): Default

# Learning Rate: Facilitates how quickly and whether or not the algorithm will find the optimal solution
# Too large a learning rate and the model will never find the optimal value on the loss curve, faster
# Too low a learning rate and, since most loss curves have local maxima and minima, it might find a local and not
# an absolute minimum on the loss curve, missing the optimal value, longer training time
# MLP has an initial learning rate built in that signifies the starting point for the learning rate

MLPRegressor()

In [None]:
MLPClassifier()

In [None]:
# random_state is fixed so that the fitted networks, and therefore the model selected by the grid search,
# are repeatable across kernel restarts (same seed as the dataset split)
mlp = MLPClassifier(random_state = 42)
mlp_parameters = {
    # One tuple entry per hidden layer, each entry being the number of nodes in that layer
    # (10, ) is therefore a single hidden layer with 10 nodes
    'hidden_layer_sizes': [(10, ), (50, ), (100, )],

    # Type of nonlinearity applied by the hidden layer
    'activation': ['relu', 'tanh', 'logistic'],

    # How the learning rate changes throughout the optimization process
    # constant: Initial learning rate stays the same throughout
    # invscaling: Slowly decreases learning rate through each step, starting large and becoming smaller over time
    # adaptive: Learning rate is constant as long as the training loss keeps decreasing, otherwise it gets smaller
    # NOTE(review): scikit-learn documents this schedule as only used when solver = 'sgd'; no solver is set here,
    # so these three values may not change the fitted models - confirm before drawing conclusions from them
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

mlp_cv = GridSearchCV(mlp, mlp_parameters, cv = 5)

# 27 separate parameter combinations (3 x 3 x 3), each one fit on every one of the 5 folds
mlp_cv.fit(x_train, y_train.values.ravel())

# Convergence warnings on maximum iterations, which is how many times the MLPClassifier will allow to find
# an optimal model before it stops trying to optimize
print_results(mlp_cv)

In [None]:
mlp_cv.best_estimator_

In [None]:
joblib.dump(mlp_cv.best_estimator_, '../../Models/TitanicMultilayerPerceptronModel.pkl')

In [None]:
# Random Forest:
# Merges a collection of independent decision trees to get a more accurate and stable prediction
# Is essentially an ensemble method, each tree within the model is built independently and does not know what
# the other trees are doing
# Swiss Army Knife of models, uses a majority vote for prediction

# General Guidelines:
# Use When:
# 1. Categorical or continuous target variables, great for classification and regression
# 2. Interested in significance of predictors, lays out each of their importance
# 3. Need a quick benchmark model, flexible in terms of acceptable data, good results, relatively quick to train
# 4. Deals really well with messy data, such as outliers and missing values

# Avoid When:
# 1. Not the best model if trying to solve a complex, novel problem, 90% vs 100%
# 2. Transparency is important, hard to see details in a model with possible 100s of decision trees
# 3. Quick to train, not the fastest to make predictions
RandomForestClassifier()

In [None]:
RandomForestRegressor()

In [None]:
# 'n_estimators': How many individual decision trees will be built, width of the model
# 'max_depth': How many layers deep a decision tree can keep splitting into, controls the complexity of the model
# and how close it can fit to the training data, otherwise it can potentially split down to every individual record

# random_state is fixed so that the randomly built trees, and therefore the model selected by the grid search,
# are repeatable across kernel restarts (same seed as the dataset split)
rf = RandomForestClassifier(random_state = 42)
rf_parameters = {
    'n_estimators': [5, 50, 250],
    # None places no limit on the depth of a tree
    'max_depth': [2, 4, 8, 16, 32, None]
}

# 18 parameter combinations (3 x 6), each one scored with 5-fold cross validation
rf_cv = GridSearchCV(rf, rf_parameters, cv = 5)

rf_cv.fit(x_train, y_train.values.ravel())

print_results(rf_cv)

In [None]:
rf_cv.best_estimator_

In [None]:
joblib.dump(rf_cv.best_estimator_, '../../Models/TitanicRandomForestModel.pkl')

In [None]:
# Boosting: Gradient Boosted Trees is the current example, many more
# Ensemble method that aggregates a number of weak models to create one strong model
# A weak model is one that is only slightly better than random guessing
# A strong model is one that is strongly correlated with true classification
# Boosting effectively learns from its mistakes every iteration
# Each model learns from the mistakes of its predecessors, and they are not independent from one another
# Each time a small shallow model completes, its performance is evaluated, and those predictions it got wrong
# are resampled and overweighted for the next model to attempt and solve, assuming the first model has a good grasp
# on the points it has already correctly classified

# One of the most powerful and used in machine learning, should be considered for just about any problem

# Since training models are not independent, they are not parallelizable, but the predictions are however
# Slow to train, quick to predict, uses weighted voting for prediction 
# based on how each model performed in training

# General Guidelines:
# Use When:
# 1. Categorical or continuous target variables, great for classification and regression
# 2. Very flexible, handles many different data types, tends to perform better than Random Forest
# 3. Easy to find significance of individual predictors to find insights and relationships within data
# 4. Prediction time important, very quick to predict

# Avoid When:
# 1. Transparency is important, same issues as Random Forest with many different trees
# 2. Training time is important and compute power limited
# 3. Data is really noisy, as boosting is always trying to fix its own mistakes and tends to overfit by 
# trying to fit to noise and outliers

GradientBoostingClassifier()

In [None]:
GradientBoostingRegressor()

In [None]:
# 'learning_rate': The actual learning rate as a constant value throughout the model creation process, as
# opposed to the Multilayer Perceptron setting of the same name, which describes how the learning rate changes

# random_state is fixed so that the fitted trees, and therefore the model selected by the grid search,
# are repeatable across kernel restarts (same seed as the dataset split)
gb = GradientBoostingClassifier(random_state = 42)
gb_parameters = {
    # Uses more but more shallow decision trees than random forest
    # Due to how each model optimizes for the bias variance tradeoff
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}

# 100 parameter combinations (4 x 5 x 5), each one scored with 5-fold cross validation
gb_cv = GridSearchCV(gb, gb_parameters, cv = 5)

gb_cv.fit(x_train, y_train.values.ravel())

print_results(gb_cv)

In [None]:
gb_cv.best_estimator_

In [None]:
joblib.dump(gb_cv.best_estimator_, '../../Models/TitanicGradientBoostingModel.pkl')

In [None]:
# Model Considerations:
# Accuracy:
# Training Time: Logistic Regression
# Prediction Time: Gradient Boosting

# Latency:
# How are different sized data sets handled?
# How are complex feature relationships handled?
# How will they handle messy data?

# Reading in the models
# Each name below is also the key the loaded model is stored under
model_names = [
    'LogisticRegression',
    'SupportVectorMachine',
    'MultilayerPerceptron',
    'RandomForest',
    'GradientBoosting',
]
model_path = '../../Models/Titanic{}Model.pkl'

models = {}
for model in model_names:
    models[model] = joblib.load(model_path.format(model))

In [None]:
models

In [None]:
# Model Metrics:
# Accuracy = # predicted correctly / total # of examples
# Precision = # predicted as surviving that actually survived / total # predicted to survive
# Recall = # predicted as surviving that actually survived / total # that actually survived

def evaluate_model(name, model, features, labels):
    """Predict with a fitted model and print its accuracy, precision, recall and prediction latency.

    name: label printed at the start of the result line
    model: fitted estimator exposing a predict method
    features: feature rows passed to model.predict
    labels: true labels the predictions are scored against

    Nothing is returned; the scores (rounded to three decimals) and the latency
    in seconds (rounded to three decimals) are printed on a single line.
    """
    # Imported locally so the function does not depend on a change to the notebook's import cell
    from time import perf_counter

    # perf_counter is a monotonic, high-resolution clock, so short prediction times are
    # measured reliably, whereas the wall-clock time() can be coarse or adjusted mid-run
    start = perf_counter()
    predictions = model.predict(features)
    latency = perf_counter() - start

    accuracy = round(accuracy_score(labels, predictions), 3)
    precision = round(precision_score(labels, predictions), 3)
    recall = round(recall_score(labels, predictions), 3)

    print('{} - Accuracy: {} | Precision: {} | Recall: {} | Latency: {} \n'.format(name, accuracy, precision, 
                                                                                   recall, round(latency, 3)))

In [None]:
# Evaluating Models On Validation Dataset:

# No Free Lunch Theorem: No single algorithm works best for every problem

# x_validation: Features
# y_validation: Labels

# Typical tradeoff between precision vs. recall
# Model selection based on business use case, like filtering spam (precision) vs. fraud detection (recall)

# Will always get the same results on the same data, as we are utilizing stored, already fit, concrete models
# When training models results can be different, but not in this case
for name, model in models.items():
    evaluate_model(name, model, x_validation, y_validation)

In [None]:
# Evaluating The Best Model On The Testing Dataset:
evaluate_model('RandomForest', models['RandomForest'], x_test, y_test)