## Two ways to save and load machine learning models:
1. With Python's `pickle` module.
2. With the `joblib` module.

In [1]:
# Standard imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load data

# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. Guard the import so that a newer
# scikit-learn does not abort this cell before the heart-disease data (the
# only dataset the model-persistence cells below rely on) has been read.
try:
    from sklearn.datasets import load_boston
except ImportError:
    # Boston housing data is unavailable in this environment.
    boston = None
    boston_df = None
else:
    boston = load_boston()

    # Feature matrix as a DataFrame, with the regression target appended.
    boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
    boston_df['target'] = pd.Series(boston['target'])


# Heart-disease classification data; the path is relative to the user's home directory.
heart_disease = pd.read_csv('~/sample_project/Data/heart-disease.csv')

In [3]:
# Specific imports 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [8]:
# Data and models from previous notebook
#
# Rebuilds the fitted grid-search model (`gs_clf`) and the held-out test split
# (`X_test`, `y_test`) that the save/load cells below use.
# The statement order matters: the global NumPy seed is set, randomness is
# consumed by the shuffle, then the seed is set again before the split and fit.

# Seed the global NumPy RNG before shuffling.
np.random.seed(42)

# frac=1 returns every row, in random order.
heart_disease_shuffled = heart_disease.sample(frac=1)

# Hyperparameter grid: 3 * 1 * 2 * 1 * 2 = 12 candidate combinations.
# NOTE(review): 'auto' for `max_features` is reportedly deprecated in
# scikit-learn 1.1 and removed in 1.3 (for classifiers it behaved like 'sqrt')
# - confirm against the installed version before re-running.
grid_2 = {'n_estimators': [100, 200, 500],
          'max_depth': [None],
          'max_features': ['auto', 'sqrt'],
          'min_samples_split': [6],
          'min_samples_leaf': [1, 2]}


from sklearn.model_selection import GridSearchCV, train_test_split

# Re-seed the global NumPy RNG before the split and the fit. Neither call
# below passes `random_state`, so presumably both draw from this global state.
np.random.seed(42)

# Split into features (X: every column except 'target') and labels (y)

X = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

# Split into train and test sets (20% of the rows are held out for testing)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier

clf = RandomForestClassifier(n_jobs=1)

# Set up GridSearchCV: 5-fold cross-validation over the 12 candidates in
# grid_2 (60 fits in total); verbose=2 prints one line per fit.

gs_clf = GridSearchCV(estimator=clf, 
                            param_grid=grid_2,
                            cv=5, 
                            verbose=2)

# Fit the GridSearchCV version of clf on the training split only.
# The trailing semicolon suppresses the estimator repr in the cell output.

gs_clf.fit(X_train, y_train);

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=6, 

In [11]:
def evaluate_preds(y_true, y_preds):
    """
    Score a classification model's predictions against the true labels.

    Computes accuracy, precision, recall and F1 for `y_preds` versus `y_true`,
    prints each metric, and returns them in a dict rounded to two decimals
    (keys: "accuracy", "precision", "recall", "f1").
    """
    # Unrounded scores, computed once each and kept in reporting order.
    scores = {"accuracy": accuracy_score(y_true, y_preds),
              "precision": precision_score(y_true, y_preds),
              "recall": recall_score(y_true, y_preds),
              "f1": f1_score(y_true, y_preds)}

    # Accuracy is reported as a percentage; the other metrics as fractions.
    print(f"Accuracy: {scores['accuracy'] * 100:.2f}%")
    print(f"Precision: {scores['precision']:.2f}")
    print(f"recall: {scores['recall']:.2f}")
    print(f"F1 score: {scores['f1']:.2f}")

    return {name: round(value, 2) for name, value in scores.items()}

**Pickle**

In [4]:
import pickle

Save an existing model to file

In [9]:
# Serialize the fitted grid-search model to disk. The `with` block closes the
# file handle as soon as the dump finishes, so the bytes are flushed and the
# handle is not left open for the lifetime of the kernel.
with open("gs_random_random_forest_1.pkl", 'wb') as model_file:
    pickle.dump(gs_clf, model_file)

Load a saved model

In [10]:
# Read the pickled model back from disk; the `with` block guarantees the file
# handle is closed once the object has been deserialized.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a source you trust.
with open('gs_random_random_forest_1.pkl', 'rb') as model_file:
    loaded_pickle_model = pickle.load(model_file)

Make some predictions

In [12]:
# Predict on the held-out test set with the model restored from the pickle file.
pickle_y_preds = loaded_pickle_model.predict(X_test)

# Print the metrics and display the rounded metric dict as the cell output.
evaluate_preds(y_true=y_test, y_preds=pickle_y_preds)

Accuracy: 78.69%
Precision: 0.74
recall: 0.82
F1 score: 0.78


{'accuracy': 0.79, 'precision': 0.74, 'recall': 0.82, 'f1': 0.78}

**Joblib**

In [13]:
from joblib import dump, load

Save model to file

In [14]:
# Persist the fitted grid-search model with joblib; the call's return value
# (the list of written filenames) is displayed as the cell output.
dump(gs_clf, "gs_random_forest_model_1.joblib")

['gs_random_forest_model_1.joblib']

Import a saved joblib model

In [15]:
# Restore the model that was saved with joblib above.
loaded_job_model = load("gs_random_forest_model_1.joblib")

Make and evaluate joblib predictions

In [16]:
# Predict on the held-out test set with the model restored from the joblib file.
joblibs_y_preds = loaded_job_model.predict(X_test)

In [17]:
# Score the joblib-restored model's predictions against the test labels.
evaluate_preds(y_true=y_test, y_preds=joblibs_y_preds)

Accuracy: 78.69%
Precision: 0.74
recall: 0.82
F1 score: 0.78


{'accuracy': 0.79, 'precision': 0.74, 'recall': 0.82, 'f1': 0.78}