# Demonstrating How to Save and Load Models

## Load Necessary Libraries

In [1]:
import psycopg2
import sys  
sys.path.append('..')

import numpy as np
import pandas as pd

from config import REDSHIFT_CONFIG
from src.features import *
from src.utils import *
from src.validation import *

# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# reload(sys) is required there because sys.setdefaultencoding is removed from
# the sys module during interpreter start-up. On Python 3 neither the reload
# builtin nor sys.setdefaultencoding exists (the default encoding is already
# UTF-8), so the hack is skipped instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

from src.exploration import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Connect to Redshift

In [2]:
# Establish a connection to the redshift database using the REDSHIFT_CONFIG
# settings imported from config. create_rs_conn is not defined in this notebook;
# it presumably comes from one of the star-imported src modules (TODO confirm).
conn = create_rs_conn(config=REDSHIFT_CONFIG)
# Cursor that is passed to the data-loading helper in a later cell.
# NOTE(review): neither conn nor cur is closed in the visible cells.
cur = conn.cursor()

## Get Data From Verlander and Build a Random Forest

In [3]:
# Pitcher id used to pull the Verlander data that the rest of the notebook models.
VERLANDER_PITCHER_ID = 434378
verlander_df = get_pitcher_df_for_modeling(cur, pitcher_id=VERLANDER_PITCHER_ID)

In [4]:
# Split the pitcher data into the pieces used by later cells; the result is
# indexed by (at least) 'train_data', 'test_data', 'train_targets' and 'test_targets'.
# NOTE(review): '2012-06-01' is presumably the date cutoff separating train from
# test rows -- confirm against split_test_train.
modeling_data = split_test_train(verlander_df, '2012-06-01')

In [5]:
# Feature columns kept for modeling. Order is preserved exactly because it
# determines the column order of the frames handed to the classifier.
# Group labels below are inferred from the column names -- confirm against the
# feature-building code.
col_subset = [
    # Game situation (presumably balls, strikes, runners on base, outs)
    u'b', u's', u'on_1b', u'on_2b', u'on_3b', u'o',
    # Team records
    u'home_wins', u'home_loss', u'away_wins', u'away_loss',
    # Batter stance / pitcher throwing hand indicators
    u'stand_L', u'stand_R', u'p_throws_L', u'p_throws_R',
    # Prior pitch-type features ("pb" / "pbs" suffixes)
    u'Not_Fastball_pb_prior', u'Not_Fastball_pbs_prior',
    u'Fastball_pb_prior', u'Fastball_pbs_prior',
    # Prior pitch-type features ("pc" / "pcs" suffixes)
    u'Not_Fastball_pc_prior', u'Not_Fastball_pcs_prior',
    u'Fastball_pc_prior', u'Fastball_pcs_prior',
    # Prior pitch-type features ("pg" / "pgs" suffixes)
    u'Not_Fastball_pg_prior', u'Not_Fastball_pgs_prior',
    u'Fastball_pg_prior', u'Fastball_pgs_prior',
    # Type of the last three pitches
    u'last_pitch_type_Fastball',
    u'last_pitch_type_Not_Fastball',
    u'last_pitch_type_not_available',
    u'second_last_pitch_type_Fastball',
    u'second_last_pitch_type_Not_Fastball',
    u'second_last_pitch_type_not_available',
    u'third_last_pitch_type_Fastball',
    u'third_last_pitch_type_Not_Fastball',
    u'third_last_pitch_type_not_available',
    # Averages over previous pitches
    u'prev_pitches_mean_start_speed',
    u'prev_pitches_mean_end_speed',
    u'prev_pitches_mean_break_y',
    u'prev_pitches_mean_break_angle',
    u'prev_pitches_mean_break_length',
    # Pitch counts and season
    u'ingame_pitch_count', u'season', u'season_pitch_count',
]

In [6]:
#Restrict the train and test feature data to a chosen set of columns
def subset_data(modeling_dict, cols_of_interest):
    """Return a copy of ``modeling_dict`` with its feature data column-subsetted.

    Parameters
    ----------
    modeling_dict : dict-like
        Must contain the keys 'train_data' and 'test_data'; each value is
        indexed with ``cols_of_interest``. All other entries are carried
        over untouched.
    cols_of_interest : list
        Column labels to keep, in the order they should appear.

    Returns
    -------
    A shallow copy of ``modeling_dict`` whose 'train_data' and 'test_data'
    entries are replaced by their column subsets. The input mapping itself
    is not rebound.
    """
    subsetted = modeling_dict.copy()
    # Train first, then test -- the same order the lookups always happened in.
    for split_key in ('train_data', 'test_data'):
        subsetted[split_key] = subsetted[split_key][cols_of_interest]
    return subsetted

In [7]:
# Keep only the columns listed in col_subset in the train and test data.
# Note: this rebinds modeling_data, so the un-subsetted data is no longer
# reachable under that name after this cell runs.
modeling_data = subset_data(modeling_data, col_subset)

In [8]:
#Create the classifier and get an overall accuracy score for kicks
# A fixed random_state makes the forest reproducible across kernel restarts;
# without it the fitted trees (and therefore the reported accuracy) change
# from run to run.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=3,
    min_samples_leaf=5,
    min_samples_split=5,
    max_features=None,
    random_state=42,
)
rf.fit(modeling_data['train_data'], modeling_data['train_targets'])
predictions = rf.predict(modeling_data['test_data'])
# Bare last expression so the notebook displays the accuracy on the test split.
accuracy_score(modeling_data['test_targets'], predictions)

0.62479021817309999

## Saving the Model (rf) That We Just Created

### The function 'save_model' lives in utils.py. Use ?save_model to see the docstring explaining how the function (hopefully) works.

In [9]:
# IPython help syntax: displays the docstring of save_model
?save_model

### The following will save the model 'rf' to the folder 'models/rf_test' and write a log of the event to 'models/record_keeping.csv'

In [10]:
# Persist the fitted forest under the name 'rf_test'; the same name is passed
# to load_model later in this notebook to get the model back.
save_model(model = rf, model_name = 'rf_test')

## Loading a Model Back Into Your Environment

### I'm going to demonstrate two ways to load the model back into your environment. The first uses a simple function that I wrote to search the "record_keeping.csv" file for the model name. The second demonstrates how to load a model if you know the filename of the serialized model.

Method 1: Using the record_keeping.csv to load the model by its name (rf_test)

In [11]:
# Check out the docstring of load_model (IPython `?` help syntax)
?load_model

In [12]:
# Load the model by the name it was saved under ('rf_test') and bind it to a
# new variable so it can be compared with the in-memory rf in the next cell.
rf_test = load_model('rf_test')

In [13]:
# Sanity check: the reloaded model should make exactly the same predictions on
# the test data as the original in-memory model (identical predictions, which
# is evidence for -- not proof of -- identical models).
reloaded_preds = rf_test.predict(modeling_data['test_data'])
original_preds = rf.predict(modeling_data['test_data'])
all(reloaded_preds == original_preds)

True

Method 2: Knowing your model's pickled filename (in this case 'models/rf_test/rf_test.pickle')

In [14]:
#import the method needed to de-serialize a model
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the bundled
# copy only on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#Load the model back, but with a different name
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files you created yourself or otherwise trust.
rf_test2 = joblib.load('models/rf_test/rf_test.pickle')

In [15]:
# Sanity check for the directly unpickled model: its predictions on the test
# data should match those of the original in-memory model element for element.
unpickled_preds = rf_test2.predict(modeling_data['test_data'])
in_memory_preds = rf.predict(modeling_data['test_data'])
all(unpickled_preds == in_memory_preds)

True