Jupyter Notebook, Step 4 - Build Model

Implement your final model
(Optionally) use the entire data set

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split   
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,precision_recall_curve,confusion_matrix 
from sklearn.metrics import (precision_score,accuracy_score,roc_auc_score,roc_curve, 
                             precision_recall_curve,recall_score,make_scorer,auc) 
from scipy.stats import boxcox, skew
import pickle
%matplotlib inline

In [14]:
class Read_pickled_madelon_Datasets(object):
    """Loads the pickled Madelon train, validation and test splits from disk.

    Every file is read once, at construction time; the ``get_*`` accessors
    then hand back the already-loaded objects.

    Parameters
    ----------
    data_dir : str, optional
        Prefix that is concatenated verbatim with each file name, so it must
        end with a path separator. Defaults to ``'../dataNB1_DB/'``, the
        directory holding the pickles from NB1.
    """

    # Initializing object.
    def __init__(self, data_dir='../dataNB1_DB/'):
        # Read pickled data from NB1.
        self.data_dir = data_dir

        self.read_all_data_from_files()

    def _read_pickle(self, file_name):
        """Unpickle ``file_name`` located under ``self.data_dir``."""
        # NOTE(security): unpickling can execute arbitrary code, so data_dir
        # should only ever point at files from a trusted source.
        return pd.read_pickle(self.data_dir + file_name)

    @staticmethod
    def _to_label_array(labels):
        """Return ``labels`` as a NumPy array holding the first entry of each row.

        ``labels`` is expected to expose ``.values`` (a one-column DataFrame).
        An object that is already an ndarray is returned unchanged, which is
        what makes the label getters safe to call more than once.
        """
        if isinstance(labels, np.ndarray):
            return labels
        return np.array([row[0] for row in labels.values])

    def read_all_data_from_files(self):
        """Read every X / y pickle from ``self.data_dir`` into attributes."""
        # Read train data.
        self.X_train_data_df_1 = self._read_pickle('X_train_1_df.p')
        self.X_train_data_df_2 = self._read_pickle('X_train_2_df.p')
        self.X_train_data_df_3 = self._read_pickle('X_train_3_df.p')

        # X test data.
        self.X_test_df = self._read_pickle('X_test_df.p')

        # X validate data.
        self.X_validate_df = self._read_pickle('X_validate_df.p')

        # y train data.
        self.y_train_data_1 = self._read_pickle('y_train_1.p')
        self.y_train_data_2 = self._read_pickle('y_train_2.p')
        self.y_train_data_3 = self._read_pickle('y_train_3.p')

        # y test data.
        self.y_test_data_1 = self._read_pickle('y_test.p')

    def get_X_train_data_sets(self):
        """Return the three X train subsets as a tuple, in subset order 1, 2, 3."""
        return (self.X_train_data_df_1, self.X_train_data_df_2, self.X_train_data_df_3)

    def get_X_valid_data(self):
        """Return the X validation data."""
        return self.X_validate_df

    def get_X_test_data(self):
        """Return the X test data."""
        return self.X_test_df

    def get_y_train_data(self):
        """Return the three y train subsets as a tuple of NumPy arrays.

        The y data is read as a one-column DataFrame whose rows each wrap a
        single label; each row is unwrapped to its first entry. The converted
        arrays replace the raw objects on the instance (as before), and the
        conversion is skipped for attributes that are already arrays, so
        re-running the calling cell no longer raises AttributeError.
        """
        self.y_train_data_1 = self._to_label_array(self.y_train_data_1)
        self.y_train_data_2 = self._to_label_array(self.y_train_data_2)
        self.y_train_data_3 = self._to_label_array(self.y_train_data_3)

        return (self.y_train_data_1, self.y_train_data_2, self.y_train_data_3)

    def get_y_test_data(self):
        """Return the y test labels as a NumPy array (also stored as ``self.y_test``).

        Uses the same row-unwrapping as ``get_y_train_data``; the raw
        ``self.y_test_data_1`` object is left untouched.
        """
        self.y_test = self._to_label_array(self.y_test_data_1)

        return self.y_test

In [15]:
# Read in the pickled DB Madelon data. Constructing the reader loads every file up front:
# the three X/y train subsets plus the X validate, X test and y test data.
read_data = Read_pickled_madelon_Datasets()

In [16]:
# Pull the three X train subsets out of the reader, keeping them both as a
# list and under their individual names.
X_train_subsets = list(read_data.get_X_train_data_sets())
X_train_data_df_1, X_train_data_df_2, X_train_data_df_3 = X_train_subsets

# Same for the matching y train label arrays.
y_train_subsets = list(read_data.get_y_train_data())
y_train_1, y_train_2, y_train_3 = y_train_subsets

# Held-out X and y test data.
Xtest = read_data.get_X_test_data()
ytest = read_data.get_y_test_data()

In [17]:
# Load the pickled train subsets that contain only the selected features,
# one DataFrame per subset, in subset order.
X_train_sel_feats_subsets_df = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../dataNB3_DB/X_train_sel_feats_subsets_df1.p',
        '../dataNB3_DB/X_train_sel_feats_subsets_df2.p',
        '../dataNB3_DB/X_train_sel_feats_subsets_df3.p',
    )
]

# Test data restricted to the same selected features.
Xtest_sel_feats = pd.read_pickle('../dataNB3_DB/X_test_sel_feats_df.p')

In [18]:
# Load the pickled model-results table (the gridsearch results from Notebook 3).
model_results_df = pd.read_pickle('../dataNB3_DB/FinalModels_GOOD_df.p')

In [19]:
# Gridsearch results from Notebook 3. The gridsearch was performed on 3 sets of data
# (Set_1, Set_2, Set_3); each row holds one dataset/model combination with its
# train/test accuracy, precision, recall and the fitted pipeline object.

model_results_df

Unnamed: 0,dataset,model,sc_tr_acc_score,sc_tst_acc_score,sc_precision,sc_recall,pipe_object
0,Set_1,LogisticRegression,0.5985,0.597333,0.58745,0.598639,"Pipeline(steps=[('standardscaler', StandardSca..."
1,Set_1,SVC,0.95375,0.791333,0.782086,0.795918,"Pipeline(steps=[('standardscaler', StandardSca..."
2,Set_1,DecisionTree,0.73125,0.674,0.663564,0.678912,"Pipeline(steps=[('standardscaler', StandardSca..."
3,Set_1,KNN,1.0,0.792667,0.784946,0.794558,"Pipeline(steps=[('standardscaler', StandardSca..."
4,Set_2,LogisticRegression,0.60425,0.595333,0.587432,0.585034,"Pipeline(steps=[('standardscaler', StandardSca..."
5,Set_2,SVC,0.94775,0.809333,0.798141,0.817687,"Pipeline(steps=[('standardscaler', StandardSca..."
6,Set_2,DecisionTree,0.77425,0.702,0.685567,0.72381,"Pipeline(steps=[('standardscaler', StandardSca..."
7,Set_2,KNN,1.0,0.801333,0.784135,0.820408,"Pipeline(steps=[('standardscaler', StandardSca..."
8,Set_3,LogisticRegression,0.5975,0.594,0.580769,0.616327,"Pipeline(steps=[('standardscaler', StandardSca..."
9,Set_3,SVC,0.953,0.96,0.951807,0.967347,"Pipeline(steps=[('standardscaler', StandardSca..."


In [22]:
# Show the pipeline stored in the row labelled 11 of the results table.
model_results_df.loc[11, 'pipe_object']

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [50]:
# Collects one dict of metrics per fitted model; it is turned into a DataFrame at the end.
model_results = []
# Display names handed to classification_report for the two classes.
target_names=['class 0', 'class 1']

In [51]:
# Use all three train subsets stacked together as the X train data (12000 rows in total).
# This X data has been reduced to only the selected features. PCA is applied to it
# later, inside the model pipeline.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated in
# pandas 1.4 and removed in pandas 2.0. Row order and index labels are unchanged.
Xtrain = pd.concat(X_train_sel_feats_subsets_df)

# y train is the y data for the same three subsets, joined in the same order.
# np.ravel keeps the flattening that np.append applied to each piece.
ytrain = np.concatenate([np.ravel(y_subset) for y_subset in y_train_subsets])

# X test and y test are pickled data that I set aside for this final step. They are 1500 rows.
# Swap the full-feature Xtest for the selected-features version; ytest was already
# loaded from the reader and needs no reassignment.
Xtest = Xtest_sel_feats

In [52]:
# Sanity check: X and y must have the same number of rows within each split, and
# the train and test X frames must have the same number of feature columns.
Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape

((12000, 19), (12000,), (1500, 19), (1500,))

In [53]:
for tst_set in tqdm(range(1)):

    # FINAL MODEL: standardise, rotate with PCA, then classify with 5 nearest neighbours.
    KNN_scaled_pipe = make_pipeline(
        StandardScaler(),
        PCA(),
        KNeighborsClassifier(n_neighbors=5, weights='uniform'),
    )

    # Train on the stacked training data.
    KNN_scaled_pipe.fit(Xtrain, ytrain)

    # Pipeline.score reports mean accuracy on the given data.
    train_score = KNN_scaled_pipe.score(Xtrain, ytrain)
    test_score = KNN_scaled_pipe.score(Xtest, ytest)

    # Test-set predictions feed the remaining metrics and the report.
    y_pred = KNN_scaled_pipe.predict(Xtest)
    recall = recall_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred)

    # Record this model's metrics in the shared results list.
    output = dict(
        train_score=train_score,
        test_score=test_score,
        recall=recall,
        precision=precision,
        model_name='KNN',
    )
    model_results.append(output)

    # Per-class precision / recall / f1 summary.
    class_report_KNN_str = classification_report(
        ytest, y_pred, target_names=target_names
    )

    print("********* KNN ********\n\n", class_report_KNN_str)
    
    

100%|██████████| 1/1 [00:00<00:00,  1.71it/s]

********* KNN ********

              precision    recall  f1-score   support

    class 0       0.88      0.85      0.86       765
    class 1       0.85      0.88      0.86       735

avg / total       0.86      0.86      0.86      1500






In [54]:
# Tabulate the collected metrics: one row per dict appended to model_results.
pd.DataFrame(model_results)

Unnamed: 0,model_name,precision,recall,test_score,train_score
0,KNN,0.850727,0.87619,0.864,0.873167
