In [1]:
# Read in the models selected by the grid search.

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split   
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,precision_recall_curve,confusion_matrix 
from sklearn.metrics import (precision_score,accuracy_score,roc_auc_score,roc_curve, 
                             precision_recall_curve,recall_score,make_scorer,auc) 
from scipy.stats import boxcox, skew
import pickle
%matplotlib inline

In [5]:
class Read_pickled_madelon_Datasets(object):
    """Reader for the pickled Madelon train/validation splits written by NB1.

    Every pickle is loaded once, at construction time. The ``get_*`` methods
    return the loaded objects; the y getters additionally flatten the labels
    into 1D NumPy arrays.

    NOTE: ``pd.read_pickle`` can execute arbitrary code while unpickling, so
    ``data_dir`` must only ever point at files from a trusted source.
    """

    def __init__(self, data_dir='../dataNB1/'):
        """Load all datasets found under ``data_dir``.

        Parameters
        ----------
        data_dir : str, optional
            Prefix prepended verbatim to every pickle file name, so it must
            end with a path separator. Defaults to the NB1 output directory.
        """
        self.data_dir = data_dir

        # No test-set pickle is read by this class; get_X_test_data() reports
        # that explicitly instead of failing on a missing attribute.
        self.X_test_data_df = None

        self.read_all_data_from_files()

    def _read(self, filename):
        """Unpickle ``filename`` from the configured data directory."""
        return pd.read_pickle(self.data_dir + filename)

    @staticmethod
    def _to_label_array(labels):
        """Return ``labels`` as a 1D array holding the first element of each row.

        Objects without a ``.values`` attribute are taken to be arrays that an
        earlier call already converted and are returned unchanged. This is what
        makes the y getters safe to call more than once.
        """
        if not hasattr(labels, 'values'):
            return labels
        return np.array([val[0] for val in labels.values])

    def read_all_data_from_files(self):
        """Read the three training subsets and the validation set (X and y)."""
        self.X_train_data_df_1 = self._read('X_train_data_df_1.p')
        self.X_train_data_df_2 = self._read('X_train_data_df_2.p')
        self.X_train_data_df_3 = self._read('X_train_data_df_3.p')

        self.X_valid_data_df = self._read('X_valid_data_df.p')

        self.y_train_data_1 = self._read('y_train_data_1.p')
        self.y_train_data_2 = self._read('y_train_data_2.p')
        self.y_train_data_3 = self._read('y_train_data_3.p')

        self.y_valid_data = self._read('y_valid_data.p')

    def get_X_train_data_sets(self):
        """Return the three training feature sets as a tuple (subset 1, 2, 3)."""
        return (self.X_train_data_df_1, self.X_train_data_df_2, self.X_train_data_df_3)

    def get_X_valid_data(self):
        """Return the validation feature set."""
        return self.X_valid_data_df

    def get_X_test_data(self):
        """Return the test feature set, if one has been assigned.

        Raises
        ------
        AttributeError
            If ``X_test_data_df`` was never assigned. This class does not read
            a test-set pickle itself.
        """
        X_test = getattr(self, 'X_test_data_df', None)
        if X_test is None:
            raise AttributeError(
                "X_test_data_df has not been loaded: read_all_data_from_files() "
                "does not read a test-set pickle. Assign X_test_data_df before "
                "calling get_X_test_data()."
            )
        return X_test

    def get_y_train_data(self):
        """Return the three training label sets as 1D arrays (subset 1, 2, 3).

        The labels are read as one-column frames whose rows each wrap a single
        int; they are flattened so every element is a plain label. The converted
        arrays are stored back on the instance, and repeated calls return them
        as-is.
        """
        self.y_train_data_1 = self._to_label_array(self.y_train_data_1)
        self.y_train_data_2 = self._to_label_array(self.y_train_data_2)
        self.y_train_data_3 = self._to_label_array(self.y_train_data_3)

        return (self.y_train_data_1, self.y_train_data_2, self.y_train_data_3)

    def get_y_valid_data(self):
        """Return the validation labels as a 1D array.

        Same flattening as ``get_y_train_data``; safe to call repeatedly.
        """
        self.y_valid_data = self._to_label_array(self.y_valid_data)

        return self.y_valid_data


In [6]:
# Read in the pickled Madelon data. Both the train subsets and the validation set are
# pulled from this reader in the cells below.
read_data = Read_pickled_madelon_Datasets()

In [7]:
# Training feature frames for subsets 1-3, kept together in a list and also
# bound to individual names.
X_train_subsets = list(read_data.get_X_train_data_sets())
X_train_data_df_1, X_train_data_df_2, X_train_data_df_3 = X_train_subsets

# Matching training label arrays, in the same subset order.
y_train_subsets = list(read_data.get_y_train_data())
y_train_1, y_train_2, y_train_3 = y_train_subsets

# Pickled validation features and labels.
X_validate = read_data.get_X_valid_data()
y_validate = read_data.get_y_valid_data()

In [11]:
# Load the pickled data subsets that contain only the selected features.
sel_feats_train_pickles = (
    '../dataNB3/X_train_sel_feats_subsets_df1.p',
    '../dataNB3/X_train_sel_feats_subsets_df2.p',
    '../dataNB3/X_train_sel_feats_subsets_df3.p',
)
X_train_sel_feats_subsets_df = [
    pd.read_pickle(pickle_path) for pickle_path in sel_feats_train_pickles
]

# Test data restricted to the same selected features.
Xtest_sel_feats = pd.read_pickle('../dataNB3/X_test_sel_feats_df.p')

In [12]:
# Load the pickled table of model results; the display below shows one row per
# dataset/model pair with its scores and the pipeline object.
model_results_df = pd.read_pickle('../dataNB3/FinalModels_GOOD_FINAL.obj')

In [13]:
# Grid-search results from Notebook 3. The grid search was performed on the entire UCI
# training set of 2000 rows. The test data used was UCI's X valid and y valid data.

model_results_df

Unnamed: 0,dataset,model,sc_tr_acc_score,sc_tst_acc_score,sc_precision,sc_recall,pipe_object
0,Set_1,LogisticRegression,0.614,0.591667,0.591362,0.593333,"Pipeline(steps=[('standardscaler', StandardSca..."
1,Set_1,SVC,0.987,0.915,0.902913,0.93,"Pipeline(steps=[('standardscaler', StandardSca..."
2,Set_1,DecisionTree,0.87,0.795,0.767372,0.846667,"Pipeline(steps=[('standardscaler', StandardSca..."
3,Set_1,KNN,1.0,0.916667,0.905844,0.93,"Pipeline(steps=[('standardscaler', StandardSca..."
4,Set_2,LogisticRegression,0.614,0.591667,0.591362,0.593333,"Pipeline(steps=[('standardscaler', StandardSca..."
5,Set_2,SVC,0.987,0.915,0.902913,0.93,"Pipeline(steps=[('standardscaler', StandardSca..."
6,Set_2,DecisionTree,0.87,0.801667,0.773414,0.853333,"Pipeline(steps=[('standardscaler', StandardSca..."
7,Set_2,KNN,1.0,0.916667,0.905844,0.93,"Pipeline(steps=[('standardscaler', StandardSca..."
8,Set_3,LogisticRegression,0.614,0.591667,0.591362,0.593333,"Pipeline(steps=[('standardscaler', StandardSca..."
9,Set_3,SVC,0.987,0.915,0.902913,0.93,"Pipeline(steps=[('standardscaler', StandardSca..."


In [None]:
# Final Best models retrieved from gridsearch.

In [12]:
# Pipeline object stored in the results row labelled 11.
model_results_df.loc[11, 'pipe_object']

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [13]:
# Pipeline object stored in the results row labelled 9.
model_results_df.loc[9, 'pipe_object']

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [None]:
# Features selected by the feature selection methods in NB2 and NB3 (20 features).

In [14]:
# Column labels of the first selected-features training subset; used below to
# restrict the full train and validation frames to the same columns.
features_selected = X_train_sel_feats_subsets_df[0].columns
features_selected

Int64Index([ 28,  48,  64, 105, 128, 153, 241, 281, 318, 336, 338, 378, 433,
            442, 451, 453, 455, 472, 475, 493],
           dtype='int64')

In [15]:
# Load the complete pickled UCI training dataset (features and labels).
full_train_dir = '../dataNB1/'
X_train_data_df = pd.read_pickle(full_train_dir + 'X_train_data_df.p')
y_train_data_df = pd.read_pickle(full_train_dir + 'y_train_data.p')

# Flatten the labels to a 1D array by taking the first element of every row.
y_train_data_arr = np.array([row[0] for row in y_train_data_df.values])

In [16]:
# Use all of UCI's train data, restricted to the selected feature columns.
Xtrain = X_train_data_df[features_selected]

# Labels for the full training set (the flattened y_train_data_arr), not a concatenation of subsets.
ytrain = y_train_data_arr

# X test and y test are the validation data from the reader, restricted to the same selected
# features. The shape check in the next cell reports 600 rows.
Xtest = X_validate[features_selected]

ytest = y_validate

In [22]:
# Sanity check: the feature and label row counts should match within train and within test.
Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape

((2000, 20), (2000,), (600, 20), (600,))

In [18]:
# Preview the first rows of the training features (selected columns only).
Xtrain.head()

Unnamed: 0,28,48,64,105,128,153,241,281,318,336,338,378,433,442,451,453,455,472,475,493
0,459,440,648,181,452,575,434,517,414,658,628,419,533,568,463,471,630,515,401,485
1,475,499,488,431,473,404,551,435,469,469,528,526,442,463,474,311,582,465,549,338
2,491,460,485,593,487,585,474,535,506,465,431,464,569,503,481,606,424,485,454,650
3,472,529,415,698,493,591,569,526,458,398,377,553,565,447,472,545,456,457,602,572
4,472,429,387,451,475,448,538,456,462,385,509,424,462,536,472,426,465,500,560,435


In [19]:
# Preview the first rows of the test features (selected columns only).
Xtest.head()

Unnamed: 0,28,48,64,105,128,153,241,281,318,336,338,378,433,442,451,453,455,472,475,493
0,490,436,450,420,472,409,541,432,513,418,523,423,427,444,486,300,548,454,538,259
1,491,544,629,541,480,567,456,519,522,626,484,580,559,414,484,523,547,439,429,534
2,479,437,426,500,480,485,517,471,482,383,485,432,485,526,477,479,457,494,517,495
3,472,447,574,314,463,405,425,426,458,560,700,443,427,649,472,453,525,540,386,463
4,469,501,499,395,471,417,537,434,451,483,609,517,448,518,470,368,570,487,527,343


In [20]:
# First 25 training labels.
ytrain[:25]

array([-1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1,
        1, -1,  1, -1,  1, -1,  1, -1])

In [21]:
# First 25 test labels.
ytest[:25]

array([-1, -1, -1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,
        1, -1, -1, -1, -1, -1,  1,  1])

In [25]:
# Collects one dict of scores per fitted final model; tabulated at the end of the notebook.
final_model_results = []
# Display names passed to classification_report.
# NOTE(review): the label previews above contain -1 and 1, so 'class 0' presumably
# stands for label -1 - confirm against the classification_report documentation.
target_names=['class 0', 'class 1']

In [26]:
# FINAL MODEL: KNN pipeline (standardize -> PCA -> 5-nearest-neighbours, uniform weights).
# The former `for tst_set in tqdm(range(1))` wrapper ran exactly once and never used
# its loop variable, so the steps now run directly at cell level.
KNN_scaled_pipe = make_pipeline(StandardScaler(),
                                PCA(),
                                KNeighborsClassifier(n_neighbors=5, weights='uniform'))

# Fit on the full training set.
KNN_scaled_pipe.fit(Xtrain, ytrain)

# Mean accuracy on the training data.
train_score = KNN_scaled_pipe.score(Xtrain, ytrain)

# Predict the test set once and derive every test metric from those predictions.
# accuracy_score on the predictions is the same mean accuracy that .score() returns,
# without running the KNN prediction a second time.
y_pred = KNN_scaled_pipe.predict(Xtest)
test_score = accuracy_score(ytest, y_pred)

# Precision and recall for the positive class (scikit-learn's default pos_label=1).
recall = recall_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)

output = {'train_score': train_score,
          'test_score' : test_score,
          'recall'     : recall,
          'precision'  : precision,
          'model_name' : 'KNN'}

# Replace any KNN entry left by an earlier run of this cell so that re-running does
# not add duplicate rows. Slice assignment keeps the same list object.
final_model_results[:] = [res for res in final_model_results
                          if res.get('model_name') != 'KNN']
final_model_results.append(output)

# Classification report.
class_report_KNN_str = classification_report(ytest, y_pred, target_names=target_names)
print("********* KNN ********\n\n", class_report_KNN_str)



100%|██████████| 1/1 [00:00<00:00, 13.98it/s]

********* KNN ********

              precision    recall  f1-score   support

    class 0       0.92      0.90      0.91       300
    class 1       0.90      0.93      0.91       300

avg / total       0.91      0.91      0.91       600






In [27]:
# Tabulate the collected per-model scores (one row per entry in final_model_results).
pd.DataFrame(final_model_results)

Unnamed: 0,model_name,precision,recall,test_score,train_score
0,KNN,0.902597,0.926667,0.913333,0.9365
