In [299]:
# This script is used to create the confusion matrix for the best classifier found from the modelling script
# In each case, HGB was the best classifier, thus that is the only model shown here
# The model is the same HGB model script from the script containing all the classifiers
# Both Cyst vs pRCC and Tumor Grade/Type models are found below, make sure to run the correct one and only the correct one


# import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
import random

from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, StratifiedShuffleSplit

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [300]:
#import the data
df = pd.read_excel('/Users/RyeAnne/Documents/LoewLab/KidneyCancer/FeatureMatrices/FM_CM_Grade.xls') # import .xls file
df = df.sample(frac = 1) 
df.shape
df.head # view the first few rows of data

<bound method NDFrame.head of     label      Mean  Variance   Skewness    Kurtosis    Energy   Entropy  \
7       2  1.021675  0.092855  14.176613  204.739909  0.989841  0.074144   
15      3  1.105425  0.364261   5.678055   33.955817  0.940815  0.366153   
2       2  1.148525  0.595365   5.123756   27.906581  0.928883  0.453395   
8       2  1.005850  0.022916  25.957949  677.234672  0.997004  0.022784   
12      3  1.103425  0.360378   5.813531   35.829910  0.941719  0.377513   
14      3  1.027200  0.105660  11.930552  143.939337  0.986088  0.093635   
19      3  1.081850  0.336751   7.027086   50.996743  0.961251  0.247869   
16      3  1.041200  0.168203   9.923162  100.274341  0.980082  0.131322   
10      2  1.063125  0.226590   7.559896   59.231871  0.965166  0.229476   
18      3  1.123025  0.441540   5.329484   30.000273  0.934015  0.412132   
21      3  1.055100  0.253064   9.126123   85.195224  0.976217  0.160908   
3       2  1.062325  0.242191   8.107291   69.355176  0.96

In [301]:
#split into predictor and response variables
y = df['label'].ravel() # set the dependent variable, which is the column labelled 'label'
x = df.drop(['label'], axis = 1) # set the predictor variables, which is all columns but 'label'
le = preprocessing.LabelEncoder() # turns result into binary "0" or "1" result
y = le.fit_transform(y) # applies le to results

In [302]:
#select the level of features you want to use
selection = SelectPercentile(f_classif, percentile=80).fit(x, y) # makes sure to set the percentile of features you want to use
x = selection.transform(x) # edits predictor variables to only use top x-percentile of features that you specified in the previous line
scaler = StandardScaler().fit(x) # normalizes data
x = scaler.transform(x) # applied normalization
scoring ={ "Accuracy":'accuracy',"AUC":'roc_auc'} # specifies to use AUC as the method of % accuracy, as the data is imbalanced- only use this method to create the model
s = StratifiedShuffleSplit(n_splits=3, random_state=10) # n_splits=3 for Grade/Type, n_splits=10 for Cyst vs pRCC

#selection = SelectPercentile(f_classif, percentile=20).fit(x, y)
#x = selection.transform(x)
#sum(np.array(selection.pvalues_ < 0.05))

In [303]:
# split data into test and train - not used since data set is too small to separate into testing and training, must use whole set for test-okay since previous script used test/train sets and AUC method
#xt, xv, yt, yv = train_test_split(x, y, test_size = 0.25)

In [279]:
# ***Use for Cyst Runs***


# build model-hgb


params = {'learning_rate': [0.001, 0.003, 0.01, 0.03, 0.05, 0.1, 0.2, 0.3], # set first set of parameters
         'max_iter': range(10, 301, 20)}
hgb_search2 = GridSearchCV(HistGradientBoostingClassifier(n_iter_no_change=50),  # first model using variation is first set of parameters
                                   params,cv=s.split(x, y), scoring=scoring, refit='AUC')
hgb_search2.fit(x, y) # fit first model
print('HGB:', hgb_search2.best_score_, hgb_search2.best_params_) # print results of first model

#params = {'l2_regularization': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]} -found not to work well
#hgb_search3 = GridSearchCV(hgb_search2.best_estimator_, params,cv=s.split(x, y), scoring=scoring, refit='AUC')
#hgb_search3.fit(x, y)
#print('HGB:',hgb_search3.best_score_, hgb_search3.best_params_)

params = {'max_depth': [None, 4,5, 6, 7, 8, 9, 10], # second set of parameters to vary
                  'min_samples_leaf': range(2, 41, 2),
                  'max_leaf_nodes': range(2, 41, 2)
                 }
hgb_search1 = GridSearchCV(hgb_search2.best_estimator_, # build model
                                   params, cv=s.split(x, y), scoring=scoring, refit='AUC')
HGB = hgb_search1.fit(x, y) # set final model
cvres = hgb_search1.cv_results_ # results
print('HBG:', hgb_search1.best_score_, hgb_search1.best_params_) # display final accuracy results


HGB: 0.5 {'learning_rate': 0.001, 'max_iter': 10}
HBG: 1.0 {'max_depth': None, 'max_leaf_nodes': 16, 'min_samples_leaf': 4}


In [294]:
# ***Use for Grade/Type Runs***

# Build Model

params = {'learning_rate': [0.001, 0.003, 0.01, 0.03], # set parameters to vary-use exhaustive search
                 'max_iter': [40,60,80,100],
                 'max_depth': [None, 2,4,6],
                  'min_samples_leaf': [1, 2, 3],
                  'max_leaf_nodes': range(2, 41, 5)
                 }
hgb_search2 = GridSearchCV(HistGradientBoostingClassifier(n_iter_no_change=50), # model
                                   params,cv=s.split(x, y), scoring=scoring, refit='AUC')
HGB = hgb_search2.fit(x, y) # fit the model to the data
print('HGB:', hgb_search2.best_score_, hgb_search2.best_params_) # print the model results

HGB: 1.0 {'learning_rate': 0.001, 'max_depth': None, 'max_iter': 60, 'max_leaf_nodes': 32, 'min_samples_leaf': 3}


In [295]:
#prediction on the validation set
y_pred_hgb = HGB.predict(x) # using the model, predict the results of the data


In [304]:
print(y) # display actual results
print('')
print(y_pred_hgb) # display predicted results

# Note-if you have the algorithm classify all data as one type (all 0 or all 1), redo entire method as this was due to an imbalance in the training






In [307]:
#make the confusion matrix
cm_hgb = confusion_matrix(y, (y_pred_hgb>0.5)) # make confusion matrix using truth and predicted results
print('The Confusion Matrix is: ','\n', cm_hgb) # print the confusion matric

In [308]:
#calculate the accuracy on test set
predict_accuracy_on_HGB_test_set = (cm_hgb[0,0] + cm_hgb[1,1])/(cm_hgb[0,0] + cm_hgb[1,1]+cm_hgb[1,0] + cm_hgb[0,1]) # calculate accuracy using the numbers in the positions of the matrix
print('The Accuracy on Test Set is: ', predict_accuracy_on_HGB_test_set) # print the accuracy