# Selection of Right Threshold

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [4]:
# roc curve and auc score
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=2000, n_classes=2, weights=[1,1], random_state=1)

In [6]:
print(X.shape)
print(y)

(2000, 20)
[0 0 0 ... 1 1 0]


In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [9]:
## Apply RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
ytrain_pred = rf_model.predict_proba(X_train)
print('RF train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = rf_model.predict_proba(X_test)
print('RF test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

RF train roc-auc: 1.0
RF test roc-auc: 0.9811944444444445


In [10]:
# Apply logistic regression
from sklearn.linear_model import LogisticRegression
log_classifier=LogisticRegression()
log_classifier.fit(X_train, y_train)
ytrain_pred = log_classifier.predict_proba(X_train)
print('Logistic train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = log_classifier.predict_proba(X_test)
print('Logistic test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

Logistic train roc-auc: 0.9863568922694498
Logistic test roc-auc: 0.9885777777777777


In [11]:
# Apply Adabooster
from sklearn.ensemble import AdaBoostClassifier
ada_classifier=AdaBoostClassifier()
ada_classifier.fit(X_train, y_train)
ytrain_pred = ada_classifier.predict_proba(X_train)
print('Adaboost train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = ada_classifier.predict_proba(X_test)
print('Adaboost test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

Adaboost train roc-auc: 0.9975081174960356
Adaboost test roc-auc: 0.9826111111111111


In [13]:
# Apply KNN Classifire
from sklearn.neighbors import KNeighborsClassifier
knn_classifier=KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
ytrain_pred = knn_classifier.predict_proba(X_train)
print('KNNClassifier train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = knn_classifier.predict_proba(X_test)
print('KNNClassfier test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

KNNClassifier train roc-auc: 0.981670071491109
KNNClassfier test roc-auc: 0.9426111111111111


### now we will selecting the threshold

In [21]:
pred=[]
for model in [rf_model,log_classifier,ada_classifier,knn_classifier]:
    pred.append(pd.Series(model.predict_proba(X_test)[:,1]))
final_predict=pd.concat(pred,axis=1).mean(axis=1)
print('Ensemble test roc-auc: {}'.format(roc_auc_score(y_test,final_predict)))

Ensemble test roc-auc: 0.9846333333333334


In [22]:
pd.concat(pred,axis=1)

Unnamed: 0,0,1,2,3
0,1.00,0.991861,0.559186,1.0
1,0.00,0.000008,0.463282,0.0
2,0.97,0.966929,0.538202,0.8
3,0.93,0.761539,0.509875,0.8
4,0.64,0.779443,0.490344,0.4
...,...,...,...,...
595,0.00,0.024239,0.461121,0.0
596,0.00,0.000003,0.441377,0.0
597,0.99,0.984385,0.532403,1.0
598,0.00,0.001147,0.441720,0.2


In [24]:
final_predict

0      0.887762
1      0.115823
2      0.818783
3      0.750353
4      0.577447
         ...   
595    0.121340
596    0.110345
597    0.876697
598    0.160717
599    0.834857
Length: 600, dtype: float64

In [27]:
fpr,tpr,threshold=roc_curve(y_test,final_predict)
threshold

array([1.9109413 , 0.9109413 , 0.9010034 , 0.89827475, 0.80698866,
       0.80662833, 0.79998136, 0.7980558 , 0.78627646, 0.78287124,
       0.72201656, 0.72137362, 0.71878988, 0.71643711, 0.65787883,
       0.65243537, 0.6065152 , 0.59864346, 0.59656376, 0.59425354,
       0.5896569 , 0.58550386, 0.58058969, 0.57986186, 0.55855932,
       0.55139283, 0.52865858, 0.50366892, 0.4596034 , 0.45472765,
       0.45240225, 0.38590767, 0.37629719, 0.35245613, 0.34836612,
       0.24401541, 0.24140421, 0.20789681, 0.20598417, 0.11790921,
       0.1178351 , 0.10632697])

In [29]:
from sklearn.metrics import accuracy_score
accuracy_ls = []
for thres in threshold:
    y_pred = np.where(final_predict>thres,1,0)
    accuracy_ls.append(accuracy_score(y_test, y_pred, normalize=True))
    
accuracy_ls = pd.concat([pd.Series(threshold), pd.Series(accuracy_ls)],
                        axis=1)
accuracy_ls.columns = ['thresholds', 'accuracy']
accuracy_ls.sort_values(by='accuracy', ascending=False, inplace=True)
accuracy_ls

Unnamed: 0,thresholds,accuracy
29,0.454728,0.961667
30,0.452402,0.96
28,0.459603,0.96
27,0.503669,0.958333
26,0.528659,0.958333
25,0.551393,0.958333
24,0.558559,0.958333
23,0.579862,0.955
22,0.58059,0.953333
21,0.585504,0.948333
