In [42]:
# importing the dependencies
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')


from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.model_selection import GridSearchCV,cross_val_score

from sklearn.metrics import roc_auc_score,roc_curve


from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve,learning_curve

In [43]:
# get the data
# Loading the dataset
df = pd.read_pickle('../DataFrames/Scaled_label_encoded')

In [44]:
# seperating only the train set
train_set,test_set =  df.iloc[:28322],df.iloc[28322:]

In [45]:
# Making X and y
X = train_set.drop(columns=['label','id'],axis=1)
y = train_set['label']
X = X.iloc[:,[10,4,8,12,6,17,7,5,1,18,13,11,14,0]]

In [46]:
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.3,random_state=42,shuffle=True
)

#### Stacking

In [47]:
from sklearn.ensemble import VotingClassifier
# init the models
rfc = RandomForestClassifier(n_jobs=-1,random_state=42)
svc = SVC(kernel='rbf',C=100,gamma='auto',\
    probability=True)
Dtree = DecisionTreeClassifier(max_depth=5,random_state=42)    

In [48]:
voting_clf = VotingClassifier(
    estimators=[
        ('svc',svc),
        ('rfc',rfc),
        ('DT',Dtree)
    ], voting='soft',n_jobs=-1
)

In [49]:
voting_clf.fit(X_train,y_train)

In [50]:
final_prediction = voting_clf.predict_proba(X_test)[:,1]

### Thresholding

In [51]:
fpr, tpr, thresholds = roc_curve(y_test, final_prediction)
thresholds

array([1.94268834, 0.94268834, 0.92782875, ..., 0.01811402, 0.01788681,
       0.01481738])

In [52]:
from sklearn.metrics import accuracy_score
accuracy_ls = []
for thres in thresholds:
    y_pred = np.where(final_prediction>thres,1,0)
    accuracy_ls.append(accuracy_score(y_test, y_pred, normalize=True))
    
accuracy_ls = pd.concat([pd.Series(thresholds), pd.Series(accuracy_ls)],
                        axis=1)
accuracy_ls.columns = ['thresholds', 'accuracy']
accuracy_ls.sort_values(by='accuracy', ascending=False, inplace=True)
accuracy_ls.head()

Unnamed: 0,thresholds,accuracy
325,0.52016,0.88137
323,0.521427,0.88137
321,0.526129,0.88137
326,0.519507,0.881252
324,0.521406,0.881252


In [55]:
y_pred = (voting_clf.predict_proba(X_test)[:,1] >= 0.52).astype(bool)

In [56]:
roc_auc_score(y_test,y_pred)

0.7049010106603905