In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import time
from matplotlib import cm as cm
import warnings

In [2]:
# Load the CSV and prepare it in one chain:
#   * index_col=False stops pandas from using the first column as the index,
#   * 'diagnosis' is encoded as 1 for 'M' and 0 for every other value,
#   * rows are indexed by 'id',
#   * the 'Unnamed: 32' column is discarded (presumably an empty trailing
#     column in the file -- TODO confirm against data.csv).
data = (
    pd.read_csv('data.csv', index_col=False)
    .assign(diagnosis=lambda frame: frame['diagnosis'].apply(
        lambda label: 1 if label == 'M' else 0))
    .set_index('id')
    .drop(columns='Unnamed: 32')
)

# Target vector and feature matrix as plain NumPy arrays.
Y = data['diagnosis'].to_numpy()
X = data.drop(columns=['diagnosis']).to_numpy()

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [3]:
# Base learners for the voting ensemble, as (name, estimator) pairs.
# Every learner is wrapped in its own Pipeline behind a StandardScaler, so the
# scaling statistics are learned from whatever data that pipeline is fit on
# (e.g. only the training folds during cross-validation).
estimator = [
    (
        'ScaledLR',
        Pipeline([
            ('Scaler', StandardScaler()),
            # NOTE(review): `multi_class` is deprecated in recent scikit-learn
            # releases; it is kept here because removing it may change the
            # fitted model -- confirm against the installed version.
            ('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)),
        ]),
    ),
    (
        'ScaledSVM',
        Pipeline([
            ('Scaler', StandardScaler()),
            # probability=True makes SVC fit an internal cross-validated
            # probability calibration that shuffles the data; random_state pins
            # that shuffle so predict_proba (and therefore soft voting) is
            # reproducible across runs, matching the seed used elsewhere.
            ('SVM', SVC(C=2.0, kernel='rbf', probability=True, random_state=42)),
        ]),
    ),
    (
        'ScaledKNN',
        Pipeline([
            ('Scaler', StandardScaler()),
            ('KNN', KNeighborsClassifier(n_neighbors=11)),
        ]),
    ),
]

In [4]:
# Score every base learner with shuffled, seeded k-fold cross-validation on the
# training split. The mean accuracy of each learner is stored in `weight`, in
# the same order as `estimator`.
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

with warnings.catch_warnings():
    # All warnings raised while cross-validating are suppressed.
    warnings.simplefilter("ignore")
    weight = [
        cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy').mean()
        for _name, model in estimator
    ]

In [5]:
# Display the mean cross-validation accuracy of each base learner, in the same
# order as the `estimator` list.
weight

[0.973719806763285, 0.973719806763285, 0.9581159420289855]

In [6]:
# Soft-voting ensemble over the base learners: class probabilities from each
# learner are combined using `weight` as the per-learner weights.
vot_soft = VotingClassifier(estimators=estimator, voting='soft', weights=weight)
vot_soft.fit(X_train, Y_train)

# Report accuracy and the confusion matrix on the training split, then on the
# held-out test split. After the loop `y_pred` and `score` hold the test values.
for header, features, labels in (
    ("\nTrain Data: \n", X_train, Y_train),
    ("\nTest Data: \n", X_test, Y_test),
):
    y_pred = vot_soft.predict(features)
    score = accuracy_score(labels, y_pred)
    print(header)
    print("Soft Voting Score: " + str(score))
    print(confusion_matrix(labels, y_pred))


Train Data: 

Soft Voting Score: 0.9934065934065934
[[286   0]
 [  3 166]]

Test Data: 

Soft Voting Score: 0.9736842105263158
[[70  1]
 [ 2 41]]


In [9]:
# Re-evaluate the fitted soft-voting ensemble with a lowered decision threshold:
# a sample is labelled class 1 (diagnosis == 'M') whenever its predicted
# probability for that class is at least `threshold`, instead of relying on
# the classifier's own predict().
threshold = 0.2

# Same report for the training split and then the test split. After the loop
# `y_pred` (boolean mask), `ry_pred` (0/1 integer labels) and `score` hold the
# test-split values.
for header, features, labels in (
    ("\nTrain Data: \n", X_train, Y_train),
    ("\nTest Data: \n", X_test, Y_test),
):
    # Column 1 of predict_proba corresponds to class 1 (classes are 0 and 1).
    y_pred = vot_soft.predict_proba(features)[:, 1] >= threshold
    # Vectorised bool -> 0/1 conversion so predictions share the labels' dtype
    # family; the metrics below are computed on these integer predictions.
    ry_pred = y_pred.astype(int)

    score = accuracy_score(labels, ry_pred)
    print(header)
    print("Soft Voting Score: " + str(score))
    print(confusion_matrix(labels, ry_pred))


Train Data: 

Soft Voting Score: 0.978021978021978
[[278   8]
 [  2 167]]

Test Data: 

Soft Voting Score: 0.956140350877193
[[67  4]
 [ 1 42]]
