In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict



In [4]:
df = pd.read_csv('protine_ksctriad.csv')
df.head()

Unnamed: 0,0,0.1,1,0.2,0.3,0.4,0.6,0.7,1.1,0.8,...,0.1301,0.1302,0.1303,0.1304,0.1305,0.1306,0.1307,0.1308,0.1309,0.1310
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
2,0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0,0,0,0
3,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0,0,0,0,0,0,0,0,0
4,0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.5,...,0.0,0,0,0,0,0,0,0,0,0


In [5]:
y = df['0']
x = df.drop('0', axis=1) 

In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)

In [7]:
total_Metics = []
total_Metics = pd.DataFrame(total_Metics)
total_Metics['Classifier'] = 'Classifier'
total_Metics['Accuracy'] = 'Accuracy'
total_Metics['mcc'] = 'mcc'
# total_Metics['auc'] = 'auc'
total_Metics['Kappa'] = 'Kappa'
total_Metics['precision'] = 'precision'
total_Metics['recall'] = 'recall'
total_Metics['f1'] = 'f1'
total_Metics['sensitivity'] = 'sensitivity'
total_Metics['specificity'] = 'specificity'

models = [RandomForestClassifier(n_estimators = 200, max_depth = 10),
          XGBClassifier(n_estimators = 200,max_depth = 10, learning_rate = 0.1),
          LGBMClassifier(learning_rate = 0.1,max_depth = 10,random_state = 50),
          GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.5, random_state = 50),
          AdaBoostClassifier(n_estimators = 200, learning_rate = 0.1, random_state = 50)]
cv = KFold(n_splits=5, random_state=1, shuffle=True)
for model in models:
  from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score, matthews_corrcoef, roc_auc_score, cohen_kappa_score
  # evaluate model
  # scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
  # model.fit(xtrain, ytrain)
  # pred = model.predict(xtest)
  pred = cross_val_predict(model, xtrain, ytrain, cv=cv, n_jobs=-1)

  # cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(ytrain, pred)
  mcc = matthews_corrcoef(ytrain, pred)
  cm1 = confusion_matrix(ytrain, pred)
  kappa = cohen_kappa_score(ytrain, pred)
  f1 = f1_score(ytrain, pred)
  precision_score = precision_score(ytrain, pred)
  recall_score = recall_score(ytrain, pred)
  sensitivity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  specificity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  # y_pred = np.argmax(y_pred, axis=0)
  # auc = roc_auc_score(y, y_pred, multi_class='ovr')
  total_Metics.loc[len(total_Metics.index)] = [model,Accuracy, mcc, kappa, precision_score,recall_score, f1, sensitivity,specificity]

print(total_Metics)

                                          Classifier  Accuracy       mcc  \
0  RandomForestClassifier(max_depth=10, n_estimat...  0.614845 -0.062442   
1  XGBClassifier(base_score=None, booster=None, c...  0.555667 -0.006562   
2      LGBMClassifier(max_depth=10, random_state=50)  0.562688  0.004330   
3  GradientBoostingClassifier(learning_rate=0.5, ...  0.573721  0.043121   
4  AdaBoostClassifier(learning_rate=0.1, n_estima...  0.595787  0.008585   

      Kappa  precision    recall        f1  sensitivity  specificity  
0 -0.024068   0.173913  0.010840  0.020408     0.969745     0.010840  
1 -0.006408   0.364964  0.271003  0.311042     0.722930     0.271003  
2  0.004206   0.373585  0.268293  0.312303     0.735669     0.268293  
3  0.042397   0.402778  0.314363  0.353120     0.726115     0.314363  
4  0.007117   0.380282  0.146341  0.211350     0.859873     0.146341  


In [8]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(xtrain, ytrain)
total_Metics = []
total_Metics = pd.DataFrame(total_Metics)
total_Metics['Classifier'] = 'Classifier'
total_Metics['Accuracy'] = 'Accuracy'
total_Metics['mcc'] = 'mcc'
# total_Metics['auc'] = 'auc'
total_Metics['Kappa'] = 'Kappa'
total_Metics['precision'] = 'precision'
total_Metics['recall'] = 'recall'
total_Metics['f1'] = 'f1'
total_Metics['sensitivity'] = 'sensitivity'
total_Metics['specificity'] = 'specificity'

models = [RandomForestClassifier(n_estimators = 200, max_depth = 10),
          XGBClassifier(n_estimators = 200,max_depth = 10, learning_rate = 0.1),
          LGBMClassifier(learning_rate = 0.1,max_depth = 10,random_state = 50),
          GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.5, random_state = 50),
          AdaBoostClassifier(n_estimators = 200, learning_rate = 0.1, random_state = 50)]
cv = KFold(n_splits=5, random_state=1, shuffle=True)
for model in models:
  from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score, matthews_corrcoef, roc_auc_score, cohen_kappa_score
  # evaluate model
  # scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
  # model.fit(xtrain, ytrain)
  # pred = model.predict(xtest)
  pred = cross_val_predict(model, x_ros, y_ros, cv=cv, n_jobs=-1)

  # cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y_ros, pred)
  mcc = matthews_corrcoef(y_ros, pred)
  cm1 = confusion_matrix(y_ros, pred)
  kappa = cohen_kappa_score(y_ros, pred)
  f1 = f1_score(y_ros, pred)
  precision_score = precision_score(y_ros, pred)
  recall_score = recall_score(y_ros, pred)
  sensitivity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  specificity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  # y_pred = np.argmax(y_pred, axis=0)
  # auc = roc_auc_score(y, y_pred, multi_class='ovr')
  total_Metics.loc[len(total_Metics.index)] = [model,Accuracy, mcc, kappa, precision_score,recall_score, f1, sensitivity,specificity]

print(total_Metics)

                                          Classifier  Accuracy       mcc  \
0  RandomForestClassifier(max_depth=10, n_estimat...  0.707006  0.414267   
1  XGBClassifier(base_score=None, booster=None, c...  0.689490  0.380608   
2      LGBMClassifier(max_depth=10, random_state=50)  0.699841  0.401006   
3  GradientBoostingClassifier(learning_rate=0.5, ...  0.682325  0.367422   
4  AdaBoostClassifier(learning_rate=0.1, n_estima...  0.604299  0.208739   

      Kappa  precision    recall        f1  sensitivity  specificity  
0  0.414013   0.700000  0.724522  0.712050     0.689490     0.724522  
1  0.378981   0.673469  0.735669  0.703196     0.643312     0.735669  
2  0.399682   0.684831  0.740446  0.711553     0.659236     0.740446  
3  0.364650   0.662411  0.743631  0.700675     0.621019     0.743631  
4  0.208599   0.600614  0.622611  0.611415     0.585987     0.622611  
