In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV



In [2]:
df = pd.read_csv('protine_apaac.csv')
y = df['0']
x = df.drop('0', axis=1)
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(x, y)
xtrain, xtest, ytrain, ytest = train_test_split(x_ros, y_ros, test_size=0.3)

In [3]:
ytrain.value_counts()

0
1    641
0    624
Name: count, dtype: int64

Balancing

In [4]:
model_params = {
    # 'random_forest': {
    #     'model': RandomForestClassifier(),
    #     'params':{
    #         'n_estimators':range(10,1000),
    #         'max_depth':range(1,100)
    #     }
    # },
    'XGBclassifire': {
        'model': XGBClassifier(),
        'params':{
            'n_estimators':range(10,1000),
            'max_depth':range(1,100),
            'learning_rate':[0.1]
        }
    },
    'LGBMClassifier':{
        'model': LGBMClassifier(),
        'params':{
            'learning_rate':[0.1],
            'max_depth':range(1,100),
            'random_state':[25,50,100,150,200]
        }
    },
    'GradientBoostingClassifier':{
        'model': GradientBoostingClassifier(),
        'params':{
            'n_estimators':range(10,1000),
            'learning_rate':[0.1],
            'random_state':range(10,500)
        }
    },
    'AdaBoostClassifier':{
        'model': AdaBoostClassifier(),
        'params':{
            'n_estimators':range(10,1000),
            'learning_rate':[0.1],
            'random_state':range(10,500)
        }
    }
    # 'logistic_regression': {
    #     'model': LogisticRegression(solver='liblinear',multi_class='auto'),
    #     'params':{
    #         'C':[1,5,10]
    #     }
    # }
}

In [None]:
scores = []
for y,x in model_params.items():
    from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score, matthews_corrcoef, roc_auc_score, cohen_kappa_score
  # evaluate model
  # scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
  # model.fit(xtrain, ytrain)
  # pred = model.predict(xtest)
    pred = cross_val_predict(x['model'], xtrain, ytrain, cv=5, n_jobs=-1)

  # cm1 = confusion_matrix(y, y_pred)
  # report performance
    Accuracy = accuracy_score(ytrain, pred)
    mcc = matthews_corrcoef(ytrain, pred)
    cm1 = confusion_matrix(ytrain, pred)
    kappa = cohen_kappa_score(ytrain, pred)
    f1 = f1_score(ytrain, pred)
    precision_score = precision_score(ytrain, pred)
    recall_score = recall_score(ytrain, pred)
    sensitivity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity = cm1[1,1]/(cm1[1,0]+cm1[1,1])


for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'], cv=5, return_train_score=False)
    clf.fit(xtrain, ytrain)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'Accuracy': Accuracy,
        'mcc': mcc,
        'kappa': kappa,
        'f1': f1,
        'precision_score': precision_score,
        'recall_score': recall_score,
        'sensitivity': sensitivity,
        'specificity': specificity
    })

In [None]:
df = pd.DataFrame(scores)
df