In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix #for model evaluation
import joblib, os


# Maps every saved-model key to the display name of its algorithm.
# Each algorithm contributes two keys: the bare stem and the stem + '_im'.
models_transfer = {
    key: algorithm
    for stem, algorithm in (
        ('knn', 'KNN'),
        ('svm', 'SVM'),
        ('random', 'Random Forest'),
        ('gb', 'Gradient Boosting'),
        ('mlp', 'MLP'),
        ('nb', 'Naive Bayes'),
    )
    for key in (stem, stem + '_im')
}


# Maps every saved-model key to its variant tag: keys ending in '_im' are
# tagged 'Imbalanced', all other keys 'Balanced'.
# NOTE(review): presumably 'Balanced' means trained on SMOTE-resampled data - confirm.
models_names = {
    key: ('Imbalanced' if key.endswith('_im') else 'Balanced')
    for stem in ('knn', 'svm', 'random', 'gb', 'mlp', 'nb')
    for key in (stem, stem + '_im')
}


# Run identifier: interpolated into the model file names read by load() and into
# the output directory / file names used for reports and confusion matrices.
title = '156selected_156_1130'
# KFOLD and N_JOBS are not referenced by any code visible in this notebook.
# NOTE(review): presumably the cross-validation fold count and the number of
# parallel workers used when the models were trained - confirm or remove.
KFOLD = 10
N_JOBS = 10


def mkdir(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Leading/trailing whitespace and trailing backslashes are removed from
    ``path`` before it is used.

    Returns:
        True if the directory was created by this call, False if something
        already existed at that location.
    """
    target = path.strip().rstrip("\\")

    # Nothing to do when the path is already present on disk.
    if os.path.exists(target):
        return False

    os.makedirs(target)
    return True


def preprocess(countpath, testsize = 0.2, randomstate = 1):
    """Load the sample matrix from CSV and split it into train and test sets.

    The CSV is read with its first column as the index; that index supplies
    the class label of every row, and the remaining columns are the features.
    The share of each class in the full data set is printed.

    Args:
        countpath: Path of the CSV file to read.
        testsize: Passed to ``train_test_split`` as ``test_size``.
        randomstate: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, featurenames, label, df)``:
        the split features and targets, the array of feature (column) names,
        the distinct class labels in order of first appearance, and the full
        frame with an added ``target`` column.

    Note:
        ``featurenames`` and ``df`` are captured before columns containing
        NaN are dropped, so they may list columns absent from ``x_train`` /
        ``x_test``.
    """
    raw = pd.read_csv('%s' % countpath, index_col=0)

    # Feature names are the CSV columns, i.e. the index of the transposed frame.
    features_by_row = raw.T
    featurenames = np.array(features_by_row.index)

    # Transpose back: one row per sample, indexed by its class label.
    df = features_by_row.T
    row_labels = list(np.array(df.index))
    # Distinct labels, ordered by where each one first occurs in the file.
    label = sorted(list(set(row_labels)), key = row_labels.index)

    df['target'] = row_labels

    total_rows = len(df['target'])
    for classname in label:
        class_rows = int((df['target'] == classname).sum())
        print("Proportion of family {0}: {1:.2f}%".format(classname, (class_rows / total_rows * 100)))

    y = df['target'].values
    features = df.drop(columns=['target'])

    # Drop every feature COLUMN that contains at least one NaN (axis=1).
    features = features.dropna(axis=1, how='any')

    x_train, x_test, y_train, y_test = train_test_split(
        features, y, test_size = testsize, random_state = randomstate
    )

    return x_train, y_train, x_test, y_test, featurenames, label, df


def counting(label, x_train):
    """Count how many rows of ``x_train`` carry each class label.

    The class of a row is taken from ``x_train.index``. The index is tallied
    in a single pass instead of being rescanned once per label.

    Args:
        label: Iterable of class labels to report.
        x_train: Object whose ``index`` attribute iterates over the class
            label of every training row (e.g. a pandas DataFrame).

    Returns:
        Dict mapping each entry of ``label`` to its number of occurrences in
        ``x_train.index`` (0 when absent), in the order given by ``label``.
        The dict is also printed.
    """
    # Local import so the function does not rely on a file-level import.
    from collections import Counter

    tally = Counter(x_train.index)
    # Counter returns 0 for labels that never occur in the index.
    per_class = {name: tally[name] for name in label}
    print(per_class)

    return per_class


def smote_dict(label, counting):
    """Derive a per-class target sample count from the current class counts.

    Each class count is multiplied by a factor chosen from its size:
    more than 1000 -> x1, more than 700 -> x2, more than 500 -> x3,
    more than 200 -> x5, otherwise x10. The class 'RF00168' always uses x6,
    whatever its count.

    Args:
        label: Iterable of class labels to process.
        counting: Mapping from class label to its current sample count.

    Returns:
        Dict mapping each label to its scaled count; it is also printed.
    """
    # (exclusive lower bound, multiplier), checked from the largest bound down.
    tiers = ((1000, 1), (700, 2), (500, 3), (200, 5))

    targets = {}
    for family in label:
        current = counting[family]
        if family == 'RF00168':
            factor = 6
        else:
            factor = next((mult for bound, mult in tiers if current > bound), 10)
        targets[family] = current * factor

    print(targets)
    return targets


def print_report(name, model, x_test, y_test, class_names=None):
    """Print the macro F1 score and the per-class classification report of a model.

    Args:
        name: Identifier of the model; used only in the printed headings.
        model: Fitted estimator exposing ``predict``.
        x_test: Feature matrix handed to ``model.predict``.
        y_test: True class labels for the rows of ``x_test``.
        class_names: Class labels to report, in display order. When omitted,
            the module-level ``label`` list is used, as the function did
            before this parameter existed.

    Returns:
        None. Everything is written to stdout.
    """
    if class_names is None:
        # Backward-compatible fallback on the notebook-level `label` list.
        class_names = label
    class_names = list(class_names)

    model_pred = model.predict(x_test)

    # f1 score
    print("f1 score of {0}: {1:.3f}".format(name, f1_score(y_test,model_pred, average = 'macro')))

    # Model evaluation report.
    # `labels` is passed together with `target_names`: without it sklearn
    # matches the names by position to the *sorted* classes found in the data,
    # which mislabels rows whenever `class_names` is not in sorted order.
    report_text = classification_report(y_test, model_pred, labels=class_names,
                                        target_names=class_names, digits = 3)
    print("Classification report of {0}: \n{1}".format(name, report_text))

    
def load(model_name):
    """Load the persisted model stored at ``./Model/<model_name>_<title>.pkl``.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code - only load model files from a trusted source.
    """
    model_path = './Model/%s_%s.pkl' % (model_name, title)
    return joblib.load(model_path)


def export_con_matrix(model_name, model, x_test, y_test):
    """Compute a model's confusion matrix on the test set and save it as CSV.

    The file is written to
    ``./Confusion_matrix/<title>/Confusion_matrix_<model_name>_<title>.csv``;
    the directory is created first if needed.

    Args:
        model_name: Identifier used in the output file name.
        model: Fitted estimator exposing ``predict``.
        x_test: Feature matrix handed to ``model.predict``.
        y_test: True class labels for the rows of ``x_test``.

    Returns:
        The confusion matrix as a DataFrame. Rows and columns carry default
        integer positions, not class names.
    """
    predictions = model.predict(x_test)
    matrix_frame = pd.DataFrame(confusion_matrix(y_test, predictions))

    output_dir = './Confusion_matrix/%s' % title
    mkdir(output_dir)

    output_csv = './Confusion_matrix/%s/Confusion_matrix_%s_%s.csv' % (title, model_name, title)
    matrix_frame.to_csv(output_csv)
    return matrix_frame


In [6]:
if __name__ == '__main__':
    # Input matrix: CSV whose first column holds the class label of each row.
    countpath = './156_new_selected_156.csv'

    # 30% hold-out split with random_state=3.
    # NOTE(review): presumably this must match the split used when the pickled
    # models were trained, otherwise test rows were seen in training - confirm.
    x_train, y_train, x_test, y_test, featurenames, label, dataframe = preprocess(countpath, 0.3, 3)


    mkdir('./Prediction_output_%s/Classification_report' % title)
    count = counting(label, x_train)
    # Bound to `smote_targets` rather than `dict`, so the builtin `dict` type
    # is not shadowed for the rest of the kernel session.
    smote_targets = smote_dict(label, count)
   

Proportion of family RF00050: 7.48%
Proportion of family RF00059: 22.34%
Proportion of family RF00162: 7.67%
Proportion of family RF00167: 5.02%
Proportion of family RF00168: 2.11%
Proportion of family RF00174: 26.48%
Proportion of family RF00234: 1.70%
Proportion of family RF00380: 1.34%
Proportion of family RF00504: 13.13%
Proportion of family RF00521: 1.03%
Proportion of family RF00522: 0.72%
Proportion of family RF00634: 1.33%
Proportion of family RF01051: 6.30%
Proportion of family RF01054: 0.24%
Proportion of family RF01055: 1.87%
Proportion of family RF01057: 1.22%
{'RF00050': 827, 'RF00059': 2551, 'RF00162': 852, 'RF00167': 587, 'RF00168': 245, 'RF00174': 2993, 'RF00234': 181, 'RF00380': 152, 'RF00504': 1491, 'RF00521': 133, 'RF00522': 81, 'RF00634': 155, 'RF01051': 701, 'RF01054': 21, 'RF01055': 210, 'RF01057': 148}
{'RF00050': 1654, 'RF00059': 2551, 'RF00162': 1704, 'RF00167': 1761, 'RF00168': 1470, 'RF00174': 2993, 'RF00234': 1810, 'RF00380': 1520, 'RF00504': 1491, 'RF00521'

In [7]:
# Evaluate every saved model on the held-out split and export its confusion matrix.
for model_name in models_names:
    model = load(model_name)
    print(model_name)
    print('----------------------------------------------------------------')
    print_report(model_name, model, x_test, y_test)
    # Pass the model and the test features as two separate arguments; the
    # previous call used the undefined name `model_x_test` and raised NameError.
    export_con_matrix(model_name, model, x_test, y_test)



knn
----------------------------------------------------------------
f1 score of knn: 0.856
Classification report of knn: 
              precision    recall  f1-score   support

     RF00050      0.849     0.953     0.898       384
     RF00059      0.986     0.944     0.964      1065
     RF00162      0.882     0.943     0.912       389
     RF00167      0.829     0.881     0.854       226
     RF00168      0.484     0.802     0.604        96
     RF00174      0.992     0.814     0.894      1293
     RF00234      0.618     0.862     0.720        94
     RF00380      0.596     0.954     0.734        65
     RF00504      0.987     0.937     0.961       634
     RF00521      0.919     1.000     0.958        34
     RF00522      0.972     1.000     0.986        35
     RF00634      0.759     0.984     0.857        61
     RF01051      0.957     0.987     0.972       319
     RF01054      0.720     1.000     0.837        18
     RF01055      0.647     0.828     0.726        93
     RF01057



knn_im
----------------------------------------------------------------
f1 score of knn_im: 0.848
Classification report of knn_im: 
              precision    recall  f1-score   support

     RF00050      0.903     0.943     0.922       384
     RF00059      0.933     0.983     0.957      1065
     RF00162      0.888     0.941     0.914       389
     RF00167      0.829     0.858     0.843       226
     RF00168      0.775     0.573     0.659        96
     RF00174      0.938     0.940     0.939      1293
     RF00234      0.859     0.585     0.696        94
     RF00380      0.806     0.831     0.818        65
     RF00504      0.962     0.948     0.955       634
     RF00521      0.941     0.941     0.941        34
     RF00522      0.946     1.000     0.972        35
     RF00634      0.902     0.902     0.902        61
     RF01051      0.953     0.947     0.950       319
     RF01054      1.000     0.333     0.500        18
     RF01055      0.919     0.613     0.735        93
   



svm
----------------------------------------------------------------
f1 score of svm: 0.968
Classification report of svm: 
              precision    recall  f1-score   support

     RF00050      0.997     0.961     0.979       384
     RF00059      0.992     0.993     0.993      1065
     RF00162      0.995     0.985     0.990       389
     RF00167      0.925     0.925     0.925       226
     RF00168      0.817     0.792     0.804        96
     RF00174      0.971     0.991     0.981      1293
     RF00234      0.976     0.872     0.921        94
     RF00380      0.970     0.985     0.977        65
     RF00504      0.994     0.995     0.994       634
     RF00521      1.000     1.000     1.000        34
     RF00522      0.972     1.000     0.986        35
     RF00634      1.000     0.984     0.992        61
     RF01051      0.981     0.991     0.986       319
     RF01054      1.000     1.000     1.000        18
     RF01055      0.978     0.946     0.962        93
     RF01057



svm_im
----------------------------------------------------------------
f1 score of svm_im: 0.955
Classification report of svm_im: 
              precision    recall  f1-score   support

     RF00050      0.995     0.958     0.976       384
     RF00059      0.991     0.993     0.992      1065
     RF00162      0.992     0.982     0.987       389
     RF00167      0.932     0.907     0.919       226
     RF00168      0.794     0.802     0.798        96
     RF00174      0.962     0.992     0.977      1293
     RF00234      0.988     0.840     0.908        94
     RF00380      0.968     0.923     0.945        65
     RF00504      0.995     0.995     0.995       634
     RF00521      1.000     0.971     0.985        34
     RF00522      0.972     1.000     0.986        35
     RF00634      0.967     0.967     0.967        61
     RF01051      0.975     0.991     0.983       319
     RF01054      1.000     0.889     0.941        18
     RF01055      0.977     0.925     0.950        93
   



random
----------------------------------------------------------------
f1 score of random: 0.974
Classification report of random: 
              precision    recall  f1-score   support

     RF00050      0.997     0.977     0.987       384
     RF00059      0.993     0.998     0.996      1065
     RF00162      0.982     0.985     0.983       389
     RF00167      0.935     0.889     0.912       226
     RF00168      0.761     0.865     0.810        96
     RF00174      0.990     0.994     0.992      1293
     RF00234      1.000     1.000     1.000        94
     RF00380      0.985     0.985     0.985        65
     RF00504      0.995     0.994     0.994       634
     RF00521      1.000     1.000     1.000        34
     RF00522      0.972     1.000     0.986        35
     RF00634      0.984     0.984     0.984        61
     RF01051      0.991     0.997     0.994       319
     RF01054      1.000     1.000     1.000        18
     RF01055      1.000     0.925     0.961        93
   



random_im
----------------------------------------------------------------
f1 score of random_im: 0.945
Classification report of random_im: 
              precision    recall  f1-score   support

     RF00050      0.989     0.979     0.984       384
     RF00059      0.991     0.998     0.994      1065
     RF00162      0.970     0.982     0.976       389
     RF00167      0.907     0.912     0.909       226
     RF00168      0.796     0.771     0.783        96
     RF00174      0.981     0.995     0.988      1293
     RF00234      1.000     0.957     0.978        94
     RF00380      0.982     0.862     0.918        65
     RF00504      0.989     0.995     0.992       634
     RF00521      1.000     1.000     1.000        34
     RF00522      1.000     1.000     1.000        35
     RF00634      1.000     0.951     0.975        61
     RF01051      0.981     0.997     0.989       319
     RF01054      1.000     0.556     0.714        18
     RF01055      0.988     0.871     0.926     



gb
----------------------------------------------------------------
f1 score of gb: 0.904
Classification report of gb: 
              precision    recall  f1-score   support

     RF00050      0.986     0.932     0.959       384
     RF00059      0.987     0.993     0.990      1065
     RF00162      0.867     0.956     0.910       389
     RF00167      0.881     0.885     0.883       226
     RF00168      0.739     0.708     0.723        96
     RF00174      0.967     0.987     0.977      1293
     RF00234      0.979     0.979     0.979        94
     RF00380      0.869     0.815     0.841        65
     RF00504      0.990     0.983     0.987       634
     RF00521      0.861     0.912     0.886        34
     RF00522      1.000     0.943     0.971        35
     RF00634      0.897     0.852     0.874        61
     RF01051      0.972     0.972     0.972       319
     RF01054      1.000     0.556     0.714        18
     RF01055      1.000     0.731     0.845        93
     RF01057   



gb_im
----------------------------------------------------------------
f1 score of gb_im: 0.892
Classification report of gb_im: 
              precision    recall  f1-score   support

     RF00050      0.992     0.930     0.960       384
     RF00059      0.982     0.997     0.990      1065
     RF00162      0.984     0.949     0.966       389
     RF00167      0.895     0.872     0.883       226
     RF00168      0.784     0.604     0.682        96
     RF00174      0.960     0.994     0.976      1293
     RF00234      0.989     0.968     0.978        94
     RF00380      0.902     0.569     0.698        65
     RF00504      0.880     0.994     0.933       634
     RF00521      0.966     0.824     0.889        34
     RF00522      0.967     0.829     0.892        35
     RF00634      0.980     0.803     0.883        61
     RF01051      0.968     0.944     0.956       319
     RF01054      0.727     0.889     0.800        18
     RF01055      0.971     0.731     0.834        93
     R



mlp
----------------------------------------------------------------
f1 score of mlp: 0.970
Classification report of mlp: 
              precision    recall  f1-score   support

     RF00050      0.995     0.974     0.984       384
     RF00059      0.997     1.000     0.999      1065
     RF00162      0.984     0.977     0.981       389
     RF00167      0.921     0.929     0.925       226
     RF00168      0.842     0.833     0.838        96
     RF00174      0.993     0.995     0.994      1293
     RF00234      1.000     0.979     0.989        94
     RF00380      0.941     0.985     0.962        65
     RF00504      0.992     0.997     0.994       634
     RF00521      0.971     1.000     0.986        34
     RF00522      1.000     1.000     1.000        35
     RF00634      0.984     1.000     0.992        61
     RF01051      0.994     0.994     0.994       319
     RF01054      1.000     0.889     0.941        18
     RF01055      0.957     0.946     0.951        93
     RF01057



f1 score of nb: 0.771
Classification report of nb: 
              precision    recall  f1-score   support

     RF00050      0.980     0.896     0.936       384
     RF00059      0.993     0.958     0.975      1065
     RF00162      0.968     0.856     0.909       389
     RF00167      0.893     0.668     0.765       226
     RF00168      0.478     0.896     0.623        96
     RF00174      0.926     0.924     0.925      1293
     RF00234      0.851     0.915     0.882        94
     RF00380      0.563     0.892     0.690        65
     RF00504      0.988     0.909     0.947       634
     RF00521      0.313     0.912     0.466        34
     RF00522      0.742     0.657     0.697        35
     RF00634      0.495     0.885     0.635        61
     RF01051      0.989     0.834     0.905       319
     RF01054      0.455     0.556     0.500        18
     RF01055      0.760     0.817     0.788        93
     RF01057      0.571     0.880     0.693        50

    accuracy                



f1 score of nb_im: 0.705
Classification report of nb_im: 
              precision    recall  f1-score   support

     RF00050      0.985     0.870     0.924       384
     RF00059      1.000     0.935     0.967      1065
     RF00162      0.976     0.743     0.844       389
     RF00167      0.943     0.664     0.779       226
     RF00168      0.455     0.833     0.588        96
     RF00174      0.962     0.866     0.912      1293
     RF00234      0.712     0.894     0.792        94
     RF00380      0.453     0.969     0.618        65
     RF00504      0.996     0.825     0.903       634
     RF00521      0.136     0.912     0.237        34
     RF00522      0.719     0.657     0.687        35
     RF00634      0.244     0.885     0.383        61
     RF01051      0.987     0.690     0.812       319
     RF01054      0.455     0.556     0.500        18
     RF01055      0.782     0.849     0.814        93
     RF01057      0.376     0.880     0.527        50

    accuracy          