In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool, cv
random_state = 7
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [2]:
def model_validation(tr,ts, iterations, learning_rate,depth, class_weight, n_splits = 10):
   
    # get current directory
    path = os.getcwd()
    parent = os.path.dirname(path)

    train = pd.read_csv(parent + '\Data\\' + tr + ".csv")
    test = pd.read_csv(parent + '\Data\\' + ts + ".csv")
    
    X_train = train.drop("target", axis=1)
    y_train = train["target"]
    
    X_test  = test.drop("target", axis=1)
    y_test = test["target"]
    
        # initialize Pool
    train_pool = Pool(X_train, 
                      y_train)
        # initialize Pool
    test_pool = Pool(X_test, 
                      y_test)
    
    scores = pd.DataFrame(columns=["Iterations", "Learning Rate", "Depth", "F1", "ROC", "Sensitivity", "Specificty", "ACC" ,  "CV F1", "CV ROC", "ROC Sens", "ROC Spec", "CV ACC"])
    
#     from sklearn.linear_model import LogisticRegression
#     clf = LogisticRegression(random_state=0, penalty="l1", solver='liblinear')
#     clf.fit(X_train, y_train)
#     pred = clf.predict(X_test)
#     f1 = (f1_score(pred, y_test))

#     cm = confusion_matrix(y_test, pred)
#     total=sum(sum(cm))
#     roca = round(roc_auc_score(y_test, pred),4)
#     sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
#     spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)      
    
#     print(f1, roca, sens, spec)

    for iteration in iterations:
        for rate in learning_rate:
            for deep in depth:
                    start = time.time()
                    model = CatBoostClassifier(iterations=iteration,
                           depth=deep,
                           learning_rate=rate,
                           class_weights=class_weight,
                           loss_function='Logloss',
                           random_state = random_state,
                           verbose=False)
                    
                    model.fit(train_pool)
                    pred = model.predict(X_test)
                    f1 = (f1_score(pred, y_test))
                    
                    cm = confusion_matrix(y_test, pred)
                    total=sum(sum(cm))
                    roca = round(roc_auc_score(y_test, pred),4)
                    sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                    spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)                    
                    acc = round((cm[1,1] + cm[0,0])/(cm[1,1] + cm[0,0] + cm[0,1] + cm[0,1]))
                    
                    
                    scores_cv = []
                    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                    skf.get_n_splits(X_train, y_train)
                    for train_index, test_index in skf.split(X_train,y_train):
                        X_cv_train, X_cv_test = X_train.iloc[train_index], X_train.iloc[test_index]
                        y_cv_train, y_cv_test = y_train.iloc[train_index], y_train.iloc[test_index]

                        model_cv = CatBoostClassifier(iterations=iteration,
                                               depth=deep,
                                               learning_rate=rate,
                                               class_weights=class_weight,
                                               loss_function='Logloss',
                                               random_state = random_state,
                                               verbose=False)

                        # initialize Pool
                        train_pool = Pool(X_cv_train, 
                                          y_cv_train)
                            # initialize Pool
                        test_pool = Pool(X_cv_test, 
                                          y_cv_test)

                        model_cv.fit(train_pool)
                        pred = model_cv.predict(X_cv_test)
                        f1_cv = (f1_score(pred, y_cv_test))

                        cm = confusion_matrix(y_cv_test, pred)
                        total=sum(sum(cm))
                        roca_cv = round(roc_auc_score(y_cv_test, pred),4)
                        sens_cv = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                        spec_cv = round(cm[0,0]/(cm[1,0]+cm[0,0]),4) 
                        acc_cv = round((cm[1,1] + cm[0,0])/(cm[1,1] + cm[0,0] + cm[0,1] + cm[0,1]))
                        scores_cv.append([f1_cv, roca_cv, sens_cv, spec_cv, acc_cv])

                      
                    
                    mean = (np.mean(np.array(scores_cv), axis=0))
                    df_new_row = pd.DataFrame(data=([[iteration, rate, deep, f1, roca, sens, spec, acc,mean[0], mean[1], mean[2], mean[3], mean[4]]]), 
                                              columns=["Iterations", "Learning Rate", "Depth",  "F1", "ROC", "Sensitivity", "Specificty", "ACC", 
                                                      "CV F1", "CV ROC", "ROC Sens", "ROC Spec", "CV ACC"])
                    scores = pd.concat([scores,df_new_row], ignore_index=True)
                    end = time.time()
                    print(end - start)
                    
            
    return scores

In [15]:
# tr = "trainset_ALLvsHealthy_80_target_in"
# ts = "testset_ALLvsHealthy_20_target_in"

# tr = "trainset_AMLvsALL_90_target_in"
# ts = "testset_AMLvsALL_10_target_in"

# tr = "trainset_ALL_vs_Healthy_200"
# ts = "testset_ALL_vs_Healthy_200"

tr = "trainset_AML_vs_ALL_125"
ts = "testset_AML_vs_ALL_125"

# start = time.time()

#  iterations, learning_rate,depth, weights, n_splits = 10
results = model_validation(tr,ts, [100, 200, 300], [0.1,0.2, 0.3, 0.4],[3,4,5], [1 , 1])


3.5659663677215576
4.5739569664001465
5.631942987442017
3.4029667377471924
4.496455669403076
5.699943542480469
3.405966281890869
4.151958703994751
5.622948408126831
3.826957941055298
4.498457670211792
5.721946954727173
7.016429424285889
8.788413286209106
13.25336742401123
7.172929048538208
9.781903266906738
11.529885053634644
7.911422491073608
8.54841685295105
11.265884399414062
6.8929362297058105
8.413912773132324
11.431883573532104
10.793893337249756
13.810361385345459
16.929332494735718
10.342897653579712
12.888370275497437
19.095313787460327
10.761391878128052
14.849856853485107
16.671834707260132
10.195399522781372
12.84186863899231
17.073829650878906


In [16]:
index = 0
top = 0
i = 0
for row in results.loc[:]["CV F1"]:
    if row > top:
        top = row
        index = i
    i = i + 1
print(results.loc[index])
for item in results.loc[index]:
    print(item)

Iterations            300
Learning Rate         0.1
Depth                   3
F1               0.995181
ROC                0.9947
Sensitivity        0.9952
Specificty         0.9942
ACC                     1
CV F1             0.99316
CV ROC            0.99248
ROC Sens          0.99305
ROC Spec          0.99203
CV ACC                  1
Name: 24, dtype: object
300
0.1
3
0.9951807228915662
0.9947
0.9952
0.9942
1.0
0.9931600426163966
0.9924800000000001
0.99305
0.99203
1.0
