In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool, cv
random_state = 7
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [2]:
def model_validation(tr,ts, iterations, learning_rate,depth, class_weight, n_splits = 10):
   
    # get current directory
    path = os.getcwd()
    parent = os.path.dirname(path)

    train = pd.read_csv(parent + '\Data\\' + tr + ".csv")
    test = pd.read_csv(parent + '\Data\\' + ts + ".csv")
    
    X_train = train.drop("target", axis=1)
    y_train = train["target"]
    
    X_test  = test.drop("target", axis=1)
    y_test = test["target"]
    
        # initialize Pool
    train_pool = Pool(X_train, 
                      y_train)
        # initialize Pool
    test_pool = Pool(X_test, 
                      y_test)
    
    scores = pd.DataFrame(columns=["Iterations", "Learning Rate", "Depth", "F1", "ROC", "Sensitivity", "Specificty", "ACC" ,  "CV F1", "CV ROC", "ROC Sens", "ROC Spec", "CV ACC"])

    for iteration in iterations:
        for rate in learning_rate:
            for deep in depth:
                    start = time.time()
                    model = CatBoostClassifier(iterations=iteration,
                           depth=deep,
                           learning_rate=rate,
                           class_weights=class_weight,
                           loss_function='Logloss',
                           random_state = random_state,
                           verbose=False)
                    
                    model.fit(train_pool)
                    pred = model.predict(X_test)
                    f1 = (f1_score(pred, y_test))
                    
                    cm = confusion_matrix(y_test, pred)
                    total=sum(sum(cm))
                    roca = round(roc_auc_score(y_test, pred),4)
                    sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                    spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)                    
                    acc = round((cm[1,1] + cm[0,0])/(cm[1,1] + cm[0,0] + cm[0,1] + cm[0,1]))
                    
                    
                    scores_cv = []
                    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                    skf.get_n_splits(X_train, y_train)
                    for train_index, test_index in skf.split(X_train,y_train):
                        X_cv_train, X_cv_test = X_train.iloc[train_index], X_train.iloc[test_index]
                        y_cv_train, y_cv_test = y_train.iloc[train_index], y_train.iloc[test_index]

                        model_cv = CatBoostClassifier(iterations=iteration,
                                               depth=deep,
                                               learning_rate=rate,
                                               class_weights=class_weight,
                                               loss_function='Logloss',
                                               random_state = random_state,
                                               verbose=False)

                        # initialize Pool
                        train_pool = Pool(X_cv_train, 
                                          y_cv_train)
                            # initialize Pool
                        test_pool = Pool(X_cv_test, 
                                          y_cv_test)

                        model_cv.fit(train_pool)
                        pred = model_cv.predict(X_cv_test)
                        f1_cv = (f1_score(pred, y_cv_test))

                        cm = confusion_matrix(y_cv_test, pred)
                        total=sum(sum(cm))
                        roca_cv = round(roc_auc_score(y_cv_test, pred),4)
                        sens_cv = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                        spec_cv = round(cm[0,0]/(cm[1,0]+cm[0,0]),4) 
                        acc_cv = round((cm[1,1] + cm[0,0])/(cm[1,1] + cm[0,0] + cm[0,1] + cm[0,1]))
                        scores_cv.append([f1_cv, roca_cv, sens_cv, spec_cv, acc_cv])

                      
                    
                    mean = (np.mean(np.array(scores_cv), axis=0))
                    df_new_row = pd.DataFrame(data=([[iteration, rate, deep, f1, roca, sens, spec, acc,mean[0], mean[1], mean[2], mean[3], mean[4]]]), 
                                              columns=["Iterations", "Learning Rate", "Depth",  "F1", "ROC", "Sensitivity", "Specificty", "ACC", 
                                                      "CV F1", "CV ROC", "ROC Sens", "ROC Spec", "CV ACC"])
                    scores = pd.concat([scores,df_new_row], ignore_index=True)
                    end = time.time()
                    print(end - start)
                    
            
    return scores

In [5]:
# tr = "trainset_ALLvsHealthy_80_target_in"
# ts = "testset_ALLvsHealthy_20_target_in"

# tr = "trainset_AMLvsALL_90_target_in"
# ts = "testset_AMLvsALL_10_target_in"

tr = "trainset_ALL_vs_Healthy_150_not_in_literature"
ts = "testset_ALL_vs_Healthy_150_not_in_literature"



# start = time.time()

#  iterations, learning_rate,depth, weights, n_splits = 10
results = model_validation(tr,ts, [100, 200, 300], [0.1,0.2, 0.3, 0.4],[3,4,5], [1 , 0.22])


2.205998182296753
2.7694997787475586
3.5790023803710938
2.310497760772705
2.7274999618530273
3.355999708175659
2.2750003337860107
2.698502540588379
3.3924989700317383
2.1595022678375244
2.629000186920166
3.342499256134033
4.224504232406616
4.862499475479126
6.426497459411621
4.232003688812256
5.8904993534088135
6.657501220703125
4.242999792098999
4.716000556945801
6.322502374649048
3.6630032062530518
4.628504276275635
5.877999782562256
5.589002847671509
7.288501977920532
10.64449691772461
6.149003505706787
7.4309961795806885
9.079002141952515
5.294497728347778
6.649501800537109
9.086000680923462
5.409998655319214
6.770502090454102
8.800496578216553


In [6]:
index = 0
top = 0
i = 0
for row in results.loc[:]["CV F1"]:
    if row > top:
        top = row
        index = i
    i = i + 1
print(results.loc[index])
for item in results.loc[index]:
    print(item)

Iterations            200
Learning Rate         0.1
Depth                   4
F1               0.998559
ROC                0.9965
Sensitivity        0.9986
Specificty         0.9945
ACC                     1
CV F1            0.997472
CV ROC            0.99545
ROC Sens          0.99856
ROC Spec          0.98645
CV ACC                  1
Name: 13, dtype: object
200
0.1
4
0.9985590778097982
0.9965
0.9986
0.9945
1.0
0.9974722584008529
0.99545
0.99856
0.9864499999999999
1.0
