In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool, cv
random_state = 7
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [7]:
def model_validation(tr,ts, iterations, learning_rate,depth, weights, n_splits = 10):
   
    # get current directory
    path = os.getcwd()
    parent = os.path.dirname(path)

    train = pd.read_csv(parent + '\Data\\' + tr + ".csv")
    test = pd.read_csv(parent + '\Data\\' + ts + ".csv")
    
    X_train = train.drop("target", axis=1)
    y_train = train["target"]
    
    X_test  = test.drop("target", axis=1)
    y_test = test["target"]
    
        # initialize Pool
    train_pool = Pool(X_train, 
                      y_train)
        # initialize Pool
    test_pool = Pool(X_test, 
                      y_test)
    
    scores = pd.DataFrame(columns=["Iterations", "Learning Rate", "Depth", "Weights", "F1", "ROC", "Sensitivity", "Specificty", "CV F1", "CV ROC", "ROC Sens", "ROC Spec"])
    
#     from sklearn.linear_model import LogisticRegression
#     clf = LogisticRegression(random_state=0, penalty="l1", solver='liblinear')
#     clf.fit(X_train, y_train)
#     pred = clf.predict(X_test)
#     f1 = (f1_score(pred, y_test))

#     cm = confusion_matrix(y_test, pred)
#     total=sum(sum(cm))
#     roca = round(roc_auc_score(y_test, pred),4)
#     sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
#     spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)      
    
#     print(f1, roca, sens, spec)

    for iteration in iterations:
        for rate in learning_rate:
            for deep in depth:
                for weight in weights:
                    start = time.time()
                    model = CatBoostClassifier(iterations=iteration,
                           depth=deep,
                           learning_rate=rate,
                           class_weights=[weight, 1-weight],
                           loss_function='Logloss',
                           verbose=False)
                    
                    model.fit(train_pool)
                    pred = model.predict(X_test)
                    f1 = (f1_score(pred, y_test))
                    
                    cm = confusion_matrix(y_test, pred)
                    total=sum(sum(cm))
                    roca = round(roc_auc_score(y_test, pred),4)
                    sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                    spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)                    
                    
                    
                    
                    scores_cv = []
                    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                    skf.get_n_splits(X_train, y_train)
                    for train_index, test_index in skf.split(X_train,y_train):
                        X_cv_train, X_cv_test = X_train.iloc[train_index], X_train.iloc[test_index]
                        y_cv_train, y_cv_test = y_train.iloc[train_index], y_train.iloc[test_index]

                        model_cv = CatBoostClassifier(iterations=iteration,
                                               depth=deep,
                                               learning_rate=rate,
                                               class_weights=[weight, 1-weight],
                                               loss_function='Logloss',
                                               verbose=False)

                        # initialize Pool
                        train_pool = Pool(X_cv_train, 
                                          y_cv_train)
                            # initialize Pool
                        test_pool = Pool(X_cv_test, 
                                          y_cv_test)

                        model_cv.fit(train_pool)
                        pred = model_cv.predict(X_cv_test)
                        f1_cv = (f1_score(pred, y_cv_test))

                        cm = confusion_matrix(y_cv_test, pred)
                        total=sum(sum(cm))
                        roca_cv = round(roc_auc_score(y_cv_test, pred),4)
                        sens_cv = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                        spec_cv = round(cm[0,0]/(cm[1,0]+cm[0,0]),4) 
                        
                        scores_cv.append([f1_cv, roca_cv, sens_cv, spec_cv])

                      
                    
                    mean = (np.mean(np.array(scores_cv), axis=0))
                    df_new_row = pd.DataFrame(data=([[iteration, rate, deep, [weight, 1 - weight], f1, roca, sens, spec, mean[0], mean[1], mean[2], mean[3]]]), 
                                              columns=["Iterations", "Learning Rate", "Depth", "Weights", "F1", "ROC", "Sensitivity", "Specificty",
                                                      "CV F1", "CV ROC", "ROC Sens", "ROC Spec"])
                    scores = pd.concat([scores,df_new_row], ignore_index=True)
                    end = time.time()
                    print(end - start)
                    
            
    return scores

In [8]:
# tr = "TrainSet-82-63_4041_TargetIn"
# ts = "TestSet-18-63_4041_TargetIn"

# tr = "trainset_AMLvsALL_90_target_in"
# ts = "testset_AMLvsALL_10_target_in"

tr = "trainset_AML_vs_ALL_reduced_90_target_in"
ts = "testset_AML_vs_ALL_reduced_90_target_in"

start = time.time()

#  iterations, learning_rate,depth, weights, n_splits = 10
model_validation(tr,ts, [100 ,150 ,200], [0.3, 0.4, 0.5],[4, 5, 6],[0.5])




12.406059503555298
20.23355007171631
23.611684560775757
9.11481761932373
9.195554494857788
13.214655637741089
9.38290548324585
12.33101749420166
22.14576244354248
16.560292720794678
20.52517533302307
28.548643350601196
11.546115159988403
16.279457807540894
25.11981225013733
11.905156373977661
20.167061805725098
24.606186151504517
15.346948385238647
22.601549863815308
33.21216630935669
15.979260206222534
21.41113042831421
28.23548150062561
13.74423623085022
18.627177476882935
27.804630756378174


Unnamed: 0,Iterations,Learning Rate,Depth,Weights,F1,ROC,Sensitivity,Specificty,CV F1,CV ROC,ROC Sens,ROC Spec
0,100,0.3,4,"[0.5, 0.5]",0.995181,0.9947,0.9952,0.9942,0.992221,0.99144,0.99198,0.99108
1,100,0.3,5,"[0.5, 0.5]",0.99759,0.9974,0.9976,0.9971,0.991423,0.99049,0.99037,0.99105
2,100,0.3,6,"[0.5, 0.5]",0.99759,0.9974,0.9976,0.9971,0.991543,0.99074,0.99171,0.98982
3,100,0.4,4,"[0.5, 0.5]",0.995169,0.9949,0.9976,0.9914,0.991548,0.99077,0.99198,0.98949
4,100,0.4,5,"[0.5, 0.5]",0.993969,0.9935,0.9952,0.9914,0.991149,0.99023,0.99063,0.99012
5,100,0.4,6,"[0.5, 0.5]",0.995169,0.9949,0.9976,0.9914,0.990617,0.98964,0.98986,0.98979
6,100,0.5,4,"[0.5, 0.5]",0.995181,0.9947,0.9952,0.9942,0.990071,0.98913,0.99037,0.98789
7,100,0.5,5,"[0.5, 0.5]",0.99759,0.9974,0.9976,0.9971,0.991017,0.99008,0.99038,0.9901
8,100,0.5,6,"[0.5, 0.5]",0.995181,0.9947,0.9952,0.9942,0.99235,0.99163,0.99251,0.99077
9,150,0.3,4,"[0.5, 0.5]",0.996381,0.9961,0.9976,0.9943,0.992487,0.99176,0.99251,0.99109
