In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool, cv
random_state = 7
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [2]:
def model_validation(tr,ts, iterations, learning_rate,depth, n_splits = 10):
   
    # get current directory
    path = os.getcwd()
    parent = os.path.dirname(path)

    train = pd.read_csv(parent + '\Data\\' + tr + ".csv")
    test = pd.read_csv(parent + '\Data\\' + ts + ".csv")
    
    X_train = train.drop("target", axis=1)
    y_train = train["target"]
    
    X_test  = test.drop("target", axis=1)
    y_test = test["target"]
    
        # initialize Pool
    train_pool = Pool(X_train, 
                      y_train)
        # initialize Pool
    test_pool = Pool(X_test, 
                      y_test)
    
    scores = pd.DataFrame(columns=["Iterations", "Learning Rate", "Depth", "F1", "ROC", "Sensitivity", "Specificty", "CV F1", "CV ROC", "ROC Sens", "ROC Spec"])
    
#     from sklearn.linear_model import LogisticRegression
#     clf = LogisticRegression(random_state=0, penalty="l1", solver='liblinear')
#     clf.fit(X_train, y_train)
#     pred = clf.predict(X_test)
#     f1 = (f1_score(pred, y_test))

#     cm = confusion_matrix(y_test, pred)
#     total=sum(sum(cm))
#     roca = round(roc_auc_score(y_test, pred),4)
#     sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
#     spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)      
    
#     print(f1, roca, sens, spec)

    for iteration in iterations:
        for rate in learning_rate:
            for deep in depth:
                    start = time.time()
                    model = CatBoostClassifier(iterations=iteration,
                           depth=deep,
                           learning_rate=rate,
                           loss_function='Logloss',
                           verbose=False)
                    
                    model.fit(train_pool)
                    pred = model.predict(X_test)
                    f1 = (f1_score(pred, y_test))
                    
                    cm = confusion_matrix(y_test, pred)
                    total=sum(sum(cm))
                    roca = round(roc_auc_score(y_test, pred),4)
                    sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                    spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)                    
                    
                    
                    
                    scores_cv = []
                    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                    skf.get_n_splits(X_train, y_train)
                    for train_index, test_index in skf.split(X_train,y_train):
                        X_cv_train, X_cv_test = X_train.iloc[train_index], X_train.iloc[test_index]
                        y_cv_train, y_cv_test = y_train.iloc[train_index], y_train.iloc[test_index]

                        model_cv = CatBoostClassifier(iterations=iteration,
                                               depth=deep,
                                               learning_rate=rate,
                                               loss_function='Logloss',
                                               verbose=False)

                        # initialize Pool
                        train_pool = Pool(X_cv_train, 
                                          y_cv_train)
                            # initialize Pool
                        test_pool = Pool(X_cv_test, 
                                          y_cv_test)

                        model_cv.fit(train_pool)
                        pred = model_cv.predict(X_cv_test)
                        f1_cv = (f1_score(pred, y_cv_test))

                        cm = confusion_matrix(y_cv_test, pred)
                        total=sum(sum(cm))
                        roca_cv = round(roc_auc_score(y_cv_test, pred),4)
                        sens_cv = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                        spec_cv = round(cm[0,0]/(cm[1,0]+cm[0,0]),4) 
                        
                        scores_cv.append([f1_cv, roca_cv, sens_cv, spec_cv])

                      
                    
                    mean = (np.mean(np.array(scores_cv), axis=0))
                    df_new_row = pd.DataFrame(data=([[iteration, rate, deep, f1, roca, sens, spec, mean[0], mean[1], mean[2], mean[3]]]), 
                                              columns=["Iterations", "Learning Rate", "Depth",  "F1", "ROC", "Sensitivity", "Specificty",
                                                      "CV F1", "CV ROC", "ROC Sens", "ROC Spec"])
                    scores = pd.concat([scores,df_new_row], ignore_index=True)
                    end = time.time()
                    print(end - start)
                    
            
    return scores

In [8]:
# tr = "TrainSet-82-63_4041_TargetIn"
# ts = "TestSet-18-63_4041_TargetIn"

tr = "trainset_ALL_vs_Healthy_reduced_100_80_target_in"
ts = "testset_ALL_vs_Healthy_reduced_100_20_target_in"

start = time.time()

#  iterations, learning_rate,depth, weights, n_splits = 10
results = model_validation(tr,ts, [200 ,250 ,300], [0.3, 0.5, 0.7],[3, 4, 5, 6, 7])




4.115838050842285
5.327543258666992
8.257932901382446
14.572219371795654
26.394052267074585
4.484755277633667
6.229729652404785
8.893667697906494
17.679968118667603
34.30027961730957
6.078241586685181
7.9074060916900635
12.154560327529907
18.921199560165405
32.09035778045654
7.333389043807983
10.306929111480713
14.468948602676392
23.191667556762695
39.31138038635254
6.83648943901062
9.297272682189941
14.55629849433899
23.556450843811035
39.98536825180054
6.939128875732422
9.841952085494995
14.60714340209961
25.95722985267639
43.85172247886658
8.560542345046997
11.824506282806396
19.134143352508545
29.23634958267212
47.877609729766846
9.789632081985474
13.713744640350342
19.7791485786438
29.01488947868347
49.431463956832886
7.919154644012451
10.835352182388306
16.05118489265442
24.84377121925354
46.76972508430481


In [4]:
results

Unnamed: 0,Iterations,Learning Rate,Depth,F1,ROC,Sensitivity,Specificty,CV F1,CV ROC,ROC Sens,ROC Spec
0,100,0.1,4,0.99784,0.9938,0.9971,0.9944,0.996934,0.99341,0.9975,0.98656
1,100,0.1,5,0.997122,0.991,0.9957,0.9944,0.996756,0.99219,0.99677,0.98772
2,100,0.1,6,0.997118,0.993,0.9971,0.989,0.997291,0.99477,0.9982,0.98658
3,100,0.2,4,0.997122,0.991,0.9957,0.9944,0.996753,0.99271,0.99714,0.98659
4,100,0.2,5,0.997837,0.9958,0.9986,0.989,0.99729,0.99527,0.99856,0.98521
5,100,0.2,6,0.99784,0.9938,0.9971,0.9944,0.996755,0.99219,0.99677,0.98777
6,100,0.3,4,0.99784,0.9938,0.9971,0.9944,0.997295,0.99374,0.99748,0.98913
7,100,0.3,5,0.997122,0.991,0.9957,0.9944,0.996935,0.99239,0.99677,0.98919
8,100,0.3,6,0.998559,0.9965,0.9986,0.9945,0.997473,0.99444,0.99784,0.9893
9,150,0.1,4,0.997122,0.991,0.9957,0.9944,0.996934,0.99341,0.9975,0.98656


In [9]:
index = 0
top = 0
i = 0
for row in results.loc[:]["CV F1"]:
    if row > top:
        top = row
        index = i
    i = i + 1
results.loc[index]

Iterations            200
Learning Rate         0.5
Depth                   7
F1                 0.9964
ROC                0.9903
Sensitivity        0.9957
Specificty         0.9889
CV F1            0.997477
CV ROC            0.99343
ROC Sens          0.99712
ROC Spec          0.99184
Name: 9, dtype: object