In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool, cv
random_state = 7
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [6]:
def model_validation(tr,ts, iterations, learning_rate,depth, weights, n_splits = 10):
   
    # get current directory
    path = os.getcwd()
    parent = os.path.dirname(path)

    train = pd.read_csv(parent + '\Data\\' + tr + ".csv")
    test = pd.read_csv(parent + '\Data\\' + ts + ".csv")
    
    X_train = train.drop("target", axis=1)
    y_train = train["target"]
    
    X_test  = test.drop("target", axis=1)
    y_test = test["target"]
    
        # initialize Pool
    train_pool = Pool(X_train, 
                      y_train)
        # initialize Pool
    test_pool = Pool(X_test, 
                      y_test)
    
    scores = pd.DataFrame(columns=["Iterations", "Learning Rate", "Depth", "Weights", "F1", "ROC", "Sensitivity", "Specificty", "CV F1", "CV ROC", "ROC Sens", "ROC Spec"])
    
#     from sklearn.linear_model import LogisticRegression
#     clf = LogisticRegression(random_state=0, penalty="l1", solver='liblinear')
#     clf.fit(X_train, y_train)
#     pred = clf.predict(X_test)
#     f1 = (f1_score(pred, y_test))

#     cm = confusion_matrix(y_test, pred)
#     total=sum(sum(cm))
#     roca = round(roc_auc_score(y_test, pred),4)
#     sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
#     spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)      
    
#     print(f1, roca, sens, spec)

    for iteration in iterations:
        for rate in learning_rate:
            for deep in depth:
                for weight in weights:
                    start = time.time()
                    model = CatBoostClassifier(iterations=iteration,
                           depth=deep,
                           learning_rate=rate,
                           class_weights=[weight, 1-weight],
                           loss_function='Logloss',
                           verbose=False)
                    
                    model.fit(train_pool)
                    pred = model.predict(X_test)
                    f1 = (f1_score(pred, y_test))
                    
                    cm = confusion_matrix(y_test, pred)
                    total=sum(sum(cm))
                    roca = round(roc_auc_score(y_test, pred),4)
                    sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                    spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)                    
                    
                    
                    
                    scores_cv = []
                    skf = StratifiedKFold(n_splits=n_splits, shuffle=False, random_state=random_state)
                    skf.get_n_splits(X_train, y_train)
                    for train_index, test_index in skf.split(X_train,y_train):
                        X_cv_train, X_cv_test = X_train.iloc[train_index], X_train.iloc[test_index]
                        y_cv_train, y_cv_test = y_train.iloc[train_index], y_train.iloc[test_index]

                        model_cv = CatBoostClassifier(iterations=iteration,
                                               depth=deep,
                                               learning_rate=rate,
                                               class_weights=[weight, 1-weight],
                                               loss_function='Logloss',
                                               verbose=False)

                        # initialize Pool
                        train_pool = Pool(X_cv_train, 
                                          y_cv_train)
                            # initialize Pool
                        test_pool = Pool(X_cv_test, 
                                          y_cv_test)

                        model_cv.fit(train_pool)
                        pred = model_cv.predict(X_cv_test)
                        f1_cv = (f1_score(pred, y_cv_test))

                        cm = confusion_matrix(y_cv_test, pred)
                        total=sum(sum(cm))
                        roca_cv = round(roc_auc_score(y_cv_test, pred),4)
                        sens_cv = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                        spec_cv = round(cm[0,0]/(cm[1,0]+cm[0,0]),4) 
                        
                        scores_cv.append([f1_cv, roca_cv, sens_cv, spec_cv])

                      
                    
                    mean = (np.mean(np.array(scores_cv), axis=0))
                    df_new_row = pd.DataFrame(data=([[iteration, rate, deep, [weight, 1 - weight], f1, roca, sens, spec, mean[0], mean[1], mean[2], mean[3]]]), 
                                              columns=["Iterations", "Learning Rate", "Depth", "Weights", "F1", "ROC", "Sensitivity", "Specificty",
                                                      "CV F1", "CV ROC", "ROC Sens", "ROC Spec"])
                    scores = pd.concat([scores,df_new_row], ignore_index=True)
                    end = time.time()
                    print(end - start)
                    
            
    return scores

In [9]:
# tr = "TrainSet-82-63_4041_TargetIn"
# ts = "TestSet-18-63_4041_TargetIn"

tr = "trainset_AMLvsALL_90_target_in"
ts = "testset_AMLvsALL_10_target_in"

start = time.time()

#  iterations, learning_rate,depth, weights, n_splits = 10
model_validation(tr,ts, [100 ,150 ,200], [0.3, 0.4, 0.5],[4, 5, 6],[0.5])




289.45698976516724
438.64589953422546
755.9163100719452
279.07991433143616
438.06736278533936
749.8079817295074
280.2817506790161
434.0712959766388
749.7330024242401
398.50178575515747
635.3937730789185
1105.9776275157928
399.12737011909485
633.9917314052582
1103.8743615150452
396.80253171920776
635.1834197044373
1106.257149219513
519.5830745697021
836.6107802391052
1459.5951764583588
521.0824549198151
832.0578467845917
1456.1621718406677
516.8490765094757
834.6027143001556
1459.5946943759918


Unnamed: 0,Iterations,Learning Rate,Depth,Weights,F1,ROC,Sensitivity,Specificty,CV F1,CV ROC,ROC Sens,ROC Spec
0,100,0.3,4,"[0.5, 0.5]",0.993969,0.9935,0.9952,0.9914,0.990246,0.98901,0.98776,0.9913
1,100,0.3,5,"[0.5, 0.5]",0.991576,0.9906,0.9904,0.9913,0.989979,0.9887,0.9875,0.99098
2,100,0.3,6,"[0.5, 0.5]",0.992754,0.9923,0.9952,0.9885,0.989311,0.98794,0.98645,0.99067
3,100,0.4,4,"[0.5, 0.5]",0.989144,0.9882,0.9903,0.9856,0.990235,0.98906,0.98829,0.9907
4,100,0.4,5,"[0.5, 0.5]",0.987981,0.9865,0.9856,0.9884,0.989305,0.98802,0.98722,0.9897
5,100,0.4,6,"[0.5, 0.5]",0.992771,0.9921,0.9928,0.9914,0.989029,0.98777,0.98748,0.98877
6,100,0.5,4,"[0.5, 0.5]",0.991556,0.9909,0.9928,0.9885,0.989837,0.98861,0.98774,0.99035
7,100,0.5,5,"[0.5, 0.5]",0.993954,0.9937,0.9976,0.9886,0.989173,0.98787,0.98724,0.9894
8,100,0.5,6,"[0.5, 0.5]",0.992771,0.9921,0.9928,0.9914,0.99049,0.98949,0.98985,0.98944
9,150,0.3,4,"[0.5, 0.5]",0.993969,0.9935,0.9952,0.9914,0.989839,0.98862,0.98775,0.99035
