In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool, cv
random_state = 7
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [30]:
def model_validation(tr,ts, iterations, learning_rate,depth, class_weight, n_splits = 10):
   
    # get current directory
    path = os.getcwd()
    parent = os.path.dirname(path)

    train = pd.read_csv(parent + '\Data\\' + tr + ".csv")
    test = pd.read_csv(parent + '\Data\\' + ts + ".csv")
    
    X_train = train.drop("target", axis=1)
    y_train = train["target"]
    
    X_test  = test.drop("target", axis=1)
    y_test = test["target"]
    
        # initialize Pool
    train_pool = Pool(X_train, 
                      y_train)
        # initialize Pool
    test_pool = Pool(X_test, 
                      y_test)
    
    scores = pd.DataFrame(columns=["Iterations", "Learning Rate", "Depth", "F1", "ROC", "Sensitivity", "Specificty", "ACC" ,  "CV F1", "CV ROC", "ROC Sens", "ROC Spec", "CV ACC"])
    
#     from sklearn.linear_model import LogisticRegression
#     clf = LogisticRegression(random_state=0, penalty="l1", solver='liblinear')
#     clf.fit(X_train, y_train)
#     pred = clf.predict(X_test)
#     f1 = (f1_score(pred, y_test))

#     cm = confusion_matrix(y_test, pred)
#     total=sum(sum(cm))
#     roca = round(roc_auc_score(y_test, pred),4)
#     sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
#     spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)      
    
#     print(f1, roca, sens, spec)

    for iteration in iterations:
        for rate in learning_rate:
            for deep in depth:
                    start = time.time()
                    model = CatBoostClassifier(iterations=iteration,
                           depth=deep,
                           learning_rate=rate,
                           class_weights=class_weight,
                           loss_function='Logloss',
                           random_state = random_state,
                           verbose=False)
                    
                    model.fit(train_pool)
                    pred = model.predict(X_test)
                    f1 = (f1_score(pred, y_test))
                    
                    cm = confusion_matrix(y_test, pred)
                    total=sum(sum(cm))
                    roca = round(roc_auc_score(y_test, pred),4)
                    sens = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                    spec = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)                    
                    acc = round((cm[1,1] + cm[0,0])/(cm[1,1] + cm[0,0] + cm[0,1] + cm[0,1]))
                    
                    
                    scores_cv = []
                    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                    skf.get_n_splits(X_train, y_train)
                    for train_index, test_index in skf.split(X_train,y_train):
                        X_cv_train, X_cv_test = X_train.iloc[train_index], X_train.iloc[test_index]
                        y_cv_train, y_cv_test = y_train.iloc[train_index], y_train.iloc[test_index]

                        model_cv = CatBoostClassifier(iterations=iteration,
                                               depth=deep,
                                               learning_rate=rate,
                                               class_weights=class_weight,
                                               loss_function='Logloss',
                                               random_state = random_state,
                                               verbose=False)

                        # initialize Pool
                        train_pool = Pool(X_cv_train, 
                                          y_cv_train)
                            # initialize Pool
                        test_pool = Pool(X_cv_test, 
                                          y_cv_test)

                        model_cv.fit(train_pool)
                        pred = model_cv.predict(X_cv_test)
                        f1_cv = (f1_score(pred, y_cv_test))

                        cm = confusion_matrix(y_cv_test, pred)
                        total=sum(sum(cm))
                        roca_cv = round(roc_auc_score(y_cv_test, pred),4)
                        sens_cv = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
                        spec_cv = round(cm[0,0]/(cm[1,0]+cm[0,0]),4) 
                        acc_cv = round((cm[1,1] + cm[0,0])/(cm[1,1] + cm[0,0] + cm[0,1] + cm[0,1]))
                        scores_cv.append([f1_cv, roca_cv, sens_cv, spec_cv, acc_cv])

                      
                    
                    mean = (np.mean(np.array(scores_cv), axis=0))
                    df_new_row = pd.DataFrame(data=([[iteration, rate, deep, f1, roca, sens, spec, acc,mean[0], mean[1], mean[2], mean[3], mean[4]]]), 
                                              columns=["Iterations", "Learning Rate", "Depth",  "F1", "ROC", "Sensitivity", "Specificty", "ACC", 
                                                      "CV F1", "CV ROC", "ROC Sens", "ROC Spec", "CV ACC"])
                    scores = pd.concat([scores,df_new_row], ignore_index=True)
                    end = time.time()
                    print(end - start)
                    
            
    return scores

In [45]:
# tr = "trainset_ALLvsHealthy_80_target_in"
# ts = "testset_ALLvsHealthy_20_target_in"

# tr = "trainset_AMLvsALL_90_target_in"
# ts = "testset_AMLvsALL_10_target_in"

tr = "trainset_ALL_vs_Healthy_200"
ts = "testset_ALL_vs_Healthy_200"

# start = time.time()

#  iterations, learning_rate,depth, weights, n_splits = 10
results = model_validation(tr,ts, [100, 200, 300], [0.1,0.2, 0.3, 0.4],[3,4,5], [1 , 0.26])




3.5337979793548584
4.735724925994873
7.47764778137207
3.5929012298583984
5.154695987701416
7.807961463928223
3.90434193611145
5.439246654510498
7.846724987030029
3.7010645866394043
5.392460346221924
7.820653438568115
7.979942321777344
10.427429676055908
16.817477226257324
7.439601421356201
10.628585577011108
17.514943838119507
8.297813415527344
12.673267602920532
19.587703943252563
8.791434049606323
12.608812093734741
18.64809799194336
12.4891939163208
18.72781538963318
26.983361959457397
12.638514995574951
17.992551803588867
26.249799251556396
12.192177772521973
16.716254949569702
26.149813175201416
12.688116788864136
19.533531665802002
27.85391354560852


In [47]:
index = 0
top = 0
i = 0
for row in results.loc[:]["CV F1"]:
    if row > top:
        top = row
        index = i
    i = i + 1
print(results.loc[index])
for item in results.loc[index]:
    print(item)

Iterations            200
Learning Rate         0.1
Depth                   3
F1               0.998557
ROC                0.9986
Sensitivity           1.0
Specificty         0.9891
ACC                     1
CV F1            0.997829
CV ROC            0.99683
ROC Sens          0.99928
ROC Spec          0.98659
CV ACC                1.0
Name: 12, dtype: object
200
0.1
3
0.9985569985569985
0.9986
1.0
0.9891
1
0.9978293498797303
0.9968300000000001
0.99928
0.9865900000000002
1.0
