### Load the dataset

In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from catboost import Pool, cv
random_state = 7
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
import numpy as np

In [None]:
df = pd.read_csv(r"C:\Users\Romar\OneDrive\Documenten\Thesis\Genes\Data\AML_Healthy_5052x12717_allinfo.csv")

In [None]:
df.head()

### Look at the target

In [4]:
print(df["target"].value_counts())
print()
print(df["Disease"].value_counts())
print()
print(df["Tissue"].value_counts())

1    4145
0     907
Name: target, dtype: int64

AML        4082
healthy     907
AMKL         63
Name: Disease, dtype: int64

BM            2628
BM or PBMC    1321
PBMC          1103
Name: Tissue, dtype: int64


### distribution between the dataset

In [5]:
print(df["Dataset"].value_counts())
print()
print(df["GSE"].value_counts())

2.0    3348
1.0    1131
3.0     573
Name: Dataset, dtype: int64

GSE13159               612
GSE37642               525
GSE14468               512
GSE1159                288
GSE61804               286
Lavallee (GSE67039)    263
GSE17855               237
GSE46480               196
GSE68833               158
GSE15434               156
GSE35784               130
GSE10358               120
GSE87072               119
GSE12417               118
GSE63270               104
GSE43176               104
GSE22845                93
Lavallee (GSE62190)     82
GSE34860                78
GSE83449                72
Garzon (GSE63646)       71
GSE9476                 64
GSE13501                56
Lavallee (GSE49642)     43
GSE42057                42
GSE18323                39
GSE51082                37
GSE22707                36
GSE27562                31
GSE27383                29
Lavallee (GSE52656)     27
GSE7757                 24
GSE12662                24
Lavallee (GSE66917)     22
GSE50772         

### Data preperation

In [6]:
dataset = df.drop(["Filename","id", "Dataset", "GSE", "Condition", "FAB", "Disease"], axis=1)
# dataset['Tissue'] = df['Tissue'].replace({'BM or PBMC':1, 'BM':2, 'PBMC': 3})

In [7]:
dataset.head()

Unnamed: 0,Tissue,PAX8,CCL5,MMP14,DTX2P1-UPK3BP1-PMS2P11,BAD,PRPF8,CAPNS1,RPL35,EIF4G2,...,RABEP2,FKBP15,LRCH4,MEX3D,BCAN,ACTB,GAPDH,MIR3648-2,MIR3648-1,target
0,BM or PBMC,8.502866,8.373702,7.734063,8.849704,6.030234,9.436927,11.194272,11.41373,11.034516,...,6.318018,4.311254,9.898952,4.947833,7.450775,11.847398,12.378741,11.19781,10.794533,1
1,BM or PBMC,8.718331,7.575922,7.759872,8.802767,5.721242,9.589008,10.64447,11.891917,11.758333,...,5.940752,4.657384,9.895736,5.338514,7.531307,10.63164,11.954669,5.129696,5.253522,1
2,BM or PBMC,8.678028,9.043072,8.01178,8.954119,5.805235,9.504022,9.481949,10.464358,11.621595,...,5.5026,4.561471,8.999844,4.939722,7.759986,8.819384,9.687341,10.701375,11.867577,1
3,BM or PBMC,8.634034,7.491861,7.949067,8.609893,5.574527,10.391837,9.367462,9.962585,11.952126,...,4.952551,4.105074,9.451515,4.867055,7.653945,9.759826,9.49789,6.904912,7.493796,1
4,BM or PBMC,8.731049,8.847501,7.860008,8.953425,5.833827,10.307257,10.556192,11.613941,11.660103,...,6.481634,4.099558,10.389375,5.406069,7.478153,10.177903,11.639419,6.279039,6.940435,1


### Model creation and tuning

In [8]:
X = dataset.drop("target", axis=1)
y = dataset["target"]


X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=random_state)

In [22]:
start_time = time.time()

model = CatBoostClassifier(iterations=200,
                       depth=4,
                       learning_rate=0.05,
                       loss_function='Logloss',
                       verbose=False)

# initialize Pool
train_pool = Pool(X_train, 
                  y_train, 
                  cat_features=["Tissue"])
    # initialize Pool
test_pool = Pool(X_val, 
                  y_val, 
                  cat_features=["Tissue"])

model.fit(train_pool)
print("--- %s seconds ---" % (time.time() - start_time))

--- 57.3713002204895 seconds ---


In [23]:
cm = confusion_matrix(y_val, model.predict(X_val))
Total = sum(sum(cm))
ROC = round(roc_auc_score(y_val, model.predict(X_val)),4)
sensitvity = round(cm[1,1]/(cm[1,1]+cm[0,1]),4)
specificity = round(cm[0,0]/(cm[1,0]+cm[0,0]),4)
F1 = round(f1_score(y_val, model.predict(X_val)),4)

print(ROC, sensitvity, specificity, F1)

0.991 0.9965 1.0 0.9982


In [24]:
LossFunctionChange = CatBoostClassifier.get_feature_importance(model, type="LossFunctionChange",data=test_pool)
ind_LossFunctionChange = np.argpartition(LossFunctionChange, -100)[-100:]

In [25]:
PredictionValuesChange = CatBoostClassifier.get_feature_importance(model)
ind_PredictionValuesChange = np.argpartition(PredictionValuesChange, -100)[-100:]

In [26]:
(np.intersect1d(ind_PredictionValuesChange, ind_LossFunctionChange))

array([    2,   333,   580,   604,   880,   968,  1097,  1128,  1247,
        1590,  1813,  1909,  2021,  2212,  2232,  2468,  2530,  2684,
        3085,  3138,  3202,  3224,  3261,  3474,  3551,  3592,  3837,
        4020,  4032,  4061,  4471,  4709,  4730,  4894,  4895,  4964,
        5550,  5676,  6203,  6252,  6437,  6570,  6860,  7015,  7254,
        7386,  8038,  8070,  8488,  8568,  8782,  8940,  9572,  9871,
       10004, 10379, 10440, 10874, 11054, 11900, 12291, 12395, 12624,
       12643], dtype=int64)

In [27]:
X_train.columns[0]

'Tissue'

In [28]:
X_train.columns[(np.intersect1d(ind_PredictionValuesChange, ind_LossFunctionChange))]

Index(['CCL5', 'RCN1', 'TGFBI', 'KDM5B', 'NID1', 'NUP214', 'MXI1', 'IFI27',
       'EPS8', 'KIAA0040', 'RTN1', 'IGF1R', 'SEMA3C', 'PRAME', 'CD48', 'TIE1',
       'APOC4-APOC2', 'MYB', 'HOXB6', 'MFAP3L', 'GEMIN4', 'PCSK5', 'ANGPT1',
       'CCNA1', 'ITGA9', 'WT1', 'PF4', 'GP1BB', 'FLT3', 'EPB41L3', 'DEFA4',
       'BLNK', 'ENTPD1', 'CD8B2', 'CITED2', 'REXO5', 'KLF5', 'SYNE1', 'RUNX1',
       'IGHV5-78', 'ASAH1', 'GABARAPL1', 'ALDH1A1', 'LCN2', 'CBX7', 'HOXA10',
       'CHRM3', 'HOXA1', 'TRAV13-2', 'TRAV21', 'TRBV21-1', 'TRAV13-1', 'AKTIP',
       'CAVIN2', 'MARC1', 'C3orf14', 'CXorf57', 'NUDT11', 'VPREB3', 'BACH2',
       'SHTN1', 'SMIM27', 'TCL1A', 'RUBCNL'],
      dtype='object')

In [29]:
X_train_v2 = X_train[X_train.columns[(np.intersect1d(ind_PredictionValuesChange, ind_LossFunctionChange))]]
X_val_v2 = X_val[X_train.columns[(np.intersect1d(ind_PredictionValuesChange, ind_LossFunctionChange))]]

In [30]:
X_train_v2

Unnamed: 0,CCL5,RCN1,TGFBI,KDM5B,NID1,NUP214,MXI1,IFI27,EPS8,KIAA0040,...,MARC1,C3orf14,CXorf57,NUDT11,VPREB3,BACH2,SHTN1,SMIM27,TCL1A,RUBCNL
2274,9.444250,8.232470,4.565620,9.635260,3.273770,8.489350,9.616230,5.815060,3.237250,6.135360,...,5.485380,3.625890,3.776940,5.370990,4.230150,6.591700,3.991860,7.589970,5.462690,3.639080
1140,11.044500,6.360320,11.405200,7.044660,5.742320,9.556920,10.275300,4.507380,5.829170,9.347570,...,7.926970,6.223050,5.325490,4.692140,9.513750,9.934680,8.179270,8.996720,11.879300,10.655800
120,5.780686,8.128647,7.211902,7.812987,5.007592,8.323652,8.562295,5.798277,4.942906,7.219205,...,8.148981,6.160203,5.907609,8.430341,9.991777,7.964883,4.947820,8.419831,6.886417,6.337238
3118,9.453780,8.069600,6.425570,9.881400,5.394140,8.406230,10.289700,6.908440,4.266850,5.273210,...,7.303970,3.259290,3.844170,4.288530,4.591830,3.926360,4.776190,8.601850,4.716950,4.488570
3915,7.485460,5.469310,5.684390,8.444690,3.127940,7.445250,9.334540,6.883170,8.179050,5.531460,...,6.402090,3.573040,3.557610,5.166370,4.513870,4.465640,3.632390,8.064630,5.727480,3.661840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,10.928600,7.881700,6.110730,8.900260,8.299260,9.740700,7.744450,4.837440,5.748210,5.990930,...,6.430800,4.132250,3.611590,4.110370,4.537540,6.323120,5.256860,8.588850,4.802280,6.782270
2763,6.741650,8.679110,5.441930,9.741580,3.382840,8.591740,10.596400,4.686750,4.814710,5.957020,...,8.538310,4.061170,3.874530,5.121270,4.247700,5.051170,8.558170,7.444150,4.593420,6.045950
905,5.807818,8.293590,6.537287,9.889042,6.185434,8.949663,9.427900,6.502468,4.992735,6.347506,...,9.513834,5.029487,4.568148,7.169569,6.086298,4.910022,4.425835,8.633329,6.773750,4.354213
3980,6.923750,8.711890,8.895860,9.552390,3.700880,9.207440,11.108400,5.138030,5.837200,6.458020,...,8.504610,3.901650,4.114550,4.114010,4.114010,4.976750,7.601670,8.370780,4.684330,3.742590


In [31]:
start_time = time.time()

model = CatBoostClassifier(iterations=200,
                       depth=6,
                       learning_rate=0.05,
                       loss_function='Logloss',
                       verbose=False)

# initialize Pool
train_pool = Pool(X_train_v2, 
                  y_train)
    # initialize Pool
test_pool = Pool(X_val_v2, 
                  y_val)

model.fit(train_pool)
print("--- %s seconds ---" % (time.time() - start_time))
pred = model.predict(X_val_v2)
print(f1_score(pred, y_val))

--- 1.2460689544677734 seconds ---
0.9994086339444116


In [33]:
df["Disease"].value_counts()

AML        4082
healthy     907
AMKL         63
Name: Disease, dtype: int64