In [1]:
# Notebook setup: reload edited modules automatically and render figures inline at retina resolution.
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling.
sns.set()
# Show up to 400 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 400)

import os
# Later cells use paths relative to the working directory ('Results/...'), so step up one level first
# (presumably to the project root -- confirm).
# NOTE(review): a relative chdir is not idempotent; re-running this cell moves up one more directory.
os.chdir('..')

In [2]:
from SVMExperiments.utils import load_result, list_files, get_dataframe

In [36]:
# Treatment labels; join_inhibitor pairs them positionally with the per-treatment folder lists.
treatments = ['BCR', 'CD40', 'All']

In [48]:
def join_results(dataset, featureset, treatment, folders):
    """Merge the result tables saved for one treatment under several folders.

    ``load_result(dataset, featureset, treatment, folder)`` is called once per
    entry of ``folders``. Its first two return values (the per-bag table and
    the ensemble table) are collected; the third is ignored. Each collection
    is stacked in folder order, rows repeating the same configuration key are
    dropped (the first occurrence is kept), and the index is renumbered.

    Parameters
    ----------
    dataset, featureset, treatment
        Forwarded unchanged to ``load_result``.
    folders
        Iterable of folder arguments, one ``load_result`` call each.

    Returns
    -------
    tuple
        ``(results, ensembles)`` -- the merged per-bag and ensemble tables.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``folders`` is empty.
    """
    per_bag_key = ['Run', 'FeatureRanker', 'Classifier', 'NumFeatures', 'Bag']
    ensemble_key = ['Run', 'FeatureRanker', 'Classifier', 'NumFeatures', 'Num Bags']

    def _merge(frames, key_columns):
        # Stack, keep the first row for every key, then renumber from zero.
        stacked = pd.concat(frames)
        unique_rows = stacked.drop_duplicates(subset=key_columns)
        return unique_rows.reset_index(drop=True)

    per_bag_frames = []
    ensemble_frames = []
    for folder in folders:
        per_bag, ensemble, _unused = load_result(dataset, featureset, treatment, folder)
        per_bag_frames.append(per_bag)
        ensemble_frames.append(ensemble)

    merged_results = _merge(per_bag_frames, per_bag_key)
    merged_ensembles = _merge(ensemble_frames, ensemble_key)
    return merged_results, merged_ensembles

In [59]:
def join_inhibitor(dataset, featureset, treatments, folders_list):
    """Merge the per-treatment result tables of one dataset into two tables.

    For every ``(treatment, folders)`` pair, ``join_results`` is called and a
    ``Treatment`` column holding the treatment label is added to both tables
    it returns. The per-treatment tables are then stacked in treatment order,
    rows repeating the same configuration key (including ``Treatment``) are
    dropped, keeping the first occurrence, and the index is renumbered.

    Parameters
    ----------
    dataset, featureset
        Forwarded unchanged to ``join_results``.
    treatments
        Iterable of treatment labels.
    folders_list
        Iterable of folder collections, one per treatment, in the same order
        as ``treatments``.

    Returns
    -------
    tuple
        ``(results, ensembles)`` -- the merged per-bag and ensemble tables.

    Raises
    ------
    ValueError
        If ``treatments`` and ``folders_list`` differ in length (previously the
        surplus entries were silently ignored), or -- raised by ``pd.concat``
        -- if both are empty.
    """
    res_cols = ['Run', 'FeatureRanker', 'Classifier', 'NumFeatures', 'Bag', 'Treatment']
    ens_cols = ['Run', 'FeatureRanker', 'Classifier', 'NumFeatures', 'Num Bags', 'Treatment']

    # Materialise both inputs so generators are accepted and lengths can be compared.
    treatments = list(treatments)
    folders_list = list(folders_list)
    if len(treatments) != len(folders_list):
        # zip() would stop at the shorter input and drop a treatment without any error.
        raise ValueError(
            'treatments and folders_list must have the same length, got %d and %d'
            % (len(treatments), len(folders_list))
        )

    results, ensembles = [], []
    for treatment, folders in zip(treatments, folders_list):
        res, ens = join_results(dataset, featureset, treatment, folders)
        # assign() returns a new frame, so the objects handed back by join_results are not modified.
        results.append(res.assign(Treatment=treatment))
        ensembles.append(ens.assign(Treatment=treatment))

    return (
        pd.concat(results).drop_duplicates(res_cols).reset_index(drop=True),
        pd.concat(ensembles).drop_duplicates(ens_cols).reset_index(drop=True),
    )

## CTNNB1

In [64]:
# Folder lists are positional and follow the order of `treatments`.
results, ensembles = join_inhibitor(
    dataset='CTNNB1',
    featureset='Tt',
    treatments=treatments,
    folders_list=[
        ['Results/Tt', 'Results/TtV2'],  # BCR
        ['Results/Tt'],  # CD40
        ['Results/Tt', 'Results/TtV2'],  # All
    ],
)
(results.shape, ensembles.shape)

((37500, 13), (1500, 13))

In [78]:
# Sanity check: number of merged CTNNB1 rows per classifier.
results['Classifier'].value_counts()

RandomForestClassifier    7500
KNeighborsClassifier      7500
LGBMClassifier            7500
LogisticRegression        7500
SVC                       7500
Name: Classifier, dtype: int64

In [74]:
# Five per-bag CTNNB1 rows with the highest GMean.
results.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
6719,1,Brattacharyya,SVC,12,19,0.6333,0.7387,0.684,19,30239,10699,11,BCR
10552,1,Entropy,KNeighborsClassifier,12,52,0.6667,0.6857,0.6761,20,28073,12865,10,BCR
11791,1,Brattacharyya,KNeighborsClassifier,12,91,0.7667,0.5877,0.6712,23,24058,16880,7,BCR
2977,1,Entropy,LGBMClassifier,10,77,0.5667,0.7859,0.6673,17,32172,8766,13,BCR
1789,1,Brattacharyya,RandomForestClassifier,12,89,0.5667,0.78,0.6648,17,31930,9008,13,BCR


In [75]:
# Five CTNNB1 ensemble rows with the highest GMean.
ensembles.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
269,1,Brattacharyya,SVC,12,50,0.5667,0.7276,0.6421,17,29787,11151,13,BCR
270,1,Brattacharyya,SVC,12,75,0.5667,0.7235,0.6403,17,29617,11321,13,BCR
271,1,Brattacharyya,SVC,12,100,0.5667,0.723,0.6401,17,29599,11339,13,BCR
268,1,Brattacharyya,SVC,12,25,0.5667,0.7224,0.6398,17,29573,11365,13,BCR
488,1,Wilcoxon,KNeighborsClassifier,10,25,0.7667,0.5133,0.6273,23,21013,19925,7,BCR


In [80]:
# Write the merged CTNNB1 tables to CSV without the index column.
results.to_csv(path_or_buf='Results/CSVs/Tt/CTNNB1.csv', index=False)
ensembles.to_csv(path_or_buf='Results/CSVs/Tt/CTNNB1_Ensemble.csv', index=False)

## Erk

In [81]:
# Folder lists are positional and follow the order of `treatments`.
results, ensembles = join_inhibitor(
    dataset='Erk',
    featureset='Tt',
    treatments=treatments,
    folders_list=[
        ['Results/Tt'],  # BCR
        ['Results/Tt'],  # CD40
        ['Results/Tt', 'Results/TtV2'],  # All
    ],
)
(results.shape, ensembles.shape)

((37500, 13), (1500, 13))

In [82]:
# Sanity check: number of merged Erk rows per classifier.
results['Classifier'].value_counts()

RandomForestClassifier    7500
KNeighborsClassifier      7500
LGBMClassifier            7500
LogisticRegression        7500
SVC                       7500
Name: Classifier, dtype: int64

In [83]:
# Five per-bag Erk rows with the highest GMean.
results.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
1726,1,Brattacharyya,RandomForestClassifier,12,26,0.6548,0.6455,0.6501,55,45300,24879,29,BCR
4230,1,Brattacharyya,LGBMClassifier,12,30,0.6786,0.6181,0.6476,57,43379,26800,27,BCR
4721,1,Wilcoxon,LGBMClassifier,10,21,0.7262,0.5758,0.6466,61,40410,29769,23,BCR
3521,1,Ttest,LGBMClassifier,10,21,0.7262,0.5758,0.6466,61,40410,29769,23,BCR
4221,1,Brattacharyya,LGBMClassifier,12,21,0.7024,0.5939,0.6459,59,41680,28499,25,BCR


In [84]:
# Five Erk ensemble rows with the highest GMean.
ensembles.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
171,1,Brattacharyya,LGBMClassifier,12,100,0.6905,0.5959,0.6414,58,41817,28362,26,BCR
169,1,Brattacharyya,LGBMClassifier,12,50,0.6786,0.6004,0.6383,57,42135,28044,27,BCR
168,1,Brattacharyya,LGBMClassifier,12,25,0.6786,0.597,0.6365,57,41896,28283,27,BCR
68,1,Brattacharyya,RandomForestClassifier,12,25,0.6429,0.6231,0.6329,54,43729,26450,30,BCR
259,1,Brattacharyya,SVC,6,100,0.75,0.5304,0.6307,63,37224,32955,21,BCR


In [85]:
# Write the merged Erk tables to CSV without the index column.
results.to_csv(path_or_buf='Results/CSVs/Tt/Erk.csv', index=False)
ensembles.to_csv(path_or_buf='Results/CSVs/Tt/Erk_Ensemble.csv', index=False)

## IRF4

In [86]:
# Folder lists are positional and follow the order of `treatments`.
results, ensembles = join_inhibitor(
    dataset='IRF4',
    featureset='Tt',
    treatments=treatments,
    folders_list=[
        ['Results/Tt'],  # BCR
        ['Results/Tt'],  # CD40
        ['Results/Tt', 'Results/TtV2'],  # All
    ],
)
(results.shape, ensembles.shape)

((37500, 13), (1500, 13))

In [87]:
# Sanity check: number of merged IRF4 rows per classifier.
results['Classifier'].value_counts()

RandomForestClassifier    7500
KNeighborsClassifier      7500
LGBMClassifier            7500
LogisticRegression        7500
SVC                       7500
Name: Classifier, dtype: int64

In [88]:
# Five per-bag IRF4 rows with the highest GMean.
results.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
23849,1,Brattacharyya,KNeighborsClassifier,4,49,0.6667,0.6892,0.6778,18,33281,15010,9,CD40
14879,1,Wilcoxon,RandomForestClassifier,12,79,0.7037,0.6154,0.6581,19,29720,18571,8,CD40
23285,1,Ttest,KNeighborsClassifier,4,85,0.6296,0.687,0.6577,17,33175,15116,10,CD40
23271,1,Ttest,KNeighborsClassifier,4,71,0.5556,0.7615,0.6504,15,36775,11516,12,CD40
23603,1,Ttest,KNeighborsClassifier,12,3,0.6667,0.6267,0.6464,18,30266,18025,9,CD40


In [89]:
# Five IRF4 ensemble rows with the highest GMean.
ensembles.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
1411,1,Entropy,KNeighborsClassifier,6,100,0.6852,0.5467,0.612,37,52803,43779,17,All
1410,1,Entropy,KNeighborsClassifier,6,75,0.7037,0.5169,0.6031,38,49928,46654,16,All
1397,1,AllFeatures,LogisticRegression,540,50,0.5741,0.6194,0.5963,31,59827,36755,23,All
1399,1,AllFeatures,LogisticRegression,540,100,0.5741,0.6158,0.5946,31,59477,37105,23,All
541,1,Ttest,RandomForestClassifier,10,50,0.5185,0.6718,0.5902,14,32440,15851,13,CD40


In [90]:
# Write the merged IRF4 tables to CSV without the index column.
results.to_csv(path_or_buf='Results/CSVs/Tt/IRF4.csv', index=False)
ensembles.to_csv(path_or_buf='Results/CSVs/Tt/IRF4_Ensemble.csv', index=False)

## Ikk2

In [91]:
# Folder lists are positional and follow the order of `treatments`.
results, ensembles = join_inhibitor(
    dataset='Ikk2',
    featureset='Tt',
    treatments=treatments,
    folders_list=[
        ['Results/Tt'],  # BCR
        ['Results/Tt'],  # CD40
        ['Results/Tt', 'Results/TtV2'],  # All
    ],
)
(results.shape, ensembles.shape)

((37500, 13), (1500, 13))

In [92]:
# Sanity check: number of merged Ikk2 rows per classifier.
results['Classifier'].value_counts()

RandomForestClassifier    7500
KNeighborsClassifier      7500
LGBMClassifier            7500
LogisticRegression        7500
SVC                       7500
Name: Classifier, dtype: int64

In [93]:
# Five per-bag Ikk2 rows with the highest GMean.
results.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
20352,1,Entropy,LogisticRegression,8,52,0.6004,0.5991,0.5997,1286,31228,20900,856,CD40
20393,1,Entropy,LogisticRegression,8,93,0.6242,0.576,0.5996,1337,30027,22101,805,CD40
20389,1,Entropy,LogisticRegression,8,89,0.606,0.5922,0.5991,1298,30871,21257,844,CD40
20325,1,Entropy,LogisticRegression,8,25,0.6195,0.5794,0.5991,1327,30202,21926,815,CD40
20321,1,Entropy,LogisticRegression,8,21,0.6036,0.5943,0.5989,1293,30979,21149,849,CD40


In [94]:
# Five Ikk2 ensemble rows with the highest GMean.
ensembles.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
813,1,Entropy,LogisticRegression,8,50,0.5901,0.6002,0.5951,1264,31286,20842,878,CD40
812,1,Entropy,LogisticRegression,8,25,0.5892,0.6005,0.5948,1262,31302,20826,880,CD40
814,1,Entropy,LogisticRegression,8,75,0.5966,0.5864,0.5915,1278,30566,21562,864,CD40
815,1,Entropy,LogisticRegression,8,100,0.5957,0.5869,0.5913,1276,30593,21535,866,CD40
816,1,Entropy,LogisticRegression,10,25,0.6036,0.5762,0.5898,1293,30036,22092,849,CD40


In [95]:
# Write the merged Ikk2 tables to CSV without the index column.
results.to_csv(path_or_buf='Results/CSVs/Tt/Ikk2.csv', index=False)
ensembles.to_csv(path_or_buf='Results/CSVs/Tt/Ikk2_Ensemble.csv', index=False)

## Jnk

In [96]:
# Folder lists are positional and follow the order of `treatments`.
results, ensembles = join_inhibitor(
    dataset='Jnk',
    featureset='Tt',
    treatments=treatments,
    folders_list=[
        ['Results/Tt', 'Results/TtV2'],  # BCR
        ['Results/Tt', 'Results/TtV2'],  # CD40
        ['Results/Tt', 'Results'],  # All
    ],
)
(results.shape, ensembles.shape)

((37500, 13), (1500, 13))

In [97]:
# Sanity check: number of merged Jnk rows per classifier.
results['Classifier'].value_counts()

RandomForestClassifier    7500
KNeighborsClassifier      7500
LGBMClassifier            7500
LogisticRegression        7500
SVC                       7500
Name: Classifier, dtype: int64

In [98]:
# Five per-bag Jnk rows with the highest GMean.
results.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
21275,1,Brattacharyya,LogisticRegression,2,75,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
21245,1,Brattacharyya,LogisticRegression,2,45,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
21248,1,Brattacharyya,LogisticRegression,2,48,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
21250,1,Brattacharyya,LogisticRegression,2,50,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
21251,1,Brattacharyya,LogisticRegression,2,51,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40


In [99]:
# Five Jnk ensemble rows with the highest GMean.
ensembles.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
849,1,Brattacharyya,LogisticRegression,2,50,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
848,1,Brattacharyya,LogisticRegression,2,25,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
851,1,Brattacharyya,LogisticRegression,2,100,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
850,1,Brattacharyya,LogisticRegression,2,75,0.6445,0.5659,0.6039,2181,58231,44675,1203,CD40
810,1,Entropy,LogisticRegression,6,75,0.6126,0.5756,0.5938,2073,59236,43670,1311,CD40


In [100]:
# Write the merged Jnk tables to CSV without the index column.
results.to_csv(path_or_buf='Results/CSVs/Tt/Jnk.csv', index=False)
ensembles.to_csv(path_or_buf='Results/CSVs/Tt/Jnk_Ensemble.csv', index=False)

## LEF1

In [101]:
# Folder lists are positional and follow the order of `treatments`.
results, ensembles = join_inhibitor(
    dataset='LEF1',
    featureset='Tt',
    treatments=treatments,
    folders_list=[
        ['Results/Tt', 'Results/TtV2'],  # BCR
        ['Results/TtV2', 'Results/Tt'],  # CD40
        ['Results/TtV2', 'Results', 'Results/Tt'],  # All
    ],
)
(results.shape, ensembles.shape)

((37500, 13), (1500, 13))

In [102]:
# Sanity check: number of merged LEF1 rows per classifier.
results['Classifier'].value_counts()

RandomForestClassifier    7500
KNeighborsClassifier      7500
LGBMClassifier            7500
LogisticRegression        7500
SVC                       7500
Name: Classifier, dtype: int64

In [103]:
# Five per-bag LEF1 rows with the highest GMean.
results.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
19913,1,AllFeatures,SVC,540,13,0.7111,0.7265,0.7188,32,24645,9276,13,CD40
19906,1,AllFeatures,SVC,540,6,0.7111,0.7006,0.7058,32,23765,10156,13,CD40
19940,1,AllFeatures,SVC,540,40,0.6667,0.7255,0.6954,30,24608,9313,15,CD40
19946,1,AllFeatures,SVC,540,46,0.6444,0.7378,0.6896,29,25028,8893,16,CD40
19997,1,AllFeatures,SVC,540,97,0.6222,0.7613,0.6883,28,25824,8097,17,CD40


In [104]:
# Five LEF1 ensemble rows with the highest GMean.
ensembles.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
300,1,Entropy,LogisticRegression,2,25,0.7111,0.6553,0.6827,32,22230,11691,13,BCR
301,1,Entropy,LogisticRegression,2,50,0.7111,0.6553,0.6827,32,22230,11691,13,BCR
303,1,Entropy,LogisticRegression,2,100,0.7111,0.6553,0.6827,32,22230,11691,13,BCR
302,1,Entropy,LogisticRegression,2,75,0.7111,0.6553,0.6827,32,22230,11691,13,BCR
1356,1,Brattacharyya,LogisticRegression,6,25,0.8111,0.5523,0.6693,73,37468,30374,17,All


In [105]:
# Write the merged LEF1 tables to CSV without the index column.
results.to_csv(path_or_buf='Results/CSVs/Tt/LEF1.csv', index=False)
ensembles.to_csv(path_or_buf='Results/CSVs/Tt/LEF1_Ensemble.csv', index=False)

## MYC

In [106]:
# Folder lists are positional and follow the order of `treatments`.
results, ensembles = join_inhibitor(
    dataset='MYC',
    featureset='Tt',
    treatments=treatments,
    folders_list=[
        ['Results/TtV2', 'Results'],  # BCR
        ['Results/TtV2'],  # CD40
        ['Results/TtV2'],  # All
    ],
)
(results.shape, ensembles.shape)

((37500, 13), (1500, 13))

In [107]:
# Sanity check: number of merged MYC rows per classifier.
results['Classifier'].value_counts()

RandomForestClassifier    7500
KNeighborsClassifier      7500
LGBMClassifier            7500
LogisticRegression        7500
SVC                       7500
Name: Classifier, dtype: int64

In [108]:
# Five per-bag MYC rows with the highest GMean.
results.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
15547,1,Entropy,LGBMClassifier,12,47,0.6753,0.5796,0.6257,1789,16911,12264,860,CD40
15466,1,Entropy,LGBMClassifier,10,66,0.6463,0.6017,0.6236,1712,17556,11619,937,CD40
15572,1,Entropy,LGBMClassifier,12,72,0.6463,0.5984,0.6219,1712,17459,11716,937,CD40
16127,1,Ttest,LGBMClassifier,12,27,0.6425,0.6012,0.6215,1702,17541,11634,947,CD40
20135,1,Entropy,LogisticRegression,4,35,0.6485,0.5943,0.6208,1718,17339,11836,931,CD40


In [109]:
# Five MYC ensemble rows with the highest GMean.
ensembles.sort_values(by='GMean', ascending=False).head(n=5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN,Treatment
647,1,Ttest,LGBMClassifier,12,100,0.5931,0.6495,0.6206,1571,18949,10226,1078,CD40
646,1,Ttest,LGBMClassifier,12,75,0.5817,0.6617,0.6204,1541,19306,9869,1108,CD40
645,1,Ttest,LGBMClassifier,12,50,0.5919,0.6494,0.62,1568,18947,10228,1081,CD40
618,1,Entropy,LGBMClassifier,10,75,0.698,0.5498,0.6195,1849,16041,13134,800,CD40
617,1,Entropy,LGBMClassifier,10,50,0.6867,0.558,0.619,1819,16280,12895,830,CD40


In [None]:
# Write the merged MYC tables to CSV without the index column.
# NOTE(review): this cell has no execution count (In [None]) -- confirm the MYC CSVs were actually written.
results.to_csv(path_or_buf='Results/CSVs/Tt/MYC.csv', index=False)
ensembles.to_csv(path_or_buf='Results/CSVs/Tt/MYC_Ensemble.csv', index=False)