In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
pd.set_option('display.max_columns', 400)

In [8]:
import os

# Move the working directory up one level so the relative `Results/...` paths
# used in later cells resolve. The guard makes the cell idempotent: running it
# again in the same kernel must not climb one more directory each time.
if 'PROJECT_ROOT' not in globals():
    os.chdir('..')
    PROJECT_ROOT = os.getcwd()

In [12]:
# Name components of the result files looked up below: each file is named
# `{dataset}-{featureset}-{treatment}.txt`.
datasets = ['CTNNB1', 'Erk', 'IRF4', 'Ikk2', 'Jnk', 'LEF1']
featuresets = ['Raw', 'Di', 'Dc']
treatments = ['BCR', 'CD40', 'All']

In [13]:
from pathlib import Path

# For every dataset/featureset/treatment combination, report which of the two
# result folders (`Results/Previous`, `Results/Results`) contains its log file.
for dataset in datasets:
    for featureset in featuresets:
        for treatment in treatments:
            file = f'{dataset}-{featureset}-{treatment}'
            in_previous = Path(f'Results/Previous/{file}.txt').exists()
            in_results = Path(f'Results/Results/{file}.txt').exists()
            if not (in_previous or in_results):
                print(f'{file} not found')
            elif in_previous and in_results:
                print(f'{file} found in Both')
            elif in_previous:
                print(f'{file} found in Previous')
            else:
                print(f'{file} found in Results')

CTNNB1-Raw-BCR found in Previous
CTNNB1-Raw-CD40 found in Previous
CTNNB1-Raw-All found in Results
CTNNB1-Di-BCR found in Results
CTNNB1-Di-CD40 found in Previous
CTNNB1-Di-All found in Previous
CTNNB1-Dc-BCR found in Previous
CTNNB1-Dc-CD40 found in Previous
CTNNB1-Dc-All found in Previous
Erk-Raw-BCR found in Results
Erk-Raw-CD40 found in Results
Erk-Raw-All found in Previous
Erk-Di-BCR found in Previous
Erk-Di-CD40 found in Previous
Erk-Di-All found in Previous
Erk-Dc-BCR found in Previous
Erk-Dc-CD40 found in Previous
Erk-Dc-All found in Previous
IRF4-Raw-BCR found in Results
IRF4-Raw-CD40 found in Results
IRF4-Raw-All found in Results
IRF4-Di-BCR found in Results
IRF4-Di-CD40 found in Results
IRF4-Di-All found in Results
IRF4-Dc-BCR found in Results
IRF4-Dc-CD40 found in Results
IRF4-Dc-All found in Results
Ikk2-Raw-BCR found in Results
Ikk2-Raw-CD40 found in Results
Ikk2-Raw-All found in Results
Ikk2-Di-BCR found in Results
Ikk2-Di-CD40 found in Results
Ikk2-Di-All found in Resul

## Previous

In [14]:
# Log file parsed in this section.
file = 'Results/Previous/CTNNB1-Raw-BCR.txt'

In [15]:
# Load the whole log into memory, one list entry per line (line endings kept).
with open(file, 'r') as log:
    lines = list(log)

In [131]:
%%time
# `tqdm_notebook` is deprecated; `tqdm.auto` selects the notebook progress bar
# when it is available and falls back to the console bar otherwise.
from tqdm.auto import tqdm

# Settings carried over from the most recent 12-field line: a 3-field "mean"
# line has no identifying fields of its own and is attributed to them.
run, feature_ranker, classifier, num_features, bag = None, None, None, None, None
means, ensembles, results = [], [], []
# The first two lines are skipped (presumably a header -- confirm against the
# log format). tqdm takes the total from the length of the sliced list.
for line in tqdm(lines[2:]):
    tokens = [t.strip() for t in line.split('|')]
    if len(tokens) == 3:
        # Mean line: three "name: value" fields (TPR, TNR, GMean).
        tpr, tnr, gmean = [float(t.split(':')[-1]) for t in tokens]
        means.append({
            'Run': run, 'FeatureRanker': feature_ranker, 'Classifier': classifier,
            'NumFeatures': num_features, 'Num Bags': np.nan, 'Mean TPR': tpr,
            'Mean TNR': tnr, 'Mean GMean': gmean
        })
    elif len(tokens) == 12:
        # Record line: run, ranker, classifier, feature count, bag descriptor,
        # three rates and four confusion-matrix counts.
        run, feature_ranker, classifier = int(tokens[0].split(':')[-1]), tokens[1], tokens[2]
        num_features, bag = int(tokens[3].split(':')[-1]), tokens[4]
        tpr, tnr, gmean = [float(t.split(':')[-1]) for t in tokens[5:8]]
        tp, tn, fp, fn = [int(t.split(':')[-1]) for t in tokens[8:]]
        payload = {
            'Run': run, 'FeatureRanker': feature_ranker, 'Classifier': classifier,
            'NumFeatures': num_features, 'TPR': tpr,
            'TNR': tnr, 'GMean': gmean, 'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn
        }
        if 'Ensemble' in bag:
            # Ensemble descriptor: the third whitespace-separated word is the bag count.
            ensembles.append({**payload, 'Num Bags': int(bag.split()[2])})
        else:
            # Single-bag descriptor: "name: index".
            results.append({**payload, 'Bag': int(bag.split(':')[-1])})


CPU times: user 2.06 s, sys: 12 ms, total: 2.07 s
Wall time: 2.06 s


In [132]:
# Sanity check: total log lines vs. number of mean, ensemble and per-bag records parsed.
len(lines), len(means), len(ensembles), len(results)

(191252, 1875, 1875, 187500)

In [133]:
# Turn the parsed record lists into DataFrames with a fixed column order.
key_columns = ['Run', 'FeatureRanker', 'Classifier', 'NumFeatures']
score_columns = ['TPR', 'TNR', 'GMean', 'TP', 'TN', 'FP', 'FN']

results = pd.DataFrame(results, columns=key_columns + ['Bag'] + score_columns)
ensembles = pd.DataFrame(ensembles, columns=key_columns + ['Num Bags'] + score_columns)
means = pd.DataFrame(
    means,
    columns=key_columns + ['Num Bags', 'Mean TPR', 'Mean TNR', 'Mean GMean'],
)

In [134]:
# Five random per-bag rows (unseeded, so the selection changes on every run).
results.sample(5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN
86067,3,Ttest,LGBMClassifier,12,267,0.0667,0.9375,0.25,2,38380,2558,28
184642,5,Brattacharyya,KNeighborsClassifier,8,142,0.2,0.8164,0.4041,6,33421,7517,24
121729,4,Entropy,LGBMClassifier,12,229,0.2,0.8612,0.415,6,35256,5682,24
94598,3,Brattacharyya,SVC,8,98,0.4,0.5774,0.4806,12,23636,17302,18
3427,1,Ttest,RandomForestClassifier,12,127,0.0333,0.9698,0.1798,1,39701,1237,29


In [135]:
# Five random ensemble rows (unseeded, so the selection changes on every run).
ensembles.sample(5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN
153,1,Entropy,SVC,4,100,0.6667,0.4082,0.5217,20,16710,24228,10
1687,5,Brattacharyya,SVC,2,200,0.6667,0.3333,0.4714,20,13646,27292,10
428,2,Brattacharyya,RandomForestClassifier,12,300,0.0,0.9625,0.0,0,39403,1535,30
447,2,Wilcoxon,RandomForestClassifier,18,100,0.1667,0.8828,0.3836,5,36139,4799,25
1509,5,Entropy,RandomForestClassifier,8,100,0.1667,0.8596,0.3785,5,35190,5748,25


In [137]:
# Five random mean rows; `Num Bags` is always NaN here because the parser never fills it.
means.sample(5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,Mean TPR,Mean TNR,Mean GMean
1406,4,Wilcoxon,LogisticRegression,2,,0.6667,0.3333,0.4714
916,3,Entropy,SVC,12,,0.589,0.4902,0.5219
1365,4,Entropy,LogisticRegression,12,,0.1377,0.8417,0.336
917,3,Entropy,SVC,12,,0.6012,0.4851,0.5255
1538,5,Brattacharyya,RandomForestClassifier,2,,0.1467,0.8533,0.2011


In [138]:
# Summary statistics of GMean over the per-bag rows.
results.GMean.describe()

count    187500.000000
mean          0.387097
std           0.140136
min           0.000000
25%           0.332100
50%           0.427800
75%           0.471400
max           0.719000
Name: GMean, dtype: float64

In [140]:
# Summary statistics of GMean over the ensemble rows.
ensembles.GMean.describe()

count    1875.000000
mean        0.374675
std         0.138750
min         0.000000
25%         0.300800
50%         0.398100
75%         0.471400
max         0.683400
Name: GMean, dtype: float64

## Results

In [141]:
# Log file parsed in this section.
file = 'Results/Results/CTNNB1-Raw-All.txt'

In [142]:
# Load the whole log into memory, one list entry per line (line endings kept).
with open(file, 'r') as log:
    lines = list(log)

In [143]:
# Number of '|'-separated fields on each line: one more than the separator count.
lens = [line.count('|') + 1 for line in lines]

In [145]:
# How many lines have each field count; shows which line shapes the parser must handle.
pd.Series(lens).value_counts()

13    12000
12     1000
3       500
4         2
dtype: int64

In [None]:
def preprocess_(lines, skip=2):
    """Parse the lines of a results log into mean, ensemble and per-bag records.

    Each line is split on ``'|'`` and classified by its number of fields:

    * 3 fields  -- a "mean" line holding TPR, TNR and GMean. It has no
      identifying fields of its own, so it is attributed to the settings of
      the most recent record line.
    * 12 fields -- a record line: run, feature ranker, classifier, number of
      features, bag descriptor, TPR, TNR, GMean, TP, TN, FP, FN.
    * 13 fields -- a record line with one extra leading field, which is
      dropped before parsing.

    Lines with any other field count are ignored.

    Parameters
    ----------
    lines : list of str
        Raw lines of the log file.
    skip : int, default 2
        Number of leading lines to skip before parsing.

    Returns
    -------
    tuple of (list of dict, list of dict, list of dict)
        ``(means, ensembles, results)``. A record line whose bag descriptor
        contains ``'Ensemble'`` goes to ``ensembles`` (with ``'Num Bags'``);
        every other record line goes to ``results`` (with ``'Bag'``).

    Raises
    ------
    ValueError
        If a numeric field of a recognised line cannot be converted.
    """
    def value(token):
        # Fields look like "name: value"; keep the text after the last colon.
        return token.split(':')[-1]

    run = feature_ranker = classifier = num_features = None
    means, ensembles, results = [], [], []
    for line in lines[skip:]:
        tokens = [t.strip() for t in line.split('|')]
        if len(tokens) == 13:
            tokens = tokens[1:]
        if len(tokens) == 3:
            tpr, tnr, gmean = (float(value(t)) for t in tokens)
            means.append({
                'Run': run, 'FeatureRanker': feature_ranker, 'Classifier': classifier,
                'NumFeatures': num_features, 'Num Bags': np.nan, 'Mean TPR': tpr,
                'Mean TNR': tnr, 'Mean GMean': gmean
            })
        elif len(tokens) == 12:
            run, feature_ranker, classifier = int(value(tokens[0])), tokens[1], tokens[2]
            num_features, bag = int(value(tokens[3])), tokens[4]
            tpr, tnr, gmean = (float(value(t)) for t in tokens[5:8])
            tp, tn, fp, fn = (int(value(t)) for t in tokens[8:])
            payload = {
                'Run': run, 'FeatureRanker': feature_ranker, 'Classifier': classifier,
                'NumFeatures': num_features, 'TPR': tpr,
                'TNR': tnr, 'GMean': gmean, 'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn
            }
            if 'Ensemble' in bag:
                # The third whitespace-separated word of the descriptor is the bag count.
                ensembles.append({**payload, 'Num Bags': int(bag.split()[2])})
            else:
                results.append({**payload, 'Bag': int(value(bag))})
    return means, ensembles, results

In [146]:
%%time
# `tqdm_notebook` is deprecated; `tqdm.auto` selects the notebook progress bar
# when it is available and falls back to the console bar otherwise.
from tqdm.auto import tqdm

# Settings carried over from the most recent record line: a 3-field "mean"
# line has no identifying fields of its own and is attributed to them.
run, feature_ranker, classifier, num_features, bag = None, None, None, None, None
means, ensembles, results = [], [], []
# The first two lines are skipped (presumably a header -- confirm against the
# log format). tqdm takes the total from the length of the sliced list.
for line in tqdm(lines[2:]):
    tokens = [t.strip() for t in line.split('|')]
    if len(tokens) == 3:
        # Mean line: three "name: value" fields (TPR, TNR, GMean).
        tpr, tnr, gmean = [float(t.split(':')[-1]) for t in tokens]
        means.append({
            'Run': run, 'FeatureRanker': feature_ranker, 'Classifier': classifier,
            'NumFeatures': num_features, 'Num Bags': np.nan, 'Mean TPR': tpr,
            'Mean TNR': tnr, 'Mean GMean': gmean
        })
    elif len(tokens) >= 12:
        # This log mixes 12- and 13-field record lines; the 13-field form has
        # one extra leading field, dropped so both share the same layout.
        if len(tokens) == 13:
            tokens = tokens[1:]
        run, feature_ranker, classifier = int(tokens[0].split(':')[-1]), tokens[1], tokens[2]
        num_features, bag = int(tokens[3].split(':')[-1]), tokens[4]
        tpr, tnr, gmean = [float(t.split(':')[-1]) for t in tokens[5:8]]
        tp, tn, fp, fn = [int(t.split(':')[-1]) for t in tokens[8:]]
        payload = {
            'Run': run, 'FeatureRanker': feature_ranker, 'Classifier': classifier,
            'NumFeatures': num_features, 'TPR': tpr,
            'TNR': tnr, 'GMean': gmean, 'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn
        }
        if 'Ensemble' in bag:
            # Ensemble descriptor: the third whitespace-separated word is the bag count.
            ensembles.append({**payload, 'Num Bags': int(bag.split()[2])})
        else:
            # Single-bag descriptor: "name: index".
            results.append({**payload, 'Bag': int(bag.split(':')[-1])})


CPU times: user 192 ms, sys: 0 ns, total: 192 ms
Wall time: 192 ms


In [147]:
# Turn the parsed record lists into DataFrames with a fixed column order.
key_columns = ['Run', 'FeatureRanker', 'Classifier', 'NumFeatures']
score_columns = ['TPR', 'TNR', 'GMean', 'TP', 'TN', 'FP', 'FN']

results = pd.DataFrame(results, columns=key_columns + ['Bag'] + score_columns)
ensembles = pd.DataFrame(ensembles, columns=key_columns + ['Num Bags'] + score_columns)
means = pd.DataFrame(
    means,
    columns=key_columns + ['Num Bags', 'Mean TPR', 'Mean TNR', 'Mean GMean'],
)

In [152]:
# Sanity check: total log lines vs. number of mean, ensemble and per-bag records parsed.
len(lines), len(means), len(ensembles), len(results)

(13502, 500, 500, 12500)

In [149]:
# Five random per-bag rows (unseeded, so the selection changes on every run).
results.sample(5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Bag,TPR,TNR,GMean,TP,TN,FP,FN
3034,1,Entropy,LGBMClassifier,12,34,0.55,0.5856,0.5675,33,47950,33926,27
4401,1,Wilcoxon,LGBMClassifier,4,1,0.3333,0.6667,0.4714,20,54584,27292,40
457,1,Entropy,RandomForestClassifier,10,57,0.1833,0.8982,0.4058,11,73545,8331,49
10302,1,Entropy,KNeighborsClassifier,8,2,0.5833,0.4641,0.5203,35,37997,43879,25
7848,1,Entropy,LogisticRegression,8,48,0.4667,0.5144,0.49,28,42120,39756,32


In [150]:
# Five random ensemble rows (unseeded, so the selection changes on every run).
ensembles.sample(5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,TPR,TNR,GMean,TP,TN,FP,FN
420,1,Entropy,KNeighborsClassifier,12,25,0.8,0.3735,0.5466,48,30583,51293,12
188,1,Wilcoxon,LGBMClassifier,10,25,0.4333,0.6066,0.5127,26,49668,32208,34
324,1,Ttest,LogisticRegression,2,25,0.6667,0.3333,0.4714,40,27292,54584,20
2,1,Entropy,RandomForestClassifier,2,75,0.4833,0.5771,0.5282,29,47254,34622,31
495,1,Wilcoxon,KNeighborsClassifier,12,100,0.7167,0.3766,0.5195,43,30838,51038,17


In [151]:
# Five random mean rows; `Num Bags` is always NaN here because the parser never fills it.
means.sample(5)

Unnamed: 0,Run,FeatureRanker,Classifier,NumFeatures,Num Bags,Mean TPR,Mean TNR,Mean GMean
333,1,Ttest,LogisticRegression,6,,0.4377,0.5307,0.4751
375,1,Wilcoxon,LogisticRegression,2,,0.665,0.335,0.4704
384,1,Wilcoxon,LogisticRegression,8,,0.5533,0.4403,0.4909
304,1,Entropy,LogisticRegression,4,,0.5453,0.4376,0.4853
9,1,Entropy,RandomForestClassifier,6,,0.0937,0.9163,0.2836
