In [4]:
import os.path
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import pandas as pd
from pandarallel import pandarallel
from tqdm.auto import tqdm, trange
from sklearn.metrics import auc as auc_fn
from sklearn.linear_model import LinearRegression

from utilities import deserialize_xgboost_classifier_and_get_importance_gain

Constants.

In [6]:
# Folder layout, expressed relative to the notebook's working directory.
ROOT = "."
OUT_FOLDER = f"{ROOT}/results"
DATA_FOLDER = f"{ROOT}/data"
# Spreadsheet path inside the data folder.
FILE = f'{DATA_FOLDER}/Supp. File 1.xlsx'
# NOTE(review): not referenced by the cells visible here; -1 presumably
# requests "use all cores" from whichever library consumes it -- confirm.
N_JOBS = -1

Read results from the custom oversampling.

In [None]:
# The raw oversampling results are only loaded when the cached
# fold-aggregated feature-importance table is not yet on disk.
if not os.path.exists(
    f'{DATA_FOLDER}/DILI_complete_oversampling_analysis2024_feature_importance_avg_gain_norm_across_folds.feather'
):
    data = pd.read_feather(
        f'{DATA_FOLDER}/DILI_complete_oversampling_analysis2024.feather'
    )

Extract feature importances from serialized models.

In [None]:
# Skipped entirely when the cached fold-aggregated table already exists.
if not os.path.exists(
    f'{DATA_FOLDER}/DILI_complete_oversampling_analysis2024_feature_importance_avg_gain_norm_across_folds.feather'
):
    pandarallel.initialize(progress_bar=True, verbose=False)
    # One entry per serialized model, computed in parallel by the project helper.
    gain_records = data['model'].parallel_apply(deserialize_xgboost_classifier_and_get_importance_gain)
    important_features = pd.DataFrame.from_dict(gain_records.tolist())
    # Prepend the columns identifying each condition, in this fixed order,
    # so that the feature columns start at position 4.
    for meta_position, meta_column in enumerate(['dataset', 'descriptors', 'num_stds', 'samples added']):
        important_features.insert(
            meta_position,
            meta_column,
            data[meta_column].reset_index(drop=True),
            allow_duplicates=True,
        )

In [None]:
# Skipped entirely when the cached fold-aggregated table already exists.
if not os.path.exists(f'{DATA_FOLDER}/DILI_complete_oversampling_analysis2024_feature_importance_avg_gain_norm_across_folds.feather'):
    # Ensure total gain of all features for each condition = 1.
    # Columns 0-3 hold the condition identifiers; features start at column 4.
    # Rows are processed 1000 at a time so the progress bar can advance.
    for i in trange(0, len(important_features), 1000):
        chunk = important_features.iloc[i: i + 1000, 4:]
        important_features.iloc[i: i + 1000, 4:] = chunk.div(chunk.sum(axis=1), axis=0)

    # The per-fold table goes to the results folder, which nothing above
    # creates; make sure it exists so the write cannot fail on a fresh clone.
    os.makedirs(OUT_FOLDER, exist_ok=True)
    important_features.to_feather(f'{OUT_FOLDER}/DILI_complete_oversampling_analysis2024_feature_importance_avg_gain_norm.feather')
    # Sum across 5-folds
    important_features = important_features.groupby(['dataset', 'descriptors', 'num_stds', 'samples added'], as_index=False).sum()
    # Ensure total gain is normalized per condition (their sums are 1)
    summed_gains = important_features.iloc[:, 4:]
    important_features.iloc[:, 4:] = summed_gains.div(summed_gains.sum(axis=1), axis=0)
    # Save to disk.
    important_features.to_feather(f'{DATA_FOLDER}/DILI_complete_oversampling_analysis2024_feature_importance_avg_gain_norm_across_folds.feather')

In [7]:
# Load the cached, fold-aggregated importance table.
important_features = pd.read_feather(f'{DATA_FOLDER}/DILI_complete_oversampling_analysis2024_feature_importance_avg_gain_norm_across_folds.feather')
# Reshape to long format (one row per condition and feature) and keep only
# the rows whose value differs from 0 (NAs were summed to 0).
data = (
    important_features
    .melt(id_vars=['dataset', 'descriptors', 'num_stds', 'samples added'])
    .query('value != 0')
)

Separate features per type.

In [8]:
# Prefixes (text before the first underscore) that identify reporter features.
reporter_names = ['bip', 'btg2', 'chop', 'hmox1', 'hspa1b', 'icam1', 'p21', 'srxn1']
# Names ending in one or more `_<letter><digits><letter>` groups, the letters
# being restricted to the 20 listed (presumably amino-acid substitutions --
# confirm). Written as a raw string so `\d` is not parsed as an (invalid)
# Python string escape.
substitution_pattern = r'^.+(?:_[ACDEFGHIKLMNPQRSTVWY]\d+[ACDEFGHIKLMNPQRSTVWY])+$'

# Row-wise properties of the feature names, computed once and reused below.
feature_names = data.variable
name_parts = feature_names.str.split('_')
is_reporter = name_parts.str[0].isin(reporter_names)
is_fingerprint = feature_names.str.startswith('ECFP_6')
is_wild_type = feature_names.str.endswith('_WT')
has_substitution_suffix = feature_names.str.contains(substitution_pattern)


def _unique_variables(mask):
    """Return the distinct feature names of the rows of `data` selected by `mask`."""
    return data[mask].variable.drop_duplicates()


# Broad feature types.
bio_descs = _unique_variables(is_reporter)
chem_descs = _unique_variables(~is_fingerprint & ~is_reporter & ~is_wild_type & ~has_substitution_suffix)
chemfp_descs = _unique_variables(is_fingerprint)
allchem_descs = pd.concat((chem_descs, chemfp_descs)).drop_duplicates()
biospectra_descs = _unique_variables(is_wild_type & ~has_substitution_suffix)

# Per reporter (matched on the start of the name).
bip_descs = _unique_variables(feature_names.str.startswith('bip'))
btg2_descs = _unique_variables(feature_names.str.startswith('btg2'))
chop_descs = _unique_variables(feature_names.str.startswith('chop'))
hmox1_descs = _unique_variables(feature_names.str.startswith('hmox1'))
hspa1b_descs = _unique_variables(feature_names.str.startswith('hspa1b'))
icam1_descs = _unique_variables(feature_names.str.startswith('icam1'))
p21_descs = _unique_variables(feature_names.str.startswith('p21'))
srxn1_descs = _unique_variables(feature_names.str.startswith('srxn1'))

# Per concentration (second underscore-separated token).
cmax1_descs = _unique_variables(name_parts.str[1] == 'cmax1')
cmax5_descs = _unique_variables(name_parts.str[1] == 'cmax5')
cmax10_descs = _unique_variables(name_parts.str[1] == 'cmax10')
cmax25_descs = _unique_variables(name_parts.str[1] == 'cmax25')
cmax50_descs = _unique_variables(name_parts.str[1] == 'cmax50')
cmax100_descs = _unique_variables(name_parts.str[1] == 'cmax100')

# Per time point (third underscore-separated token).
tp24 = _unique_variables(name_parts.str[2] == 'tp24')
tp48 = _unique_variables(name_parts.str[2] == 'tp48')
tp72 = _unique_variables(name_parts.str[2] == 'tp72')

Obtain the feature importance per feature type.

In [9]:
# (label, feature names) pairs; every label becomes one column of `table1`.
features = [('bio', bio_descs), ('allchem', allchem_descs), ('chem', chem_descs), ('chemfp', chemfp_descs), ('biospectra', biospectra_descs),
            ('bip', bip_descs), ('btg2', btg2_descs), ('chop', chop_descs), ('hmox1', hmox1_descs), ('hspa1b', hspa1b_descs), ('icam1', icam1_descs), ('p21', p21_descs), ('srxn1', srxn1_descs),
            ('cmax1', cmax1_descs), ('cmax5', cmax5_descs), ('cmax10', cmax10_descs), ('cmax25', cmax25_descs), ('cmax50', cmax50_descs), ('cmax100', cmax100_descs),
            ('24h', tp24), ('48h', tp48), ('72h', tp72)]

# One single-row frame per (dataset, descriptors, num_stds) condition holding,
# for every feature group, the area under its summed-importance curve.
table1 = []
for dataset in tqdm(['All', 'GFP', 'AnV', 'PI' , 'AnV-PI', 'GFP-AnV-PI',
                     'IntegratedIntensity', 'MeanIntensity', 'NormCounts']):
    for descriptors in ['bio', 'biochem', 'biospectra']:
        for num_stds in range(1, 6):
            x = data.query(f'dataset == "{dataset}" and descriptors == "{descriptors}" and num_stds == {num_stds}', engine='python')
            row = {'dataset': dataset, 'descriptors': descriptors, 'num_stds': num_stds}
            for feature_name, feature_list in features:
                # Total importance of the group at each number of added samples.
                # Only `value` is summed, so the string columns are not
                # needlessly aggregated as well.
                values = (x[x['variable'].isin(feature_list)]
                          .groupby('samples added', as_index=False)['value']
                          .sum())
                if len(values) < 2:
                    # An area needs at least two points (same guard as the
                    # per-feature tables below).
                    row[feature_name] = None
                else:
                    # x axis is divided by 250 -- presumably the largest number
                    # of added samples, so the axis spans [0, 1]; TODO confirm.
                    row[feature_name] = auc_fn(values['samples added'] / 250, values['value'])
            table1.append(pd.DataFrame([row]))

table1 = pd.concat(table1)

  0%|          | 0/9 [00:00<?, ?it/s]

Obtain Figure 4A, B, and C.

In [10]:
# Baseline view (dataset "All", num_stds == 1): one row per feature group,
# one column per descriptor set, rounded to 4 decimals.
(
    table1
    .query('dataset == "All" and num_stds == 1')
    .drop(columns=['dataset', 'num_stds'])
    .groupby('descriptors')
    .agg('mean')
    .round(4)
    .T
)

descriptors,bio,biochem,biospectra
bio,1.0,0.9859,0.8531
allchem,,0.0141,0.0042
chem,,0.0053,0.0009
chemfp,,0.0088,0.0034
biospectra,,,0.1425
bip,0.0896,0.0892,0.0701
btg2,0.1146,0.1099,0.0975
chop,0.1212,0.12,0.1115
hmox1,0.1116,0.1123,0.095
hspa1b,0.0717,0.0686,0.0569


Obtain the first part of Figure 4D.

In [11]:
# One single-row frame per (descriptors, num_stds) condition of the "All"
# dataset holding the area under the importance curve of each feature in
# `chem_descs`.
table3 = []
dataset  = 'All'
for descriptors in tqdm(['bio', 'biochem', 'biospectra']):
    for num_stds in range(1, 6):
        x = data.query(f'dataset == "{dataset}" and descriptors == "{descriptors}" and num_stds == {num_stds}', engine='python')
        row = {'dataset': dataset, 'descriptors': descriptors, 'num_stds': num_stds}
        for feature_name in chem_descs:
            # Boolean mask instead of a formatted query string: no expression
            # parsing per feature and no breakage on names containing quotes.
            values = x.loc[x['variable'] == feature_name, ['samples added', 'value']]
            if len(values) < 2:
                # An area needs at least two points.
                row[feature_name] = None
            else:
                # x axis is divided by 250 -- presumably the largest number of
                # added samples; TODO confirm.
                row[feature_name] = auc_fn(values['samples added'] / 250, values['value'])
        table3.append(pd.DataFrame([row]))

table3 = pd.concat(table3)

  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Baseline view (dataset "All", num_stds == 1): one row per feature, ordered
# by decreasing value in the "biochem" column.
(
    table3
    .query('dataset == "All" and num_stds == 1')
    .drop(columns=['dataset', 'num_stds'])
    .groupby('descriptors')
    .agg('mean')
    .round(4)
    .T
    .sort_values('biochem', ascending=False)
)

descriptors,bio,biochem,biospectra
ES_Sum_sOH,,0.0015,0.0001
ALogP,,0.0011,0.0002
ES_Sum_sssCH,,0.0009,0.0001
ES_Sum_dssC,,0.0005,
ES_Sum_ssCH2,,0.0004,0.0
CHI_V_3_C,,0.0004,
ES_Sum_ddsN,,0.0004,0.0003
ES_Sum_sssN,,0.0002,
H_Count,,0.0002,
ES_Sum_dO,,0.0002,0.0


Obtain the second part of Figure 4D.

In [13]:
# One single-row frame per (descriptors, num_stds) condition of the "All"
# dataset holding the area under the importance curve of each feature in
# `chemfp_descs`.
table5 = []
dataset  = 'All'
for descriptors in tqdm(['bio', 'biochem', 'biospectra']):
    for num_stds in range(1, 6):
        x = data.query(f'dataset == "{dataset}" and descriptors == "{descriptors}" and num_stds == {num_stds}', engine='python')
        row = {'dataset': dataset, 'descriptors': descriptors, 'num_stds': num_stds}
        for feature_name in chemfp_descs:
            # Boolean mask instead of a formatted query string: no expression
            # parsing per feature and no breakage on names containing quotes.
            values = x.loc[x['variable'] == feature_name, ['samples added', 'value']]
            if len(values) < 2:
                # An area needs at least two points.
                row[feature_name] = None
            else:
                # x axis is divided by 250 -- presumably the largest number of
                # added samples; TODO confirm.
                row[feature_name] = auc_fn(values['samples added'] / 250, values['value'])
        table5.append(pd.DataFrame([row]))

table5 = pd.concat(table5)

  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# Baseline view (dataset "All", num_stds == 1): one row per feature, ordered
# by decreasing value in the "biochem" column.
(
    table5
    .query('dataset == "All" and num_stds == 1')
    .drop(columns=['dataset', 'num_stds'])
    .groupby('descriptors')
    .agg('mean')
    .round(4)
    .T
    .sort_values('biochem', ascending=False)
)

descriptors,bio,biochem,biospectra
ECFP_6_684,,0.0027,0.0009
ECFP_6_594,,0.0024,0.0004
ECFP_6_624,,0.0008,0.0006
ECFP_6_1584,,0.0008,0.0001
ECFP_6_299,,0.0005,
...,...,...,...
ECFP_6_1362,,,
ECFP_6_1663,,,
ECFP_6_168,,,
ECFP_6_283,,,


Obtain Figure 4E.

In [15]:
# Target annotations (tab-separated, xz-compressed).
proteins = pd.read_csv(
    f'{DATA_FOLDER}/05.7_combined_set_protein_targets.tsv.xz',
    sep='\t',
)
# Importance rows whose feature name is the id of a human target.
human_proteins = proteins[proteins['Organism'] == "Homo sapiens (Human)"]
human_data = data[data['variable'].isin(human_proteins.target_id)]
# Importance rows annotated with the organism and classification of the
# target they are named after (rows without a matching target are dropped).
data_organism = data.merge(
    proteins[['target_id', 'Organism', 'Classification']],
    left_on='variable',
    right_on='target_id',
)

In [16]:
# Target ids belonging to each organism. This does not depend on the
# condition, so it is computed once rather than inside the nested loops.
organism_targets = {
    organism: data_organism.loc[data_organism['Organism'] == organism, 'target_id']
    for organism in data_organism.Organism.unique()
}

# One single-row frame per (descriptors, num_stds) condition of the "All"
# dataset holding, per organism, the area under its summed-importance curve.
table11 = []
dataset  = 'All'
for descriptors in tqdm(['bio', 'biochem', 'biospectra']):
    for num_stds in range(1, 6):
        x = data_organism.query(f'dataset == "{dataset}" and descriptors == "{descriptors}" and num_stds == {num_stds}', engine='python')
        row = {'dataset': dataset, 'descriptors': descriptors, 'num_stds': num_stds}
        for organism, target_ids in organism_targets.items():
            # Total importance of the organism's targets at each number of
            # added samples (only `value` is summed).
            values = (x[x['variable'].isin(target_ids)]
                      .groupby('samples added', as_index=False)['value']
                      .sum())
            if len(values) < 2:
                # An area needs at least two points.
                row[organism] = None
            else:
                # x axis is divided by 250 -- presumably the largest number of
                # added samples; TODO confirm.
                row[organism] = auc_fn(values['samples added'] / 250, values['value'])
        table11.append(pd.DataFrame([row]))


table11 = pd.concat(table11)

  0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
# Baseline view (dataset "All", num_stds == 1): one row per organism, ordered
# by decreasing value in the "biospectra" column.
(
    table11
    .query('dataset == "All" and num_stds == 1')
    .drop(columns=['dataset', 'num_stds'])
    .groupby('descriptors')
    .agg('mean')
    .round(4)
    .T
    .sort_values('biospectra', ascending=False)
)

descriptors,bio,biochem,biospectra
Homo sapiens (Human),,,0.0607
Escherichia coli (strain K12),,,0.0188
Rattus norvegicus (Rat),,,0.0115
Mus musculus (Mouse),,,0.0112
Bacillus amyloliquefaciens (Bacillus velezensis),,,0.0052
...,...,...,...
Flaveria bidentis (Coastal plain yellowtops) (Ethulia bidentis),,,
Staphylococcus saprophyticus subsp. saprophyticus (strain ATCC 15305 / DSM 20229 / NCIMB 8711 / NCTC 7292 / S-41),,,
Norovirus Hu/GI/10360/2010/VNM,,,
Emericella nidulans (Aspergillus nidulans),,,


Obtain Figure 4F.

In [18]:
# One single-row frame per (descriptors, num_stds) condition of the "All"
# dataset holding the area under the importance curve of each feature
# present in `human_data`.
table9 = []
dataset  = 'All'
for descriptors in tqdm(['bio', 'biochem', 'biospectra']):
    for num_stds in range(1, 6):
        x = human_data.query(f'dataset == "{dataset}" and descriptors == "{descriptors}" and num_stds == {num_stds}', engine='python')
        row = {'dataset': dataset, 'descriptors': descriptors, 'num_stds': num_stds}
        for feature_name in human_data.variable.unique():
            # Boolean mask instead of a formatted query string: no expression
            # parsing per feature and no breakage on names containing quotes.
            values = x.loc[x['variable'] == feature_name, ['samples added', 'value']]
            if len(values) < 2:
                # An area needs at least two points.
                row[feature_name] = None
            else:
                # x axis is divided by 250 -- presumably the largest number of
                # added samples; TODO confirm.
                row[feature_name] = auc_fn(values['samples added'] / 250, values['value'])
        table9.append(pd.DataFrame([row]))

table9 = pd.concat(table9)

  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
# Baseline view (dataset "All", num_stds == 1): one row per feature, ordered
# by decreasing value in the "biospectra" column.
(
    table9
    .query('dataset == "All" and num_stds == 1')
    .drop(columns=['dataset', 'num_stds'])
    .groupby('descriptors')
    .agg('mean')
    .round(4)
    .T
    .sort_values('biospectra', ascending=False)
)

descriptors,bio,biochem,biospectra
Q9BTE7_WT,,,0.0029
Q6R6M4_WT,,,0.0027
Q01469_WT,,,0.0025
P62081_WT,,,0.0019
P54577_WT,,,0.0019
...,...,...,...
O75388_WT,,,
P07203_WT,,,
P16455_WT,,,
Q15843_WT,,,


Obtain Supp. Tables 2 to 6.

In [20]:
# Load the precomputed "number of samples to reach a given specificity" table
# from the results folder; it is not produced by the cells visible here.
ttperfs = pd.read_feather(fr'{OUT_FOLDER}/Analysis_DILI_num_samples_to_high_specificity.feather')

In [21]:
# Rows for num_stds == 1 (presumably Supp. Table 2 -- confirm the numbering).
ttperfs.query('num_stds == 1')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,baseline,#samples Spe > 0.70,perf at Spe > 0.70,#samples Spe > 0.80,perf at Spe > 0.80,#samples Spe > 0.90,perf at Spe > 0.90,#samples Spe > 0.95,perf at Spe > 0.95
dataset,num_stds,descriptor,metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
All,1,bio,MCC,0.062779,30,0.734304,40,0.792534,80,0.863169,110.0,0.896986
All,1,bio,BAcc,0.519000,30,0.838667,40,0.882990,80,0.929281,110.0,0.946933
All,1,bio,Sen,0.991333,30,0.968000,40,0.954667,80,0.944000,110.0,0.937333
All,1,bio,Spe,0.046667,30,0.709333,40,0.811313,80,0.914561,110.0,0.956533
All,1,biochem,MCC,0.048075,30,0.724414,40,0.790802,80,0.858846,110.0,0.905740
...,...,...,...,...,...,...,...,...,...,...,...,...
NumObjects,1,biochem,Spe,0.082222,30,0.722519,40,0.830909,60,0.913889,100.0,0.959275
NumObjects,1,biospectra,MCC,0.065587,60,0.597147,70,0.652399,90,0.745060,120.0,0.762189
NumObjects,1,biospectra,BAcc,0.520444,60,0.793250,70,0.824423,90,0.867834,120.0,0.867132
NumObjects,1,biospectra,Sen,0.992000,60,0.844667,70,0.826667,90,0.819333,120.0,0.774000


In [22]:
# Rows for num_stds == 2 (presumably Supp. Table 3 -- confirm the numbering).
ttperfs.query('num_stds == 2')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,baseline,#samples Spe > 0.70,perf at Spe > 0.70,#samples Spe > 0.80,perf at Spe > 0.80,#samples Spe > 0.90,perf at Spe > 0.90,#samples Spe > 0.95,perf at Spe > 0.95
dataset,num_stds,descriptor,metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
All,2,bio,MCC,0.062779,40,0.764027,50,0.803930,70,0.866073,100.0,0.910876
All,2,bio,BAcc,0.519000,40,0.861626,50,0.892659,70,0.929510,100.0,0.954244
All,2,bio,Sen,0.991333,40,0.961333,50,0.950667,70,0.956667,100.0,0.949333
All,2,bio,Spe,0.046667,40,0.761919,50,0.834652,70,0.902353,100.0,0.959155
All,2,biochem,MCC,0.048075,40,0.756279,50,0.810971,70,0.862942,100.0,0.904814
...,...,...,...,...,...,...,...,...,...,...,...,...
NumObjects,2,biochem,Spe,0.074444,30,0.703704,50,0.865275,70,0.910850,110.0,0.968923
NumObjects,2,biospectra,MCC,0.062884,60,0.616787,70,0.645934,90,0.756264,120.0,0.781161
NumObjects,2,biospectra,BAcc,0.519111,60,0.803472,70,0.819932,90,0.873511,120.0,0.877055
NumObjects,2,biospectra,Sen,0.992667,60,0.850667,70,0.822000,90,0.822000,120.0,0.790000


In [23]:
# Rows for num_stds == 3 (presumably Supp. Table 4 -- confirm the numbering).
ttperfs.query('num_stds == 3')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,baseline,#samples Spe > 0.70,perf at Spe > 0.70,#samples Spe > 0.80,perf at Spe > 0.80,#samples Spe > 0.90,perf at Spe > 0.90,#samples Spe > 0.95,perf at Spe > 0.95
dataset,num_stds,descriptor,metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
All,3,bio,MCC,0.062779,40,0.756405,60,0.812878,80,0.870406,100.0,0.902910
All,3,bio,BAcc,0.519000,40,0.855909,60,0.898472,80,0.932860,100.0,0.950222
All,3,bio,Sen,0.991333,40,0.966667,60,0.952000,80,0.954000,100.0,0.949333
All,3,bio,Spe,0.046667,40,0.745152,60,0.844944,80,0.911719,100.0,0.951111
All,3,biochem,MCC,0.048075,40,0.741977,60,0.821743,80,0.869050,100.0,0.904439
...,...,...,...,...,...,...,...,...,...,...,...,...
NumObjects,3,biochem,Spe,0.112222,40,0.757677,60,0.858000,70,0.905316,110.0,0.955508
NumObjects,3,biospectra,MCC,0.078894,60,0.638946,70,0.677174,100,0.754464,140.0,0.807737
NumObjects,3,biospectra,BAcc,0.525000,60,0.814139,70,0.836020,100,0.871348,140.0,0.889026
NumObjects,3,biospectra,Sen,0.990000,60,0.863333,70,0.840667,100,0.820667,140.0,0.806667


In [24]:
# Rows for num_stds == 4 (presumably Supp. Table 5 -- confirm the numbering).
ttperfs.query('num_stds == 4')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,baseline,#samples Spe > 0.70,perf at Spe > 0.70,#samples Spe > 0.80,perf at Spe > 0.80,#samples Spe > 0.90,perf at Spe > 0.90,#samples Spe > 0.95,perf at Spe > 0.95
dataset,num_stds,descriptor,metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
All,4,bio,MCC,0.062779,40,0.740390,60,0.801149,90,0.857274,110.0,0.898812
All,4,bio,BAcc,0.519000,40,0.846081,60,0.892111,90,0.926616,110.0,0.948733
All,4,bio,Sen,0.991333,40,0.966000,60,0.950667,90,0.950000,110.0,0.947333
All,4,bio,Spe,0.046667,40,0.726162,60,0.833556,90,0.903232,110.0,0.950133
All,4,biochem,MCC,0.048075,40,0.721714,60,0.805938,80,0.857681,110.0,0.902998
...,...,...,...,...,...,...,...,...,...,...,...,...
NumObjects,4,biochem,Spe,0.074444,40,0.746162,50,0.801905,90,0.910505,120.0,0.950564
NumObjects,4,biospectra,MCC,0.074795,50,0.639147,70,0.697724,100,0.764787,140.0,0.804896
NumObjects,4,biospectra,BAcc,0.522222,50,0.808256,70,0.846416,100,0.879205,140.0,0.891198
NumObjects,4,biospectra,Sen,0.986667,50,0.896000,70,0.856667,100,0.848000,140.0,0.824667


In [25]:
# Rows for num_stds == 5 (presumably Supp. Table 6 -- confirm the numbering).
ttperfs.query('num_stds == 5')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,baseline,#samples Spe > 0.70,perf at Spe > 0.70,#samples Spe > 0.80,perf at Spe > 0.80,#samples Spe > 0.90,perf at Spe > 0.90,#samples Spe > 0.95,perf at Spe > 0.95
dataset,num_stds,descriptor,metric,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
All,5,bio,MCC,0.062779,40,0.737053,60,0.792553,100,0.892916,130.0,0.915869
All,5,bio,BAcc,0.519000,40,0.843970,60,0.886556,100,0.945539,130.0,0.957096
All,5,bio,Sen,0.991333,40,0.964000,60,0.952667,100,0.960667,130.0,0.948000
All,5,bio,Spe,0.046667,40,0.723939,60,0.820444,100,0.930411,130.0,0.966192
All,5,biochem,MCC,0.048075,40,0.736424,60,0.786188,100,0.883841,120.0,0.901973
...,...,...,...,...,...,...,...,...,...,...,...,...
NumObjects,5,biochem,Spe,0.063333,40,0.746061,60,0.818722,100,0.915024,140.0,0.961129
NumObjects,5,biospectra,MCC,0.048446,50,0.613655,70,0.693431,110,0.761733,160.0,0.822556
NumObjects,5,biospectra,BAcc,0.514556,50,0.797875,70,0.842582,110,0.876821,160.0,0.897757
NumObjects,5,biospectra,Sen,0.984667,50,0.880000,70,0.876667,110,0.849333,160.0,0.827333


Obtain Supp. Tables 7 to 10.

In [26]:
# Reporter features of the baseline condition ("All" dataset, "bio"
# descriptors, num_stds == 1).
bio_rows = data.query('num_stds == 1 and descriptors == "bio" and dataset == "All" and variable in @bio_descs')
# A category is the first three underscore-separated tokens of a feature name.
category_labels = bio_rows.variable.str.split('_').str[:3].str.join('_')
# Total importance per category at each number of added samples.
importances = (
    bio_rows
    .assign(category=category_labels)
    .groupby(['category', 'samples added'])
    .value
    .sum()
)

# Area under each category's importance curve. The x axis is divided by 250
# -- presumably the largest number of added samples; TODO confirm.
importances_ = [
    pd.DataFrame([{'category': name,
                   'value': auc_fn(group['samples added'] / 250, group['value'])}])
    for name, group in importances.reset_index().groupby('category')
]

importances = pd.concat(importances_, ignore_index=True).set_index('category')
# Rescale so that the areas of all categories add up to 1.
importances = importances / importances.value.sum()

# Split the category label back into its three tokens.
importances = importances.reset_index()
category_tokens = importances.category.str.split('_')
importances = importances.assign(
    reporter=category_tokens.str[0],
    concentration=category_tokens.str[1],
    timepoint=category_tokens.str[2],
).drop(columns='category')

# Reporter x time point rows, one column per concentration, as percentages.
importances = importances.pivot(index=['reporter', 'timepoint'], columns='concentration', values='value') * 100
importances

Unnamed: 0_level_0,concentration,cmax1,cmax10,cmax100,cmax25,cmax5,cmax50
reporter,timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bip,tp24,0.705715,0.345843,0.216903,0.112859,0.417224,0.102223
bip,tp48,0.324726,1.654604,0.125684,0.144477,0.307734,0.159356
bip,tp72,0.258872,0.290374,2.862382,0.150929,0.431828,0.376236
btg2,tp24,1.199834,0.540015,0.134655,0.115066,2.773389,0.745783
btg2,tp48,0.19932,0.843821,0.092102,0.241312,2.678558,0.181152
btg2,tp72,0.813219,0.189861,0.11947,0.058446,0.410295,0.105886
chop,tp24,0.312875,0.223616,1.524237,0.523871,0.239511,0.298709
chop,tp48,0.096797,0.099977,1.867967,0.217607,0.487782,2.375727
chop,tp72,0.067532,0.673431,1.079659,0.444545,1.500177,0.06818
hmox1,tp24,0.178511,1.141315,1.034693,0.140493,0.071116,0.114615


Supp. Table 7.

In [27]:
# Total importance per reporter: drop the time point level, sum the rows of
# each reporter, then sum across the concentration columns.
(
    importances
    .reset_index(level=1)
    .iloc[:, 1:]
    .reset_index()
    .groupby('reporter')
    .agg('sum')
    .sum(axis=1)
    .rename('total feature importance (%)')
    .to_frame()
)

Unnamed: 0_level_0,total feature importance (%)
reporter,Unnamed: 1_level_1
bip,8.987968
btg2,11.442186
chop,12.102201
hmox1,11.187872
hspa1b,7.251298
icam1,12.602821
p21,16.120295
srxn1,20.305358


Supp. Table 8.

In [28]:
# Importance per reporter and time point: sum across the concentration
# columns, then spread the time points into columns.
# The label previously read 'importnace' (typo in the displayed header).
(
    importances
    .sum(axis=1)
    .rename('importance')
    .reset_index()
    .pivot(index='reporter', columns='timepoint')
)

Unnamed: 0_level_0,importnace,importnace,importnace
timepoint,tp24,tp48,tp72
reporter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
bip,1.900767,2.716581,4.370621
btg2,5.508742,4.236266,1.697178
chop,3.122819,5.145857,3.833525
hmox1,2.680743,5.8243,2.682829
hspa1b,3.62824,2.452633,1.170425
icam1,,3.750881,8.85194
p21,6.468058,2.549938,7.102299
srxn1,11.003968,5.08777,4.21362


Supp. Table 9.

In [29]:
# Importance per reporter and concentration: drop the time point level and
# sum the rows belonging to each reporter.
(
    importances
    .reset_index(level=1)
    .iloc[:, 1:]
    .reset_index()
    .groupby('reporter')
    .agg('sum')
)

concentration,cmax1,cmax10,cmax100,cmax25,cmax5,cmax50
reporter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bip,1.289312,2.290821,3.204969,0.408265,1.156786,0.637815
btg2,2.212373,1.573697,0.346227,0.414824,5.862242,1.032822
chop,0.477204,0.997025,4.471863,1.186023,2.22747,2.742616
hmox1,1.431145,1.343439,2.377592,0.322261,5.471333,0.242102
hspa1b,0.385715,1.152988,3.21282,0.701271,1.130822,0.667681
icam1,1.029028,5.818796,1.160164,0.623869,3.49085,0.480114
p21,1.345824,4.420938,3.847407,0.391818,3.74724,2.367069
srxn1,0.721298,2.046842,3.554484,1.464885,0.720089,11.797759


Obtain Supp. Table 11.

In [30]:
# Importances per marker type, for the baseline condition ("All" dataset,
# "bio" descriptors, num_stds == 1). The marker type is the last
# underscore-separated token of the feature name.
marker_rows = data.query('num_stds == 1 and descriptors == "bio" and dataset == "All" and variable in @bio_descs')
marker_labels = marker_rows.variable.str.split('_').str[-1]
# Total importance per marker type at each number of added samples.
marker_importances = (
    marker_rows
    .assign(category=marker_labels)
    .groupby(['category', 'samples added'])
    .value
    .sum()
)

# Area under each marker type's importance curve. The x axis is divided by
# 250 -- presumably the largest number of added samples; TODO confirm.
importances_ = [
    pd.DataFrame([{'category': name,
                   'value': auc_fn(group['samples added'] / 250, group['value'])}])
    for name, group in marker_importances.reset_index().groupby('category')
]

marker_importances = pd.concat(importances_, ignore_index=True).set_index('category')
# Rescale so that the areas of all marker types add up to 1.
marker_importances = marker_importances / marker_importances.value.sum()

marker_importances = marker_importances.reset_index()
marker_importances

Unnamed: 0,category,value
0,AnnexinPos,0.17425
1,CytoplasmNucleiIntegratedIntensityImageGfp,0.059486
2,CytoplasmNucleiMeanIntensityImageGfp,0.073349
3,DmsoIntegratedIntensityPlateLognorm,0.034517
4,DmsoMeanCellCounts,0.034757
5,GfpDiff1i,0.010474
6,GfpDiff2i,0.005004
7,GfpDiff3i,0.001908
8,GfpDiff4i,0.005373
9,GfpNeg1i,0.00864
