In [1]:
import os
import sys
sys.path.append("../")

import dataInterpreter as dt
import enrichmentAnalysis as ea
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.ensemble import RandomForestClassifier


# Directory prefix of the input tables. Set the LATECOVID_DATA_DIR environment
# variable to point at another location; the value is concatenated directly
# with file names, so it must end with a path separator. Without the variable
# the original location is used.
path = os.environ.get("LATECOVID_DATA_DIR", "C:\\Users\\Pedro\\Documents\\BicPAMS\\bicpams_5.1\\data\\latecovid\\")

In [2]:
# Read both tab-separated tables (first column as index) and transpose them;
# the first file feeds filtered_data_01, the second filtered_data_05.
filtered_data_01, filtered_data_05 = (
    pd.read_csv(path + file_name, index_col = 0, sep = '\t').T
    for file_name in ('data-p01.csv', 'data-p05.csv')
)

In [3]:
# Give every (cell type, condition) pair its own integer label, numbered in
# order of first appearance among the rows of filtered_data_01, and record the
# label of each row in `y`.
classes = {'NHBE': {}, 'A549': {}, 'Calu3': {}, 'Biopsy': {}}
i = 0  # next unused label; equals the number of labels once the loop ends
y = []
for c in filtered_data_01.index:
    info = dt.get_info_from_name(c)
    # setdefault: a cell type missing from the seed dict gets its own entry
    # instead of raising KeyError.
    conditions = classes.setdefault(info['Cell Type'], {})
    if info['Condition'] not in conditions:
        conditions[info['Condition']] = i
        i += 1
    y.append(conditions[info['Condition']])

# Display name of each label, sized from the labels actually assigned rather
# than a hard-coded count.
y_names = [' '] * i
for c_type, conditions in classes.items():
    for cond, label in conditions.items():
        y_names[label] = c_type + ' ' + cond


In [9]:
# Keyword arguments shared by every RandomForestClassifier built from this cell on.
parameters = dict(
    random_state = 42,
    n_estimators = 400,
    criterion = "gini",
    max_depth = None,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = "sqrt",
)

# Evaluate the forest on every sample of filtered_data_01 against the labels
# in y. NOTE(review): apply_loocv is presumably leave-one-out cross-validation;
# the displayed result holds 'accuracy' and 'importances'.
stats_NHBE = dt.apply_loocv(
    filtered_data_01.to_numpy(),
    np.asarray(y),
    RandomForestClassifier(**parameters),
)
stats_NHBE

{'accuracy': 0.7222222222222222,
 'importances': array([8.56441442e-04, 2.82466380e-04, 5.37398112e-04, 1.17349760e-03,
        8.49581017e-04, 1.14338058e-03, 2.77549308e-04, 2.18494621e-04,
        8.85642198e-04, 8.90924907e-04, 4.47734496e-04, 3.50338507e-04,
        5.21170945e-03, 2.29875147e-03, 1.33301720e-03, 5.31363548e-04,
        2.55205488e-04, 5.80701471e-04, 2.11764536e-03, 9.05716773e-04,
        5.26974526e-04, 6.98323077e-04, 2.96946668e-03, 1.10703752e-03,
        3.70697181e-04, 8.95482998e-04, 2.60075461e-04, 9.11706735e-04,
        1.10385131e-03, 5.35236429e-03, 8.48820643e-03, 3.66969815e-04,
        3.19958397e-03, 3.95027133e-03, 1.29912698e-03, 1.61337270e-03,
        3.73599233e-03, 7.70866787e-04, 1.56781591e-03, 6.42450276e-04,
        6.83624958e-04, 6.66555776e-04, 9.15926153e-04, 7.69489935e-03,
        9.36740418e-04, 6.85645790e-04, 1.59841592e-03, 1.33163213e-03,
        2.44687017e-03, 3.52125392e-04, 3.95289543e-04, 1.58239412e-03,
        7.598058

In [10]:
# Fit one forest on all samples, then list (importance, gene) pairs from the
# most to the least important feature.
clf = RandomForestClassifier(**parameters)
clf.fit(filtered_data_01, y)

sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True)

[(0.00960148744377525, 'IFI44'),
 (0.00926479270765407, 'MX1'),
 (0.008547204040578417, 'IFI16'),
 (0.00829574814081365, 'SAMD9L'),
 (0.007661586861898575, 'CASP1'),
 (0.007660076919951023, 'IL1A'),
 (0.006973063890350157, 'PPM1K'),
 (0.006655079301900687, 'IFIT3'),
 (0.006329698717123201, 'IRF7'),
 (0.006162682740944891, 'SPTBN4'),
 (0.005685016251784805, 'HERC5'),
 (0.005586416672388264, 'CD274'),
 (0.00553461816986264, 'ADD2'),
 (0.005515853481909572, 'EBI3'),
 (0.005294724380188274, 'DDX58'),
 (0.005229563651304597, 'SAMD9'),
 (0.005135989232412648, 'OAS2'),
 (0.004964098933560959, 'PTPRE'),
 (0.0047357610351874205, 'PARP10'),
 (0.004679902874974196, 'LAMP3'),
 (0.004634312684384535, 'AK5'),
 (0.004508247898711002, 'OASL'),
 (0.004443974810791738, 'ADAMTS16'),
 (0.004333029106508079, 'IFIH1'),
 (0.004293795085819635, 'CXCL5'),
 (0.004248002610536502, 'ZNF28'),
 (0.0041702376747888995, 'SP110'),
 (0.00396576073310437, 'HBEGF'),
 (0.0038628836831076257, 'THEMIS2'),
 (0.00385554057708

In [12]:
# Genes whose importance in the fitted forest exceeds 0.001, most important first.
selected_genes = [
    gene
    for importance, gene in sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True)
    if importance > 0.001
]
selected_genes

['IFI44',
 'MX1',
 'IFI16',
 'SAMD9L',
 'CASP1',
 'IL1A',
 'PPM1K',
 'IFIT3',
 'IRF7',
 'SPTBN4',
 'HERC5',
 'CD274',
 'ADD2',
 'EBI3',
 'DDX58',
 'SAMD9',
 'OAS2',
 'PTPRE',
 'PARP10',
 'LAMP3',
 'AK5',
 'OASL',
 'ADAMTS16',
 'IFIH1',
 'CXCL5',
 'ZNF28',
 'SP110',
 'HBEGF',
 'THEMIS2',
 'REEP2',
 'IL1B',
 'FAM131B',
 'ANKRD26P1',
 'CTPS1',
 'CXCL2',
 'RGS11',
 'IL6',
 'CCL5',
 'TRANK1',
 'SAA1',
 'PDZK1IP1',
 'FRRS1',
 'CFB',
 'HCN4',
 'BCL2A1',
 'LCP1',
 'TRIM16L',
 'ICAM1',
 'CDCP1',
 'IL7',
 'CYS1',
 'IFIT2',
 'CELF5',
 'IFITM1',
 'GJB2',
 'KLF2',
 'TYMP',
 'HCAR3',
 'BMPER',
 'XAF1',
 'BIRC3',
 'THBD',
 'PMAIP1',
 'ANO9',
 'HSD17B14',
 'EFEMP1',
 'HLA-B',
 'LOC100507053',
 'ADRB2',
 'HLA-F',
 'DDX60',
 'TNFAIP2',
 'VAV1',
 'GFPT2',
 'NALCN',
 'NRTN',
 'CDH3',
 'SDC4',
 'LAMC2',
 'PITPNA-AS1',
 'GREM1',
 'LOC100268168',
 'SLCO4C1',
 'CSGALNACT1',
 'STAT1',
 'PLSCR1',
 'ITGA7',
 'TUBB2B',
 'SOCS3',
 'ARHGEF4',
 'SCG5',
 'HTR7',
 'UBE2L6',
 'HSPA12A',
 'SYNGR3',
 'C1R',
 'WDPCP',
 'M

In [2]:
# Number of genes kept by the importance filter. Counted from the variable
# itself so the figure cannot drift from the actual selection.
len(selected_genes)

356

In [13]:
# Enrichment of the selected genes; the library name is both the query
# argument and the key under which the term list is returned.
gene_set_library = 'GO_Biological_Process_2021'
results = ea.getEnrichment(selected_genes, gene_set_library)[gene_set_library]
results

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'cytokine-mediated signaling pathway (GO:0019221)',
  3.375087819316678e-27,
  6.8951438069085125,
  420.28229418333177,
  ['CXCL6',
   'IFITM1',
   'CD40',
   'CSF2',
   'IFIT5',
   'CXCL1',
   'IL1RAP',
   'CXCL3',
   'TNF',
   'CXCL2',
   'IFIT3',
   'CXCL5',
   'IFIT2',
   'ICAM1',
   'OASL',
   'MT2A',
   'CASP1',
   'JAK3',
   'IL15RA',
   'HLA-B',
   'HLA-C',
   'HLA-F',
   'VAV1',
   'EREG',
   'IL1A',
   'IL23A',
   'OAS2',
   'IL1B',
   'IRF1',
   'IRF7',
   'LCP1',
   'LTB',
   'IRF9',
   'BIRC3',
   'CTF1',
   'EBI3',
   'PTAFR',
   'SOCS2',
   'SOCS3',
   'UGCG',
   'IRAK2',
   'CCL5',
   'STAT4',
   'CCL2',
   'GBP2',
   'GBP1',
   'STAT5A',
   'IL32',
   'CCL20',
   'STAT1',
   'STAT2',
   'MX1',
   'ISG20',
   'NFKBIA',
   'IL6',
   'IL7',
   'SAA1',
   'LCN2',
   'XAF1',
   'IL7R'],
  7.668199525487492e-24,
  0,
  0],
 [2,
  'cellular response to type I interferon (GO:0071357)',
  4.8794249612679427e-17,
  22.204834445423643,
  833.9895828690367,
  ['IFITM1',
  

In [14]:
import json

# Cache the enrichment results on disk so they can be reloaded with
# `json.load` without repeating the enrichment query. The `with` block needs a
# real statement as its body: a comment alone is a syntax error, and opening
# the file in 'w' mode without writing would leave an empty, unloadable file.
with open('results_RandomForest.json', 'w') as file:
    json.dump(results, file)

In [23]:
import json

# Reload the cached enrichment results from disk.
with open('results_RandomForest.json') as file:
    results = json.loads(file.read())

In [15]:
# One row per enriched term, indexed by term name (field 1), holding the
# adjusted p-value (field 6) and the combined score (field 4).
index = [term[1] for term in results]
dataset = {
    'p-value': [term[6] for term in results],
    'Score': [term[4] for term in results],
}
enrichment_dataset = pd.DataFrame(dataset, index = index)

In [16]:
pd.set_option("display.max_rows", None)

# Terms with adjusted p-value below 0.01: the 25 with the highest combined score.
selection = (
    enrichment_dataset
    .loc[enrichment_dataset['p-value'] < 0.01]
    .sort_values('Score', ascending = False)
    .head(25)
)

# Turn both columns into display strings (scientific notation / two decimals).
selection['p-value'] = selection['p-value'].map('{:.2E}'.format)
selection['Score'] = selection['Score'].map('{:.2f}'.format)

# Export currently disabled:
#selection.to_csv('RandomForest_table.csv')
selection

Unnamed: 0,p-value,Score
protein mono-ADP-ribosylation (GO:0140289),3.44e-06,980.5
type I interferon signaling pathway (GO:0060337),3.7e-14,833.99
cellular response to type I interferon (GO:0071357),3.7e-14,833.99
regulation of fever generation (GO:0031620),0.00202,819.46
positive regulation of glial cell proliferation (GO:0060252),0.00202,819.46
cytokine-mediated signaling pathway (GO:0019221),7.67e-24,420.28
interferon-gamma-mediated signaling pathway (GO:0060333),3.65e-09,372.42
negative regulation of viral genome replication (GO:0045071),3.35e-08,368.56
antigen processing and presentation of endogenous peptide antigen via MHC class I via ER pathway (GO:0002484),0.00502,358.52
"antigen processing and presentation of endogenous peptide antigen via MHC class I via ER pathway, TAP-independent (GO:0002486)",0.00502,358.52


### Two-class comparison

##### NHBE

In [5]:
# Sample columns of the two NHBE groups being compared.
cols_nhbe_healthy = dt.get_columns('NHBE', 'healthy')
cols_nhbe_cov2 = dt.get_columns('NHBE', 'sars-cov2')

data_nhbe = dt.get_data('NHBE', 'healthy', 'sars-cov2')

# Binary target: 0 per healthy column followed by 1 per SARS-CoV-2 column.
labels_nhbe = [0 for _ in cols_nhbe_healthy] + [1 for _ in cols_nhbe_cov2]

# NOTE(review): get_p_values presumably keeps the genes that pass a
# Mann-Whitney U test between the two groups - confirm in dataInterpreter.
# Its 'p-value' column is removed so only expression values remain.
filtered_data_NHBE = dt.get_p_values('mannwhitneyu', data_nhbe, cols_nhbe_healthy, cols_nhbe_cov2)
filtered_data_NHBE.drop(columns = ['p-value'], inplace = True)

# Number of rows left after filtering.
filtered_data_NHBE.shape[0]

         Series1_NHBE_Mock_1  Series1_NHBE_Mock_2  Series1_NHBE_Mock_3  \
SAMD11              2.484907             3.044522             2.197225   
RNF223              2.079442             1.609438             2.302585   
TNFRSF4             0.000000             0.000000             0.000000   
TAS1R3              2.197225             0.693147             1.609438   
CALML6              1.609438             1.945910             2.079442   
...                      ...                  ...                  ...   
PNMA3               0.000000             0.000000             0.000000   
ZNF275              4.553877             4.477337             4.976734   
PDZD4               0.000000             0.000000             0.000000   
FAM50A              7.008505             6.966967             7.418781   
SPRY3               2.944439             2.302585             3.555348   

         Series9_NHBE_Mock_1  Series9_NHBE_Mock_2  Series9_NHBE_Mock_3  \
SAMD11              0.000000         

In [71]:
# Forest settings for the NHBE two-class model.
parameters = dict(
    random_state = 42,
    n_estimators = 200,
    criterion = "gini",
    max_depth = None,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = "sqrt",
)

# Evaluate the forest on the NHBE samples (transposed so each row is one
# sample) against the healthy/infected labels.
stats_NHBE = dt.apply_loocv(
    filtered_data_NHBE.T.to_numpy(),
    np.asarray(labels_nhbe),
    RandomForestClassifier(**parameters),
)
stats_NHBE

{'accuracy': 0.8,
 'importances': array([0.        , 0.0042495 , 0.        , 0.0062518 , 0.0011236 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.0042495 , 0.00527514, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00153846,
        0.        , 0.00317488, 0.        , 0.00051282, 0.        ,
        0.00527514, 0.        , 0.        , 0.0005618 , 0.0042495 ,
        0.        , 0.00471334, 0.        , 0.00358974, 0.        ,
        0.        , 0.0005618 , 0.        , 0.00476232, 0.        ,
        0.00224719, 0.        , 0.        , 0.00102564, 0.00471334,
        0.        , 0.        , 0.00578796, 0.        , 0.0005618 ,
        0.        , 0.        , 0.        , 0.        , 0.0005618 ,
        0.00051282, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.00527514, 0.        , 0.00051282, 0.00373668,
        0.0025641 , 0.00163642, 0.        , 0.0042495 , 0.00527514,
        0.00205

In [72]:
# Fit one forest on all NHBE samples, then list (importance, gene) pairs from
# the most to the least important feature.
nhbe_clf = RandomForestClassifier(**parameters)
nhbe_clf.fit(filtered_data_NHBE.T, labels_nhbe)

sorted(zip(nhbe_clf.feature_importances_, filtered_data_NHBE.index), reverse = True)

[(0.010362694300518135, 'WFDC5'),
 (0.010362694300518135, 'TNFAIP2'),
 (0.010362694300518135, 'TICAM1'),
 (0.010362694300518135, 'TAL2'),
 (0.010362694300518135, 'SNORD43'),
 (0.010362694300518135, 'SLC26A9'),
 (0.010362694300518135, 'RNF223'),
 (0.010362694300518135, 'OAS3'),
 (0.010362694300518135, 'MILR1'),
 (0.010362694300518135, 'MAPK3'),
 (0.010362694300518135, 'LGALS9'),
 (0.010362694300518135, 'KRTAP3-1'),
 (0.010362694300518135, 'KLF2'),
 (0.010362694300518135, 'KIAA1656'),
 (0.010362694300518135, 'IRF9'),
 (0.010362694300518135, 'GRIP2'),
 (0.010362694300518135, 'DTX2'),
 (0.010362694300518135, 'DACH2'),
 (0.010362694300518135, 'COL24A1'),
 (0.010362694300518135, 'CCL20'),
 (0.0051813471502590676, 'ZSWIM4'),
 (0.0051813471502590676, 'ZDHHC23'),
 (0.0051813471502590676, 'WTAPP1'),
 (0.0051813471502590676, 'VNN1'),
 (0.0051813471502590676, 'UCN'),
 (0.0051813471502590676, 'UBE2L6'),
 (0.0051813471502590676, 'TRIML2'),
 (0.0051813471502590676, 'TRIM14'),
 (0.0051813471502590676,

In [73]:
# Genes the NHBE forest actually used (importance above zero), most important first.
selected_genes_nhbe = [
    gene
    for importance, gene in sorted(zip(nhbe_clf.feature_importances_, filtered_data_NHBE.index), reverse = True)
    if importance > 0
]
selected_genes_nhbe

['WFDC5',
 'TNFAIP2',
 'TICAM1',
 'TAL2',
 'SNORD43',
 'SLC26A9',
 'RNF223',
 'OAS3',
 'MILR1',
 'MAPK3',
 'LGALS9',
 'KRTAP3-1',
 'KLF2',
 'KIAA1656',
 'IRF9',
 'GRIP2',
 'DTX2',
 'DACH2',
 'COL24A1',
 'CCL20',
 'ZSWIM4',
 'ZDHHC23',
 'WTAPP1',
 'VNN1',
 'UCN',
 'UBE2L6',
 'TRIML2',
 'TRIM14',
 'TPI1P2',
 'TPGS1',
 'TOX',
 'TNIP3',
 'TNFSF14',
 'TNFRSF10A',
 'TNF',
 'TMEM95',
 'TMEM74',
 'TMEM31',
 'TMEM255B',
 'TLCD1',
 'TGM5',
 'TAS2R4',
 'TAS1R3',
 'STAT5A',
 'SNX32',
 'SNORD33',
 'SNORD116-27',
 'SMA4',
 'SLFNL1-AS1',
 'SLC44A4',
 'SLC43A3',
 'SLC11A2',
 'SHBG',
 'SEPT5-GP1BB',
 'SENP3-EIF4A1',
 'SEC14L4',
 'SAMD9L',
 'SAA4',
 'SAA2',
 'RPGRIP1',
 'ROR2',
 'RNU6ATAC',
 'RNF122',
 'RHOF',
 'RASGRP1',
 'PYY2',
 'PON3',
 'PLEKHG7',
 'PLEKHD1',
 'PLCL1',
 'PLA2G4C',
 'PGLYRP4',
 'PDPN',
 'PCDHGA2',
 'PAX1',
 'PARP14',
 'PAPPA',
 'P2RY6',
 'OR1M1',
 'OLFM1',
 'NME8',
 'NHSL2',
 'NFYA',
 'NFKB2',
 'NES',
 'NELL2',
 'NEFM',
 'NEFH',
 'NCF1',
 'MYPN',
 'MX2',
 'MUC22',
 'MRGPRX3',
 'MMP9'

In [1]:
# Number of genes with non-zero importance in the NHBE forest. Counted from
# the variable itself so the figure cannot drift from the actual selection.
len(selected_genes_nhbe)

173

In [74]:
# Enrichment of the NHBE gene selection; the library name is both the query
# argument and the key under which the term list is returned.
gene_set_library = 'GO_Biological_Process_2021'
results_nhbe = ea.getEnrichment(selected_genes_nhbe, gene_set_library)[gene_set_library]
results_nhbe

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'cytokine-mediated signaling pathway (GO:0019221)',
  2.7286236423086907e-11,
  5.71693820385297,
  139.06245628578603,
  ['IFITM3',
   'CSF3',
   'CSF1',
   'EBI3',
   'ITGB2',
   'IFI6',
   'IFI35',
   'IL2RG',
   'CXCL3',
   'TNF',
   'CXCL2',
   'IFIT3',
   'IL21R',
   'MAPK3',
   'STAT5A',
   'CCL22',
   'TNFSF14',
   'CCL20',
   'MX2',
   'IL36G',
   'IRAK3',
   'MMP9',
   'IL23A',
   'IL1B',
   'OAS3',
   'IRF9'],
  3.8528165829398716e-08,
  0,
  0],
 [2,
  'cellular response to type I interferon (GO:0071357)',
  1.464177081192166e-06,
  14.372974657249689,
  193.0896632498229,
  ['IFITM3', 'OAS3', 'MX2', 'IFI6', 'IFI35', 'IFIT3', 'IRF9'],
  0.0006891393462144462,
  0,
  0],
 [3,
  'type I interferon signaling pathway (GO:0060337)',
  1.464177081192166e-06,
  14.372974657249689,
  193.0896632498229,
  ['IFITM3', 'OAS3', 'MX2', 'IFI6', 'IFI35', 'IFIT3', 'IRF9'],
  0.0006891393462144462,
  0,
  0],
 [4,
  'cellular response to cytokine stimulus (GO:0071345)',
  4.7302141799

In [75]:
import json

# Cache the NHBE enrichment results on disk so they can be reloaded with
# `json.load` without repeating the enrichment query. The `with` block needs a
# real statement as its body: a comment alone is a syntax error, and opening
# the file in 'w' mode without writing would leave an empty, unloadable file.
with open('results_RandomForest_NHBE.json', 'w') as file:
    json.dump(results_nhbe, file)

In [49]:
import json

# Reload the cached NHBE enrichment results from disk.
with open('results_RandomForest_NHBE.json') as file:
    results_nhbe = json.loads(file.read())

In [76]:
dataset = {'p-value': [], 'Score': [], 'Value': []}
index_nhbe = []

# Expression data and group columns are fetched again here, so this cell does
# not depend on the data-preparation cell having been run.
cols_nhbe_healthy = dt.get_columns('NHBE', 'healthy')
cols_nhbe_cov2 = dt.get_columns('NHBE', 'sars-cov2')

data_nhbe = dt.get_data('NHBE', 'healthy', 'sars-cov2')

for term in results_nhbe:
    # Term fields: 1 = name, 4 = combined score, 5 = overlapping genes,
    # 6 = adjusted p-value.
    index_nhbe.append(term[1])
    dataset['p-value'].append(term[6])
    dataset['Score'].append(term[4])

    genes = term[5]
    # Per-gene difference: mean over healthy columns minus mean over infected columns.
    healthy_mean = data_nhbe.loc[genes, cols_nhbe_healthy].values.mean(axis = 1)
    infected_mean = data_nhbe.loc[genes, cols_nhbe_cov2].values.mean(axis = 1)
    avg_sub = healthy_mean - infected_mean

    # Positive difference = lower in infected samples ("down"); negative = "up".
    # Genes with no difference are counted in neither tally.
    downs = int(np.sum(avg_sub > 0))
    ups = int(np.sum(avg_sub < 0))

    dataset['Value'].append('%d up, %d down' % (ups, downs))
enrichment_nhbe_dataset = pd.DataFrame(dataset, index = index_nhbe)

In [78]:
pd.set_option("display.max_rows", None)

# NHBE terms with adjusted p-value below 0.05, highest combined score first.
selection = (
    enrichment_nhbe_dataset
    .loc[enrichment_nhbe_dataset['p-value'] < 0.05]
    .sort_values('Score', ascending = False)
)  # optional trimming: .head(25)[['p-value', 'Score']]

# Turn the numeric columns into display strings (scientific notation / two decimals).
selection['p-value'] = selection['p-value'].map('{:.2E}'.format)
selection['Score'] = selection['Score'].map('{:.2f}'.format)

# Export currently disabled:
#selection.to_csv('NHBE_RandomForest_table.csv')
selection

Unnamed: 0,p-value,Score,Value
positive regulation of monocyte chemotactic protein-1 production (GO:0071639),0.00406,718.81,"3 up, 0 down"
chronic inflammatory response (GO:0002544),0.0224,558.06,"2 up, 0 down"
positive regulation of glial cell proliferation (GO:0060252),0.0224,558.06,"2 up, 0 down"
positive regulation of heat generation (GO:0031652),0.0224,558.06,"2 up, 0 down"
response to salt stress (GO:0009651),0.0224,558.06,"2 up, 0 down"
regulation of fever generation (GO:0031620),0.0224,558.06,"2 up, 0 down"
regulation of monocyte chemotactic protein-1 production (GO:0071637),0.0081,402.78,"3 up, 0 down"
positive regulation of fever generation (GO:0031622),0.027,395.36,"2 up, 0 down"
ISG15-protein conjugation (GO:0032020),0.027,395.36,"2 up, 0 down"
positive regulation of histone phosphorylation (GO:0033129),0.027,395.36,"2 up, 0 down"


##### A549

In [33]:
# Sample columns of the two A549 groups being compared.
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

data_a549 = dt.get_data('A549', 'healthy', 'sars-cov2')

# Binary target: 0 per healthy column followed by 1 per SARS-CoV-2 column.
labels_a549 = [0 for _ in cols_healthy_A549] + [1 for _ in cols_cov2_A549]

# NOTE(review): get_p_values presumably keeps the genes whose Mann-Whitney U
# p-value is within `limit` - confirm in dataInterpreter. Its 'p-value' column
# is removed so only expression values remain.
filtered_data_a549 = dt.get_p_values('mannwhitneyu', data_a549, cols_healthy_A549, cols_cov2_A549, limit = 0.001)
filtered_data_a549.drop(columns = ['p-value'], inplace = True)

filtered_data_a549

Unnamed: 0,Series2_A549_Mock_1,Series2_A549_Mock_2,Series2_A549_Mock_3,Series3_A549_Mock_1,Series3_A549_Mock_2,Series4_A549_Mock_1,Series4_A549_Mock_2,Series5_A549_Mock_1,Series5_A549_Mock_2,Series5_A549_Mock_3,Series8_A549_Mock_1,Series8_A549_Mock_2,Series8_A549_Mock_3,Series2_A549_SARS-CoV-2_1,Series2_A549_SARS-CoV-2_2,Series2_A549_SARS-CoV-2_3,Series5_A549_SARS-CoV-2_1,Series5_A549_SARS-CoV-2_2,Series5_A549_SARS-CoV-2_3
TNFRSF9,4.454347,3.583519,3.135494,3.637586,3.555348,1.945910,3.044522,3.806662,3.951244,3.784190,2.197225,2.484907,2.397895,4.584967,4.330733,4.174387,5.370638,5.497168,5.966147
PIK3CD,5.863631,5.370638,5.093750,5.786897,5.468060,4.248495,4.927254,5.872118,5.683580,5.659482,3.258097,4.406719,4.990433,6.011267,6.016157,5.774552,6.259581,6.505784,6.955593
RBP7,3.465736,2.890372,1.791759,1.386294,1.945910,0.000000,0.693147,3.496508,3.401197,3.555348,2.995732,1.945910,2.302585,3.970292,4.189655,3.465736,4.007333,4.174387,4.997212
DNAJC16,5.973810,5.293305,5.187386,5.680173,5.713733,4.454347,5.036953,5.820083,5.916202,5.673323,2.772589,4.663439,4.927254,6.265301,6.021023,5.774552,6.059123,6.192362,6.716595
FBLIM1,6.654153,6.150603,5.834811,6.647688,6.393591,4.762174,5.342334,6.637258,6.520621,6.285998,5.111988,5.236442,5.762051,6.900731,6.769642,6.612041,6.917706,6.813445,7.399398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MOSPD1,5.877736,5.420535,4.976734,4.828314,5.298317,4.127134,4.762174,5.634790,5.669881,5.480639,2.639057,4.564348,5.056246,6.543912,5.918894,5.579730,6.331502,6.490724,6.911747
FHL1,6.298949,5.652489,5.455321,5.886104,5.950643,5.375278,6.276643,6.196444,6.298949,6.082219,2.890372,4.691348,5.117994,6.687109,6.453625,6.324359,6.984716,7.044905,7.576097
TMEM185A,5.493061,4.836282,4.727388,4.343805,4.499810,3.496508,4.276666,5.313206,5.075174,5.023881,2.944439,4.356709,4.317488,5.590987,5.609472,5.220356,5.402677,5.497168,5.755742
L1CAM,5.389072,5.030438,4.543295,4.912655,4.976734,4.043051,4.605170,5.062595,4.976734,4.955827,2.197225,4.043051,4.465908,5.552960,5.484797,5.164786,6.161207,6.603944,7.064759


In [34]:
# Number of rows left in the filtered A549 table.
filtered_data_a549.shape[0]

614

In [28]:
# Forest settings for the A549 two-class model.
parameters = dict(
    random_state = 42,
    n_estimators = 100,
    criterion = "gini",
    max_depth = None,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = "sqrt",
)

# Evaluate the forest on the A549 samples (transposed so each row is one
# sample) against the healthy/infected labels.
results_a549 = dt.apply_loocv(
    filtered_data_a549.T.to_numpy(),
    np.asarray(labels_a549),
    RandomForestClassifier(**parameters),
)
results_a549

{'accuracy': 0.8947368421052632,
 'importances': array([0., 0., 0., ..., 0., 0., 0.])}

In [35]:
# Fit one forest on all A549 samples, then list (importance, gene) pairs from
# the most to the least important feature.
clf_a549 = RandomForestClassifier(**parameters)
clf_a549.fit(filtered_data_a549.T.to_numpy(), np.asarray(labels_a549))

sorted(zip(clf_a549.feature_importances_, filtered_data_a549.index), reverse = True)

[(0.04, 'ZNF773'),
 (0.03, 'USP16'),
 (0.03, 'TUBB2B'),
 (0.03, 'PARP9'),
 (0.03, 'NIPAL1'),
 (0.03, 'EREG'),
 (0.02, 'ZSCAN29'),
 (0.02, 'ZNF28'),
 (0.02, 'VSTM1'),
 (0.02, 'STAT1'),
 (0.02, 'LOC728024'),
 (0.02, 'HSPA12A'),
 (0.02, 'FRRS1'),
 (0.02, 'FHL1'),
 (0.01, 'ZNF774'),
 (0.01, 'ZNF550'),
 (0.01, 'ZNF229'),
 (0.01, 'ZNF217'),
 (0.01, 'TRIM25'),
 (0.01, 'TRABD2A'),
 (0.01, 'TNFAIP8'),
 (0.01, 'TMEM159'),
 (0.01, 'TMEM156'),
 (0.01, 'TDRD7'),
 (0.01, 'STAC'),
 (0.01, 'ST3GAL5'),
 (0.01, 'SRGAP2D'),
 (0.01, 'SLC37A3'),
 (0.01, 'SLC2A1'),
 (0.01, 'SDE2'),
 (0.01, 'RASGRF2'),
 (0.01, 'PXDN'),
 (0.01, 'PTPDC1'),
 (0.01, 'PPP1R15B'),
 (0.01, 'PNPT1'),
 (0.01, 'PLCG2'),
 (0.01, 'NUAK2'),
 (0.01, 'NAV3'),
 (0.01, 'MYPN'),
 (0.01, 'MYO18B'),
 (0.01, 'MOCOS'),
 (0.01, 'MMP12'),
 (0.01, 'MAML2'),
 (0.01, 'LINC00525'),
 (0.01, 'LAMC2'),
 (0.01, 'KLC1'),
 (0.01, 'KCNK6'),
 (0.01, 'JDP2'),
 (0.01, 'IL1A'),
 (0.01, 'IGFL2'),
 (0.01, 'IFNAR2'),
 (0.01, 'IFIT5'),
 (0.01, 'HCN4'),
 (0.01, 'GID4'

In [36]:
# Genes the A549 forest actually used (importance above zero), most important first.
selected_genes_a549 = [
    gene
    for importance, gene in sorted(zip(clf_a549.feature_importances_, filtered_data_a549.index), reverse = True)
    if importance > 0
]
selected_genes_a549

['ZNF773',
 'USP16',
 'TUBB2B',
 'PARP9',
 'NIPAL1',
 'EREG',
 'ZSCAN29',
 'ZNF28',
 'VSTM1',
 'STAT1',
 'LOC728024',
 'HSPA12A',
 'FRRS1',
 'FHL1',
 'ZNF774',
 'ZNF550',
 'ZNF229',
 'ZNF217',
 'TRIM25',
 'TRABD2A',
 'TNFAIP8',
 'TMEM159',
 'TMEM156',
 'TDRD7',
 'STAC',
 'ST3GAL5',
 'SRGAP2D',
 'SLC37A3',
 'SLC2A1',
 'SDE2',
 'RASGRF2',
 'PXDN',
 'PTPDC1',
 'PPP1R15B',
 'PNPT1',
 'PLCG2',
 'NUAK2',
 'NAV3',
 'MYPN',
 'MYO18B',
 'MOCOS',
 'MMP12',
 'MAML2',
 'LINC00525',
 'LAMC2',
 'KLC1',
 'KCNK6',
 'JDP2',
 'IL1A',
 'IGFL2',
 'IFNAR2',
 'IFIT5',
 'HCN4',
 'GID4',
 'GBP1',
 'FMNL2',
 'FAM131B',
 'EPAS1',
 'DRAM1',
 'DDX60',
 'DDX58',
 'DAW1',
 'CYP1B1',
 'CTNNAL1',
 'CSF2RA',
 'CRKL',
 'CLMP',
 'CHST11',
 'CDKL1',
 'CDCP1',
 'CCBE1',
 'BTC',
 'BCL2L1',
 'B4GALNT2',
 'AXL',
 'ATP2B4',
 'AOX1',
 'ACOT9',
 'CD274',
 'NOMO3']

In [70]:
# Enrichment of the A549 gene selection; the library name is both the query
# argument and the key under which the term list is returned.
gene_set_library = 'GO_Biological_Process_2021'
enrichment_a549 = ea.getEnrichment(selected_genes_a549, gene_set_library)[gene_set_library]
enrichment_a549

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'cytokine-mediated signaling pathway (GO:0019221)',
  2.7286236423086907e-11,
  5.71693820385297,
  139.06245628578603,
  ['IFITM3',
   'CSF3',
   'CSF1',
   'EBI3',
   'ITGB2',
   'IFI6',
   'IFI35',
   'IL2RG',
   'CXCL3',
   'TNF',
   'CXCL2',
   'IFIT3',
   'IL21R',
   'MAPK3',
   'STAT5A',
   'CCL22',
   'TNFSF14',
   'CCL20',
   'MX2',
   'IL36G',
   'IRAK3',
   'MMP9',
   'IL23A',
   'IL1B',
   'OAS3',
   'IRF9'],
  3.8528165829398716e-08,
  0,
  0],
 [2,
  'cellular response to type I interferon (GO:0071357)',
  1.464177081192166e-06,
  14.372974657249689,
  193.0896632498229,
  ['IFITM3', 'OAS3', 'MX2', 'IFI6', 'IFI35', 'IFIT3', 'IRF9'],
  0.0006891393462144462,
  0,
  0],
 [3,
  'type I interferon signaling pathway (GO:0060337)',
  1.464177081192166e-06,
  14.372974657249689,
  193.0896632498229,
  ['IFITM3', 'OAS3', 'MX2', 'IFI6', 'IFI35', 'IFIT3', 'IRF9'],
  0.0006891393462144462,
  0,
  0],
 [4,
  'cellular response to cytokine stimulus (GO:0071345)',
  4.7302141799

In [38]:
import json

# Cache the A549 enrichment results on disk; `json.load` reads them back.
with open('results_RandomForest_A549.json', 'w') as file:
    json.dump(enrichment_a549, file)

In [7]:
import json

# Restore the A549 enrichment results previously written to disk as JSON.
with open('results_RandomForest_A549.json') as file:
    enrichment_a549 = json.loads(file.read())

In [42]:
# Build a per-term summary table for the A549 enrichment results.
# Each term is a list laid out as: rank, term name, p-value, z-score,
# combined score, overlapping genes, adjusted p-value, ...
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')
data_a549 = dt.get_data('A549', 'healthy', 'sars-cov2')

dataset = {'p-value': [], 'Score': []}  # the 'Value' column is disabled for A549
index_a549 = []

for term in enrichment_a549:
    index_a549.append(term[1])           # term name becomes the row label
    dataset['p-value'].append(term[6])   # adjusted p-value
    dataset['Score'].append(term[4])     # combined score

    # Per-gene difference between the healthy mean and the infected mean.
    genes = term[5]
    healthy_mean = np.mean(data_a549.loc[genes, cols_healthy_A549].values, axis = 1)
    cov2_mean = np.mean(data_a549.loc[genes, cols_cov2_A549].values, axis = 1)
    avg_sub = healthy_mean - cov2_mean

    # Positive difference: higher in healthy samples, counted as "down";
    # negative difference counted as "up". Zero differences are ignored.
    downs = sum(1 for diff in avg_sub if diff > 0)
    ups = sum(1 for diff in avg_sub if diff < 0)

    # The counts are not stored for A549 (kept for parity with the other cell lines):
    #dataset['Value'] += ['%d up, %d down' % (ups, downs)]

enrichment_a549_dataset = pd.DataFrame(dataset, index = index_a549)

In [44]:
# Do not truncate the table when it is displayed.
pd.set_option("display.max_rows", None)

# Keep the terms whose 'p-value' column is below 0.05, highest score first.
selection = (
    enrichment_a549_dataset
    .loc[lambda frame: frame['p-value'] < 0.05]
    .sort_values('Score', ascending = False)
)

# Render both numeric columns as fixed-format strings for the exported table.
selection['p-value'] = selection['p-value'].map('{:.2E}'.format)
selection['Score'] = selection['Score'].map('{:.2f}'.format)

selection.to_csv('A549_RandomForest_table.csv')
selection

Unnamed: 0,p-value,Score
RIG-I signaling pathway (GO:0039529),0.0201,819.47
positive regulation of dendritic cell cytokine production (GO:0002732),0.0208,658.6
cytoplasmic pattern recognition receptor signaling pathway in response to virus (GO:0039528),0.0315,463.95
positive regulation of epidermal growth factor-activated receptor activity (GO:0045741),0.0365,401.14
positive regulation of vascular endothelial growth factor production (GO:0010575),0.0152,314.5
regulation of vascular endothelial growth factor production (GO:0010574),0.0152,280.69
positive regulation of nuclear division (GO:0051785),0.0152,280.69
positive regulation of defense response to virus by host (GO:0002230),0.0152,266.02
response to interferon-beta (GO:0035456),0.0152,266.02
regulation of interleukin-2 production (GO:0032663),0.0152,240.53


##### Calu3

In [61]:
# Column selectors for the two Calu3 conditions.
cols_healthy_Calu3 = dt.get_columns('Calu3', 'healthy')
cols_cov2_Calu3 = dt.get_columns('Calu3', 'sars-cov2')

# Class labels: one 0 per healthy column followed by one 1 per SARS-CoV-2 column.
labels_calu3 = [0 for _ in cols_healthy_Calu3] + [1 for _ in cols_cov2_Calu3]

data_calu3 = dt.get_data('Calu3', 'healthy', 'sars-cov2')

# Mann-Whitney U based filtering with a 0.05 limit.
# NOTE(review): presumably keeps only rows whose p-value is below the limit;
# confirm against dataInterpreter.get_p_values.
filtered_data_calu3 = dt.get_p_values(
    'mannwhitneyu',
    data_calu3,
    cols_healthy_Calu3,
    cols_cov2_Calu3,
    limit = 0.05,
)
# The helper's 'p-value' column is not a feature, so remove it.
filtered_data_calu3.drop(columns = ['p-value'], inplace = True)

# Number of rows left after filtering.
filtered_data_calu3.shape[0]

1014

In [81]:
# Random forest hyper-parameters for the Calu3 classifier.
parameters = dict(
    random_state = 42,
    n_estimators = 100,
    criterion = "entropy",
    max_depth = None,
    min_samples_split = 3,
    min_samples_leaf = 2,
    max_features = "log2",
)

# Evaluate a fresh classifier with the project's LOOCV helper; the filtered
# table is transposed before being handed over as the feature matrix.
stats_calu3 = dt.apply_loocv(
    filtered_data_calu3.T.values,
    np.array(labels_calu3),
    RandomForestClassifier(**parameters),
)
stats_calu3

{'accuracy': 1.0,
 'importances': array([0.02380952, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ])}

In [82]:
# Fit one forest on the full (transposed) Calu3 table to obtain importances.
clf_calu3 = RandomForestClassifier(**parameters)
clf_calu3.fit(filtered_data_calu3.T.values, np.array(labels_calu3))

# (importance, row label) pairs, largest importance first; the row labels of the
# filtered table are the column labels of its transpose.
sorted(zip(clf_calu3.feature_importances_, filtered_data_calu3.index), reverse = True)

[(0.02531645569620253, 'RAET1L'),
 (0.02531645569620253, 'LUZP2'),
 (0.012658227848101266, 'WDR88'),
 (0.012658227848101266, 'VWF'),
 (0.012658227848101266, 'UBL4B'),
 (0.012658227848101266, 'TTPA'),
 (0.012658227848101266, 'TRAF1'),
 (0.012658227848101266, 'TNFSF13B'),
 (0.012658227848101266, 'TLR6'),
 (0.012658227848101266, 'THEM6'),
 (0.012658227848101266, 'TAP1'),
 (0.012658227848101266, 'STEAP3'),
 (0.012658227848101266, 'STAT5A'),
 (0.012658227848101266, 'SPATA17'),
 (0.012658227848101266, 'SPAG8'),
 (0.012658227848101266, 'SOSTDC1'),
 (0.012658227848101266, 'SNRNP25'),
 (0.012658227848101266, 'SLIRP'),
 (0.012658227848101266, 'SLC9A3R1'),
 (0.012658227848101266, 'SLC5A9'),
 (0.012658227848101266, 'SLC25A18'),
 (0.012658227848101266, 'SLC17A4'),
 (0.012658227848101266, 'SERPINB7'),
 (0.012658227848101266, 'SCN3A'),
 (0.012658227848101266, 'SAP30'),
 (0.012658227848101266, 'SAMSN1'),
 (0.012658227848101266, 'SAMD9L'),
 (0.012658227848101266, 'SAC3D1'),
 (0.012658227848101266, 'RUF

In [83]:
# Labels of the features with non-zero importance, ordered by decreasing importance.
selected_genes_calu3 = [
    gene
    for importance, gene in sorted(
        zip(clf_calu3.feature_importances_, filtered_data_calu3.T.columns),
        reverse = True,
    )
    if importance > 0
]
selected_genes_calu3

['RAET1L',
 'LUZP2',
 'WDR88',
 'VWF',
 'UBL4B',
 'TTPA',
 'TRAF1',
 'TNFSF13B',
 'TLR6',
 'THEM6',
 'TAP1',
 'STEAP3',
 'STAT5A',
 'SPATA17',
 'SPAG8',
 'SOSTDC1',
 'SNRNP25',
 'SLIRP',
 'SLC9A3R1',
 'SLC5A9',
 'SLC25A18',
 'SLC17A4',
 'SERPINB7',
 'SCN3A',
 'SAP30',
 'SAMSN1',
 'SAMD9L',
 'SAC3D1',
 'RUFY4',
 'RPGRIP1L',
 'RLN2',
 'RHCG',
 'RAD21L1',
 'RAB42',
 'PDE3B',
 'OR1F2P',
 'OIP5',
 'NT5DC2',
 'NLRP3',
 'NGFR',
 'NCOA7',
 'MPND',
 'MORN2',
 'MMP13',
 'MIPEP',
 'MCM6',
 'LRP4-AS1',
 'KCNT2',
 'KCNH3',
 'ISG15',
 'INHBE',
 'IFNL3',
 'IFITM1',
 'HSD17B14',
 'HRASLS2',
 'HDX',
 'GBP4',
 'G0S2',
 'FAM173A',
 'FABP5P3',
 'ENG',
 'ELN',
 'EGR2',
 'EFEMP2',
 'EBP',
 'DUOX2',
 'DUOX1',
 'DHCR24',
 'CRYZ',
 'CLRN3',
 'CHST13',
 'CELF6',
 'CCL5',
 'CCDC150',
 'BCKDHB',
 'ASB18',
 'ANKS4B']

In [84]:
# Run the enrichment helper on the selected Calu3 genes against a single
# library; the result is indexed by library name, so pull that entry out.
go_library = 'GO_Biological_Process_2021'
results_calu3 = ea.getEnrichment(selected_genes_calu3, go_library)[go_library]
results_calu3

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'negative regulation of viral genome replication (GO:0045071)',
  5.549248819159771e-05,
  21.7786301369863,
  213.41452218943027,
  ['IFITM1', 'CCL5', 'ISG15', 'IFNL3'],
  0.038938394732418516,
  0,
  0],
 [2,
  'regulation of viral genome replication (GO:0045069)',
  0.00012950010356735047,
  17.27332028701892,
  154.62780734833802,
  ['IFITM1', 'CCL5', 'ISG15', 'IFNL3'],
  0.038938394732418516,
  0,
  0],
 [3,
  'negative regulation of viral process (GO:0048525)',
  0.00015356744758848547,
  16.485678704856788,
  144.76685577490605,
  ['IFITM1', 'CCL5', 'ISG15', 'IFNL3'],
  0.038938394732418516,
  0,
  0],
 [4,
  'hydrogen peroxide biosynthetic process (GO:0050665)',
  0.00021726782324248188,
  132.79333333333332,
  1120.0294025235596,
  ['DUOX1', 'DUOX2'],
  0.038938394732418516,
  0,
  0],
 [5,
  'positive regulation of vascular associated smooth muscle cell differentiation (GO:1905065)',
  0.0003034160628500144,
  106.22933333333333,
  860.5006812675136,
  ['EFEMP2', 'ENG'

In [135]:
import json

# Persist the Calu3 enrichment results as JSON so they can be reloaded later.
# The write used to be commented out, which left the `with` block without a
# body: the cell raised an IndentationError and never produced the file that
# the loading cell expects.
with open('results_RandomForest_Calu3.json', 'w') as file:
    json.dump(results_calu3, file)  # read back with `json.load`

In [11]:
import json

# Restore the Calu3 enrichment results previously written to disk as JSON.
with open('results_RandomForest_Calu3.json') as file:
    results_calu3 = json.loads(file.read())

In [85]:
# Build a per-term summary table for the Calu3 enrichment results.
# Each term is a list laid out as: rank, term name, p-value, z-score,
# combined score, overlapping genes, adjusted p-value, ...
cols_healthy_Calu3 = dt.get_columns('Calu3', 'healthy')
cols_cov2_Calu3 = dt.get_columns('Calu3', 'sars-cov2')
data_calu3 = dt.get_data('Calu3', 'healthy', 'sars-cov2')

dataset = {'p-value': [], 'Score': [], 'Value': []}
index_calu3 = []

for term in results_calu3:
    index_calu3.append(term[1])          # term name becomes the row label
    dataset['p-value'].append(term[6])   # adjusted p-value
    dataset['Score'].append(term[4])     # combined score

    # Per-gene difference between the healthy mean and the infected mean.
    genes = term[5]
    healthy_mean = np.mean(data_calu3.loc[genes, cols_healthy_Calu3].values, axis = 1)
    cov2_mean = np.mean(data_calu3.loc[genes, cols_cov2_Calu3].values, axis = 1)
    avg_sub = healthy_mean - cov2_mean

    # Positive difference: higher in healthy samples, counted as "down";
    # negative difference counted as "up". Zero differences are ignored.
    downs = sum(1 for diff in avg_sub if diff > 0)
    ups = sum(1 for diff in avg_sub if diff < 0)

    dataset['Value'].append('%d up, %d down' % (ups, downs))

enrichment_calu3_dataset = pd.DataFrame(dataset, index = index_calu3)

In [87]:
# Do not truncate the table when it is displayed.
pd.set_option("display.max_rows", None)

# Keep the terms whose 'p-value' column is below 0.05, take the 25 highest
# scores, and retain only the two numeric columns.
selection = (
    enrichment_calu3_dataset
    .loc[lambda frame: frame['p-value'] < 0.05]
    .sort_values('Score', ascending = False)
    .head(25)
    [['p-value', 'Score']]
)

# Render both numeric columns as fixed-format strings for the exported table.
selection['p-value'] = selection['p-value'].map('{:.2E}'.format)
selection['Score'] = selection['Score'].map('{:.2f}'.format)

selection.to_csv('Calu3_RandomForest_table.csv')
selection

Unnamed: 0,p-value,Score
hydrogen peroxide biosynthetic process (GO:0050665),0.0389,1120.03
positive regulation of vascular associated smooth muscle cell differentiation (GO:1905065),0.0389,860.5
vascular associated smooth muscle cell development (GO:0097084),0.0389,860.5
vascular associated smooth muscle cell differentiation (GO:0035886),0.0444,691.8
smooth muscle tissue development (GO:0048745),0.0454,487.64
negative regulation of viral genome replication (GO:0045071),0.0389,213.41
regulation of viral genome replication (GO:0045069),0.0389,154.63
negative regulation of viral process (GO:0048525),0.0389,144.77
response to amyloid-beta (GO:1904645),0.0454,144.31
cation transport (GO:0006812),0.0454,60.77
