In [1]:
import sys
sys.path.append("../")

import dataInterpreter as dt
import enrichmentAnalysis as ea
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'

path = "C:\\Users\\Pedro\\Desktop\\bicpams_5.1\\data\\latecovid\\"

In [2]:
filtered_data_01 = pd.read_csv(path + 'data-p01.csv', index_col = 0, sep = '\t').T
filtered_data_05 = pd.read_csv(path + 'data-p05.csv', index_col = 0, sep = '\t').T

In [3]:
def get_info_from_name(col_name):
    result = {}
    info = col_name.split('_')
    result['Series'] = info[0]
    
    if info[1] == 'COVID19Lung':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'SARS-CoV-2'
        return result
    elif info[1] == 'HealthyLungBiopsy':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'Healthy'
        return result
        
    result['Cell Type'] = info[1]
    
    if info[2] == 'Mock':
        result['Condition'] = 'Healthy'
    else:
        result['Condition'] = info[2].replace('.', '-')
    
    return result

get_info_from_name('Series1_NHBE_Mock_1')

{'Series': 'Series1', 'Cell Type': 'NHBE', 'Condition': 'Healthy'}

In [4]:
classes = {'NHBE': {}, 'A549': {}, 'Calu3': {}, 'Biopsy': {}}
i = 0
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    if info['Condition'] not in classes[info['Cell Type']]:
        classes[info['Cell Type']][info['Condition']] = i
        i += 1
        
y = []
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    y.append(classes[info['Cell Type']][info['Condition']])
    
y_names = [' '] * 13
for c_type in classes:
    for cond in classes[c_type]:
        y_names[classes[c_type][cond]] = c_type + ' ' + cond


In [10]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier().fit(filtered_data_01, y)

In [19]:
list(sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True))

[(0.011969475969213383, 'SAMD9L'),
 (0.011814986009909378, 'IL1A'),
 (0.0112953930675092, 'OAS2'),
 (0.009498833973875518, 'IFIT3'),
 (0.008633014557674828, 'CDCP1'),
 (0.008623878790623542, 'IL6'),
 (0.00858706252930824, 'HERC5'),
 (0.008349835852277783, 'OASL'),
 (0.008019867744418083, 'GBP1'),
 (0.007219135527734965, 'TNF'),
 (0.007163137917836516, 'IFI16'),
 (0.006883772606347995, 'PPM1K'),
 (0.006639815758223993, 'EMP1'),
 (0.006607410879667086, 'CD274'),
 (0.006284171582036613, 'GFPT2'),
 (0.006281661702326804, 'NIPAL1'),
 (0.006104092261591542, 'NUCB2'),
 (0.006080480369271217, 'IFI44'),
 (0.005888852560586325, 'TTC25'),
 (0.0058077266036663374, 'CXCL5'),
 (0.005770415895439724, 'FAM161A'),
 (0.005588008538286, 'PARP8'),
 (0.005438792587068795, 'IL1B'),
 (0.0052334880228396465, 'SPTBN4'),
 (0.0052161435685395205, 'THEMIS2'),
 (0.005205272192151377, 'C1S'),
 (0.005089973018898302, 'MB'),
 (0.005046677564408187, 'HBEGF'),
 (0.004955106166470544, 'CLGN'),
 (0.004948927634956854, 'H

### Two class comparison

##### NHBE

In [34]:
cols_healthy = dt.get_columns('NHBE', 'healthy')
cols_cov2 = dt.get_columns('NHBE', 'sars-cov2')

labels = [0] * len(cols_healthy) + [1] * len(cols_cov2)

In [35]:
nhbe_clf = RandomForestClassifier().fit(filtered_data_01.T[cols_healthy + cols_cov2].T, labels)

In [36]:
nhbe_features = [x[1] for x in list(sorted(zip(nhbe_clf.feature_importances_, filtered_data_01.columns), reverse = True))]

In [39]:
ea.getEnrichment(nhbe_features[:50], 'GO_Biological_Process_2018')['GO_Biological_Process_2018']

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'cytokine-mediated signaling pathway (GO:0019221)',
  6.620764864828674e-19,
  22.88133874239351,
  957.7878044857332,
  ['IL32',
   'STAT5A',
   'STAT1',
   'EBI3',
   'PTAFR',
   'LIF',
   'SOD2',
   'CXCL2',
   'TNF',
   'IFIT3',
   'ICAM1',
   'NFKBIA',
   'OAS2',
   'IL23A',
   'IRAK2',
   'IL1B',
   'SAA1',
   'TRIM25',
   'XAF1',
   'IL7R',
   'IRF9'],
  4.978815178351162e-16,
  0,
  0],
 [2,
  'cellular response to cytokine stimulus (GO:0071345)',
  9.462417978313953e-10,
  13.873399715504979,
  288.26875478952667,
  ['IL32',
   'STAT5A',
   'IL23A',
   'IRAK2',
   'STAT1',
   'IL1B',
   'PTAFR',
   'LIF',
   'SAA1',
   'CXCL2',
   'TNF',
   'ICAM1'],
  3.557869159846046e-07,
  0,
  0],
 [3,
  'regulation of NF-kappaB import into nucleus (GO:0042345)',
  5.026078302033674e-09,
  105.44444444444444,
  2014.898433534034,
  ['NFKBIA', 'IL23A', 'IL1B', 'TNF', 'TLR2'],
  1.259870294376441e-06,
  0,
  0],
 [4,
  'interferon-gamma-mediated signaling pathway (GO:0060333)',
  2.0

##### A549

In [43]:
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

labels = [0] * len(cols_healthy_A549) + [1] * len(cols_cov2_A549)

a549_clf = RandomForestClassifier().fit(filtered_data_05.T[cols_healthy_A549 + cols_cov2_A549].T, labels)

a549_features = [x[1] for x in list(sorted(zip(a549_clf.feature_importances_, filtered_data_01.columns), reverse = True))]
ea.getEnrichment(a549_features[:50], 'GO_Biological_Process_2018')['GO_Biological_Process_2018']

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'regulation of nucleic acid-templated transcription (GO:1903506)',
  2.720042607545891e-15,
  18.489919354838708,
  620.1172977521386,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  3.4298495595077307e-13,
  0,
  0],
 [2,
  'regulation of cellular macromolecule biosynthetic process (GO:2000112)',
  5.3175962162910554e-15,
  17.74398450244698,
  583.2049359306333,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  3.4298495595077307e-13,
  0,
  0],
 [3,
  'regulation of gene expression (GO:0010468)',
  2.1264607740085488e-12,
  11.398314215096013,
  306.347500141178,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',