In [3]:
import sys
sys.path.append("../")

import dataInterpreter as dt
import enrichmentAnalysis as ea
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.ensemble import RandomForestClassifier

path = "C:\\Users\\Pedro\\Documents\\BicPAMS\\bicpams_5.1\\data\\latecovid\\"

In [2]:
filtered_data_01 = pd.read_csv(path + 'data-p01.csv', index_col = 0, sep = '\t').T
filtered_data_05 = pd.read_csv(path + 'data-p05.csv', index_col = 0, sep = '\t').T

In [6]:
def get_info_from_name(col_name):
    result = {}
    info = col_name.split('_')
    result['Series'] = info[0]
    
    if info[1] == 'COVID19Lung':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'SARS-CoV-2'
        return result
    elif info[1] == 'HealthyLungBiopsy':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'Healthy'
        return result
        
    result['Cell Type'] = info[1]
    
    if info[2] == 'Mock':
        result['Condition'] = 'Healthy'
    else:
        result['Condition'] = info[2].replace('.', '-')
    
    return result

get_info_from_name('Series1_NHBE_Mock_1')

{'Series': 'Series1', 'Cell Type': 'NHBE', 'Condition': 'Healthy'}

In [7]:
classes = {'NHBE': {}, 'A549': {}, 'Calu3': {}, 'Biopsy': {}}
i = 0
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    if info['Condition'] not in classes[info['Cell Type']]:
        classes[info['Cell Type']][info['Condition']] = i
        i += 1
        
y = []
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    y.append(classes[info['Cell Type']][info['Condition']])
    
y_names = [' '] * 13
for c_type in classes:
    for cond in classes[c_type]:
        y_names[classes[c_type][cond]] = c_type + ' ' + cond


In [8]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier().fit(filtered_data_01, y)

In [9]:
list(sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True))

[(0.0122433467178862, 'ADD2'),
 (0.010797045470757693, 'IFI44'),
 (0.010493632977433411, 'SAMD9L'),
 (0.008098830875092982, 'MX1'),
 (0.007583522859060924, 'HLA-F'),
 (0.007319031993395626, 'DGKD'),
 (0.006848147436078276, 'HCN4'),
 (0.006828596820908477, 'PCDH1'),
 (0.006688009722708388, 'ARHGEF4'),
 (0.006639923416476135, 'AK5'),
 (0.006589237121221835, 'CXCL2'),
 (0.006449882812926755, 'CXCL5'),
 (0.006444395198529445, 'IFI16'),
 (0.006340056538855956, 'IFIT2'),
 (0.0063240363645374, 'ESPNL'),
 (0.006321870959342382, 'OAS2'),
 (0.006238803646074347, 'OASL'),
 (0.006144568868744148, 'GBP1'),
 (0.005971099449229936, 'IL6'),
 (0.00584488398700784, 'PNMA2'),
 (0.005811934685618139, 'SERPINB7'),
 (0.0057574386016660825, 'AXIN2'),
 (0.005731354664748839, 'CA8'),
 (0.0057244712956326105, 'CCBE1'),
 (0.00568260604467128, 'ELL2'),
 (0.005635056936095317, 'PLSCR1'),
 (0.005299835787787837, 'TYMP'),
 (0.005125747802530001, 'IL7'),
 (0.00509443518734378, 'IL1A'),
 (0.005057121613252174, 'CD274'

### Two class comparison

##### NHBE

In [6]:
cols_healthy = dt.get_columns('NHBE', 'healthy')
cols_cov2 = dt.get_columns('NHBE', 'sars-cov2')

labels = [0] * len(cols_healthy) + [1] * len(cols_cov2)

data = dt.get_data('NHBE', 'healthy', 'sars-cov2')

filtered_data_NHBE = dt.get_p_values('mannwhitneyu', data, cols_healthy, cols_cov2)
filtered_data_NHBE

Unnamed: 0,Series1_NHBE_Mock_1,Series1_NHBE_Mock_2,Series1_NHBE_Mock_3,Series9_NHBE_Mock_1,Series9_NHBE_Mock_2,Series9_NHBE_Mock_3,Series9_NHBE_Mock_4,Series1_NHBE_SARS-CoV-2_1,Series1_NHBE_SARS-CoV-2_2,Series1_NHBE_SARS-CoV-2_3,p-value
SAMD11,2.484907,3.044522,2.197225,0.000000,0.693147,0.693147,1.791759,3.295837,2.397895,4.158883,0.033706
RNF223,2.079442,1.609438,2.302585,2.484907,2.995732,3.091042,3.737670,1.945910,1.609438,1.609438,0.032455
TNFRSF4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,0.693147,0.016684
TAS1R3,2.197225,0.693147,1.609438,2.564949,2.079442,2.397895,3.091042,0.000000,0.000000,0.693147,0.014685
CALML6,1.609438,1.945910,2.079442,0.000000,0.693147,0.693147,1.098612,2.397895,1.609438,2.564949,0.042729
...,...,...,...,...,...,...,...,...,...,...,...
PNMA3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.693147,1.791759,0.002422
ZNF275,4.553877,4.477337,4.976734,3.951244,3.850148,3.367296,4.465908,4.584967,4.553877,5.283204,0.043208
PDZD4,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.693147,1.386294,0.693147,1.098612,0.013243
FAM50A,7.008505,6.966967,7.418781,6.056784,6.003887,6.415097,6.830874,7.143618,7.044905,8.110728,0.034124


In [10]:
filtered_data.drop(['p-value'], axis = 1, inplace = True)
nhbe_clf = RandomForestClassifier().fit(filtered_data_NHBE.T, labels)

nhbe_features = [x[1] for x in list(sorted(zip(nhbe_clf.feature_importances_, filtered_data_NHBE.columns), reverse = True))]
results_nhbe = ea.getEnrichment(nhbe_features[:50], 'GO_Biological_Process_2021')['GO_Biological_Process_2021']

ValueError: Found input variables with inconsistent numbers of samples: [11, 10]

In [11]:
import json

with open('results_RandomForest_NHBE.json', 'w') as file:
     file.write(json.dumps(results_nhbe)) # use `json.loads` to do the reverse

In [5]:
import json

with open('results_RandomForest_NHBE.json') as file:
    results_nhbe = json.load(file)

In [6]:
dataset = {'p-value': [], 'Score': []}
index_nhbe = []

for term in results_nhbe:
    index_nhbe += [term[1]]
    dataset['p-value'] += [term[6]]
    dataset['Score'] += [term[4]]
enrichment_nhbe_dataset = pd.DataFrame(dataset, index = index_nhbe)

In [12]:
pd.set_option("display.max_rows", None)
selection = enrichment_nhbe_dataset[enrichment_nhbe_dataset['p-value'] < 0.01].sort_values('Score', ascending = False).head(25)

selection['p-value'] = selection['p-value'].map(lambda x: '%.2E' % x)
selection['Score'] = selection['Score'].map(lambda x: '%.2f' % x)

selection.to_csv('NHBE_RandomForest_table.csv')
selection

Unnamed: 0,p-value,Score
positive regulation of glial cell proliferation (GO:0060252),4.63e-05,10018.35
regulation of fever generation (GO:0031620),4.63e-05,10018.35
positive regulation of gliogenesis (GO:0014015),7.95e-05,4611.05
regulation of glial cell proliferation (GO:0060251),0.000129,2888.72
positive regulation of heat generation (GO:0031652),0.00181,2688.79
chronic inflammatory response (GO:0002544),0.00181,2688.79
positive regulation of fever generation (GO:0031622),0.00241,1932.58
positive regulation of neuroinflammatory response (GO:0150078),0.000275,1790.11
regulation of calcidiol 1-monooxygenase activity (GO:0060558),0.00303,1490.32
positive regulation of acute inflammatory response (GO:0002675),0.000478,1265.87


##### A549

In [21]:
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

labels = [0] * len(cols_healthy_A549) + [1] * len(cols_cov2_A549)

a549_clf = RandomForestClassifier().fit(filtered_data_05.T[cols_healthy_A549 + cols_cov2_A549].T, labels)

a549_features = [x[1] for x in list(sorted(zip(a549_clf.feature_importances_, filtered_data_01.columns), reverse = True))]
results_a549 = ea.getEnrichment(a549_features[:50], 'GO_Biological_Process_2021')['GO_Biological_Process_2021']

results_a549

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'regulation of transcription by RNA polymerase II (GO:0006357)',
  1.839878027412889e-21,
  17.393301104972377,
  830.4359895275164,
  ['ZNF28',
   'ZNF550',
   'ZNF274',
   'SP100',
   'ZBTB25',
   'ZBTB46',
   'ZSCAN9',
   'WWC2',
   'ZNF606',
   'ZNF549',
   'ZNF669',
   'ZNF229',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF566',
   'ZNF322',
   'ZNF583',
   'ZNF160',
   'ZNF77',
   'ZNF34',
   'YEATS2',
   'ZNF419',
   'ZNF239',
   'ZNF853',
   'ZNF555',
   'ZNF235',
   'ZNF774',
   'ZNF136',
   'IRF9',
   'ZNF850',
   'ZNF773',
   'ZSCAN29',
   'ZFP28'],
  2.1408799394076017e-19,
  0,
  0],
 [2,
  'regulation of transcription, DNA-templated (GO:0006355)',
  3.1953431931456743e-21,
  17.057692307692307,
  804.9967213017426,
  ['ZNF28',
   'ZNF550',
   'ZNF274',
   'SP100',
   'ZBTB25',
   'ZBTB46',
   'ZSCAN9',
   'WWC2',
   'ZNF606',
   'ZNF549',
   'ZNF669',
   'ZNF229',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF566',
   'ZNF322',
   'ZNF583',
   'ZNF160',
   'ZNF77'

In [22]:
import json

with open('results_RandomForest_A549.json', 'w') as file:
     file.write(json.dumps(results_a549)) # use `json.loads` to do the reverse

In [23]:
import json

with open('results_RandomForest_A549.json') as file:
    results_a549 = json.load(file)

In [24]:
dataset = {'p-value': [], 'Score': []}
index_a549 = []

for term in results_a549:
    index_a549 += [term[1]]
    dataset['p-value'] += [term[6]]
    dataset['Score'] += [term[4]]
enrichment_a549_dataset = pd.DataFrame(dataset, index = index_a549)

In [25]:
pd.set_option("display.max_rows", None)
selection = enrichment_a549_dataset[enrichment_a549_dataset['p-value'] < 0.01].sort_values('Score', ascending = False).head(25)

selection['p-value'] = selection['p-value'].map(lambda x: '%.2E' % x)
selection['Score'] = selection['Score'].map(lambda x: '%.2f' % x)

selection.to_csv('A549_RandomForest_table.csv')
selection

Unnamed: 0,p-value,Score
regulation of transcription by RNA polymerase II (GO:0006357),2.14e-19,830.44
"regulation of transcription, DNA-templated (GO:0006355)",2.14e-19,805.0
negative regulation of transcription by RNA polymerase II (GO:0000122),3.42e-05,113.78
"negative regulation of transcription, DNA-templated (GO:0045892)",0.000585,62.7


##### Calu3

In [None]:
cols_healthy_A549_ACE = dt.get_columns('A549-ACE2', 'healthy')
cols_cov2_A549_ACE = dt.get_columns('A549-ACE2', 'sars-cov2')

labels = [0] * len(cols_healthy_A549_ACE) + [1] * len(cols_cov2_A549_ACE)

a549_ace_clf = RandomForestClassifier().fit(filtered_data_05.T[cols_healthy_A549_ACE + cols_cov2_A549_ACE].T, labels)

a549_ace_features = [x[1] for x in list(sorted(zip(a549_ace_clf.feature_importances_, filtered_data_01.columns), reverse = True))]
ea.getEnrichment(a549_ace_features[:50], 'GO_Biological_Process_2021')['GO_Biological_Process_2021']

In [14]:
import json

with open('results_RandomForest_A549.json', 'w') as file:
     file.write(json.dumps(results_nhbe)) # use `json.loads` to do the reverse

In [5]:
import json

with open('results_RandomForest_A549.json') as file:
    results_nhbe = json.load(file)