In [2]:
import sys
sys.path.append("../")

import dataInterpreter as dt
import enrichmentAnalysis as ea
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'

path = "C:\\Users\\Pedro\\Desktop\\bicpams_5.1\\data\\latecovid\\"

In [8]:
filtered_data_01 = pd.read_csv(path + 'data-p01.csv', index_col = 0, sep = '\t').T
filtered_data_05 = pd.read_csv(path + 'data-p05.csv', index_col = 0, sep = '\t').T

In [14]:
filtered_data_01

Index(['Series1_NHBE_Mock_1', 'Series1_NHBE_Mock_2', 'Series1_NHBE_Mock_3',
       'Series9_NHBE_Mock_1', 'Series9_NHBE_Mock_2', 'Series9_NHBE_Mock_3',
       'Series9_NHBE_Mock_4', 'Series1_NHBE_SARS-CoV-2_1',
       'Series1_NHBE_SARS-CoV-2_2', 'Series1_NHBE_SARS-CoV-2_3',
       'Series9_NHBE_IAV_1', 'Series9_NHBE_IAV_2', 'Series9_NHBE_IAV_3',
       'Series9_NHBE_IAV_4', 'Series9_NHBE_IAVdNS1_1',
       'Series9_NHBE_IAVdNS1_2', 'Series9_NHBE_IAVdNS1_3',
       'Series9_NHBE_IAVdNS1_4', 'Series2_A549_Mock_1', 'Series2_A549_Mock_2',
       'Series2_A549_Mock_3', 'Series3_A549_Mock_1', 'Series3_A549_Mock_2',
       'Series4_A549_Mock_1', 'Series4_A549_Mock_2', 'Series5_A549_Mock_1',
       'Series5_A549_Mock_2', 'Series5_A549_Mock_3', 'Series8_A549_Mock_1',
       'Series8_A549_Mock_2', 'Series8_A549_Mock_3',
       'Series2_A549_SARS-CoV-2_1', 'Series2_A549_SARS-CoV-2_2',
       'Series2_A549_SARS-CoV-2_3', 'Series5_A549_SARS-CoV-2_1',
       'Series5_A549_SARS-CoV-2_2', 'Series5_A5

In [16]:
def get_info_from_name(col_name):
    result = {}
    info = col_name.split('_')
    result['Series'] = info[0]
    
    if info[1] == 'COVID19Lung':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'SARS-CoV-2'
        return result
    elif info[1] == 'HealthyLungBiopsy':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'Healthy'
        return result
        
    result['Cell Type'] = info[1]
    
    if info[2] == 'Mock':
        result['Condition'] = 'Healthy'
    else:
        result['Condition'] = info[2].replace('.', '-')
    
    return result

get_info_from_name('Series1_NHBE_Mock_1')

{'Series': 'Series1', 'Cell Type': 'NHBE', 'Condition': 'Healthy'}

In [17]:
classes = {'NHBE': {}, 'A549': {}, 'Calu3': {}, 'Biopsy': {}}
i = 0
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    if info['Condition'] not in classes[info['Cell Type']]:
        classes[info['Cell Type']][info['Condition']] = i
        i += 1
        
classes

{'NHBE': {'Healthy': 0, 'SARS-CoV-2': 1, 'IAV': 2, 'IAVdNS1': 3},
 'A549': {'Healthy': 4, 'SARS-CoV-2': 5, 'IAV': 6, 'RSV': 7, 'HPIV3': 8},
 'Calu3': {'Healthy': 9, 'SARS-CoV-2': 10},
 'Biopsy': {'Healthy': 11, 'SARS-CoV-2': 12}}

In [18]:
y = []
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    y.append(classes[info['Cell Type']][info['Condition']])

In [34]:
y_names = [' '] * 13
for c_type in classes:
    for cond in classes[c_type]:
        y_names[classes[c_type][cond]] = c_type + ' ' + cond
        
y_names

['NHBE Healthy',
 'NHBE SARS-CoV-2',
 'NHBE IAV',
 'NHBE IAVdNS1',
 'A549 Healthy',
 'A549 SARS-CoV-2',
 'A549 IAV',
 'A549 RSV',
 'A549 HPIV3',
 'Calu3 Healthy',
 'Calu3 SARS-CoV-2',
 'Biopsy Healthy',
 'Biopsy SARS-CoV-2']

In [20]:
from sklearn import tree

clf = tree.DecisionTreeClassifier().fit(filtered_data_01, y)

In [42]:
dot_data = tree.export_graphviz(clf, out_file = None, 
                                feature_names = filtered_data_01.columns,
                                class_names = y_names,
                                filled = True, rounded = True,
                                special_characters = True)  
graph = graphviz.Source(dot_data)
graph.render('decision_tree')

'decision_tree.pdf'

### Dual class problems

##### NHBE

In [44]:
cols_healthy = dt.get_columns('NHBE', 'healthy')
cols_cov2 = dt.get_columns('NHBE', 'sars-cov2')

labels = [0] * len(cols_healthy) + [1] * len(cols_cov2)

nhbe_clf = tree.DecisionTreeClassifier().fit(filtered_data_01.T[cols_healthy + cols_cov2].T, labels)

nhbe_features = [x[1] for x in list(sorted(zip(nhbe_clf.feature_importances_, filtered_data_01.columns), reverse = True))]
ea.getEnrichment(nhbe_features[:50], 'GO_Biological_Process_2018')['GO_Biological_Process_2018']

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'regulation of nucleic acid-templated transcription (GO:1903506)',
  2.720042607545891e-15,
  18.489919354838708,
  620.1172977521386,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  7.816866437947851e-13,
  0,
  0],
 [2,
  'regulation of cellular macromolecule biosynthetic process (GO:2000112)',
  5.3175962162910554e-15,
  17.74398450244698,
  583.2049359306333,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  7.816866437947851e-13,
  0,
  0],
 [3,
  'regulation of gene expression (GO:0010468)',
  1.7596116505410074e-13,
  12.411012782694199,
  364.4929912113615,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',


In [46]:
list(sorted(zip(nhbe_clf.feature_importances_, filtered_data_01.columns), reverse = True))

[(1.0, 'EFNA1'),
 (0.0, 'ZSCAN9'),
 (0.0, 'ZSCAN29'),
 (0.0, 'ZNFX1'),
 (0.0, 'ZNF862'),
 (0.0, 'ZNF853'),
 (0.0, 'ZNF850'),
 (0.0, 'ZNF841'),
 (0.0, 'ZNF774'),
 (0.0, 'ZNF773'),
 (0.0, 'ZNF77'),
 (0.0, 'ZNF720'),
 (0.0, 'ZNF669'),
 (0.0, 'ZNF606'),
 (0.0, 'ZNF583'),
 (0.0, 'ZNF566'),
 (0.0, 'ZNF555'),
 (0.0, 'ZNF550'),
 (0.0, 'ZNF549'),
 (0.0, 'ZNF544'),
 (0.0, 'ZNF525'),
 (0.0, 'ZNF419'),
 (0.0, 'ZNF34'),
 (0.0, 'ZNF322'),
 (0.0, 'ZNF28'),
 (0.0, 'ZNF274'),
 (0.0, 'ZNF239'),
 (0.0, 'ZNF235'),
 (0.0, 'ZNF229'),
 (0.0, 'ZNF160'),
 (0.0, 'ZNF136'),
 (0.0, 'ZMYND12'),
 (0.0, 'ZFP28'),
 (0.0, 'ZCWPW1'),
 (0.0, 'ZCCHC9'),
 (0.0, 'ZBTB46'),
 (0.0, 'ZBTB25'),
 (0.0, 'YEATS2'),
 (0.0, 'XDH'),
 (0.0, 'XAF1'),
 (0.0, 'WWC2'),
 (0.0, 'WTAPP1'),
 (0.0, 'WISP2'),
 (0.0, 'WHAMMP1'),
 (0.0, 'WDR41'),
 (0.0, 'WDPCP'),
 (0.0, 'VTRNA1-1'),
 (0.0, 'VSTM1'),
 (0.0, 'VNN1'),
 (0.0, 'VEGFA'),
 (0.0, 'VDAC2'),
 (0.0, 'VAV1'),
 (0.0, 'UTP23'),
 (0.0, 'USP53'),
 (0.0, 'USP40'),
 (0.0, 'USP16'),
 (0.0, 'USP12'

##### A549

In [41]:
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

labels = [0] * len(cols_healthy_A549) + [1] * len(cols_cov2_A549)

a549_clf = tree.DecisionTreeClassifier().fit(filtered_data_01.T[cols_healthy_A549 + cols_cov2_A549].T, labels)

a549_features = [x[1] for x in list(sorted(zip(a549_clf.feature_importances_, filtered_data_01.columns), reverse = True))]
ea.getEnrichment(a549_features[:50], 'GO_Biological_Process_2018')['GO_Biological_Process_2018']

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'response to interferon-beta (GO:0035456)',
  1.3849289100030806e-05,
  79.52393617021276,
  889.6562746439253,
  ['PLSCR1', 'PNPT1', 'STAT1'],
  0.003612564388407677,
  0,
  0],
 [2,
  'negative regulation of viral life cycle (GO:1903901)',
  1.6236244442281695e-05,
  30.347826086956523,
  334.68385319402523,
  ['PLSCR1', 'TNIP1', 'EIF2AK2', 'PTX3'],
  0.003612564388407677,
  0,
  0],
 [3,
  'negative regulation of I-kappaB kinase/NF-kappaB signaling (GO:0043124)',
  0.00022061092484915678,
  28.87717601547389,
  243.12011931420915,
  ['TLE1', 'TNIP1', 'STAT1'],
  0.02360838382802231,
  0,
  0],
 [4,
  'organic hydroxy compound biosynthetic process (GO:1901617)',
  0.0002497855682645102,
  27.618871415356153,
  229.09599013803128,
  ['SRD5A2', 'PRKAG2', 'OSBPL1A'],
  0.02360838382802231,
  0,
  0],
 [5,
  'negative regulation of viral genome replication (GO:0045071)',
  0.00026526273964070014,
  27.02987777274785,
  222.5853605029436,
  ['PLSCR1', 'TNIP1', 'EIF2AK2'],
  0.02360