In [15]:
import sys
sys.path.append("../")

import dataInterpreter as dt
import enrichmentAnalysis as ea
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'

path = "C:\\Users\\Pedro\\Desktop\\bicpams_5.1\\data\\latecovid\\"

In [16]:
filtered_data_01 = pd.read_csv(path + 'data-p01.csv', index_col = 0, sep = '\t').T
filtered_data_05 = pd.read_csv(path + 'data-p05.csv', index_col = 0, sep = '\t').T

In [17]:
def get_info_from_name(col_name):
    result = {}
    info = col_name.split('_')
    result['Series'] = info[0]
    
    if info[1] == 'COVID19Lung':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'SARS-CoV-2'
        return result
    elif info[1] == 'HealthyLungBiopsy':
        result['Cell Type'] = 'Biopsy'
        result['Condition'] = 'Healthy'
        return result
        
    result['Cell Type'] = info[1]
    
    if info[2] == 'Mock':
        result['Condition'] = 'Healthy'
    else:
        result['Condition'] = info[2].replace('.', '-')
    
    return result

get_info_from_name('Series1_NHBE_Mock_1')

{'Series': 'Series1', 'Cell Type': 'NHBE', 'Condition': 'Healthy'}

In [18]:
classes = {'NHBE': {}, 'A549': {}, 'Calu3': {}, 'Biopsy': {}}
i = 0
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    if info['Condition'] not in classes[info['Cell Type']]:
        classes[info['Cell Type']][info['Condition']] = i
        i += 1
        
y = []
for c in filtered_data_01.index:
    info = get_info_from_name(c)
    y.append(classes[info['Cell Type']][info['Condition']])
    
y_names = [' '] * 13
for c_type in classes:
    for cond in classes[c_type]:
        y_names[classes[c_type][cond]] = c_type + ' ' + cond


In [19]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

clf = xgb.XGBClassifier()

clf.fit(filtered_data_01, y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [20]:
genes = [x[1] for x in list(sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True))]
genes

['IL1A',
 'NKAIN1',
 'FSIP1',
 'SAMD9L',
 'EBI3',
 'IFI16',
 'CXCL1',
 'NRTN',
 'UBE2L6',
 'FHL5',
 'DCLK1',
 'PHF11',
 'THEMIS2',
 'GBP2',
 'GFPT2',
 'HCN4',
 'HERC5',
 'TNFRSF9',
 'PDZK1IP1',
 'SSH1',
 'CFB',
 'CASP4',
 'CLDN16',
 'MOCOS',
 'NMNAT1',
 'SPATA21',
 'KRT35',
 'GCH1',
 'KYNU',
 'ICAM1',
 'IFIT3',
 'TLR2',
 'FBLIM1',
 'FHDC1',
 'CCBE1',
 'STX12',
 'MT2A',
 'SAA1',
 'ZBTB25',
 'CX3CL1',
 'C22orf24',
 'RGS17',
 'TRANK1',
 'GRID1',
 'PTPRH',
 'IFITM1',
 'IFI44',
 'SAMD9',
 'FAM49A',
 'SYTL2',
 'SLC6A4',
 'ADD2',
 'PCK2',
 'CXCL5',
 'ANKRD26P1',
 'CATSPER2P1',
 'MCAM',
 'IRF2BPL',
 'IGFBP1',
 'PPP1R16B',
 'ARHGEF4',
 'ZNF862',
 'LOC254896',
 'PITPNA-AS1',
 'MOBP',
 'IRF7',
 'IFIT2',
 'CCL20',
 'GPSM3',
 'MYPN',
 'HBEGF',
 'TRABD2A',
 'ADORA2A',
 'ZNF239',
 'CCL5',
 'PLIN2',
 'JUN',
 'BMPER',
 'TFF1',
 'STC2',
 'FOXD1',
 'RPL36A',
 'MAP1B',
 'NMI',
 'PARP10',
 'C3AR1',
 'MB',
 'CCL2',
 'XAF1',
 'HSF2BP',
 'REEP2',
 'PI3',
 'DAW1',
 'TCTE3',
 'ZSCAN9',
 'ZSCAN29',
 'ZNFX1',
 'Z

In [22]:
for gene in genes:
    print(gene)

IL1A
NKAIN1
FSIP1
SAMD9L
EBI3
IFI16
CXCL1
NRTN
UBE2L6
FHL5
DCLK1
PHF11
THEMIS2
GBP2
GFPT2
HCN4
HERC5
TNFRSF9
PDZK1IP1
SSH1
CFB
CASP4
CLDN16
MOCOS
NMNAT1
SPATA21
KRT35
GCH1
KYNU
ICAM1
IFIT3
TLR2
FBLIM1
FHDC1
CCBE1
STX12
MT2A
SAA1
ZBTB25
CX3CL1
C22orf24
RGS17
TRANK1
GRID1
PTPRH
IFITM1
IFI44
SAMD9
FAM49A
SYTL2
SLC6A4
ADD2
PCK2
CXCL5
ANKRD26P1
CATSPER2P1
MCAM
IRF2BPL
IGFBP1
PPP1R16B
ARHGEF4
ZNF862
LOC254896
PITPNA-AS1
MOBP
IRF7
IFIT2
CCL20
GPSM3
MYPN
HBEGF
TRABD2A
ADORA2A
ZNF239
CCL5
PLIN2
JUN
BMPER
TFF1
STC2
FOXD1
RPL36A
MAP1B
NMI
PARP10
C3AR1
MB
CCL2
XAF1
HSF2BP
REEP2
PI3
DAW1
TCTE3
ZSCAN9
ZSCAN29
ZNFX1
ZNF853
ZNF850
ZNF841
ZNF774
ZNF773
ZNF77
ZNF720
ZNF669
ZNF606
ZNF583
ZNF566
ZNF555
ZNF550
ZNF549
ZNF544
ZNF525
ZNF419
ZNF34
ZNF322
ZNF28
ZNF274
ZNF235
ZNF229
ZNF160
ZNF136
ZMYND12
ZFP28
ZCWPW1
ZCCHC9
ZBTB46
YEATS2
XDH
WWC2
WTAPP1
WISP2
WHAMMP1
WDR41
WDPCP
VTRNA1-1
VSTM1
VNN1
VEGFA
VDAC2
VAV1
UTP23
USP53
USP40
USP16
USP12
UGCG
UBC
UBASH3B
UACA
TYMP
TXNDC9
TUBB2B
TTC39B
TTC25
TSGA10
TRMT10B

In [14]:
ea.getEnrichment(genes[:50], 'GO_Biological_Process_2018')['GO_Biological_Process_2018']

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'cytokine-mediated signaling pathway (GO:0019221)',
  3.557153131429945e-07,
  8.76444884161926,
  130.14448499805303,
  ['IL1A',
   'IFITM1',
   'MT2A',
   'TNFRSF9',
   'EBI3',
   'SAA1',
   'CXCL1',
   'GBP2',
   'CX3CL1',
   'IFIT3',
   'ICAM1'],
  0.00016540762061149244,
  0,
  0],
 [2,
  'response to interferon-gamma (GO:0034341)',
  2.5013799905103905e-05,
  27.019021739130434,
  286.29579393253863,
  ['IFITM1', 'GCH1', 'KYNU', 'CX3CL1'],
  0.005815708477936658,
  0,
  0],
 [3,
  'vesicle fusion (GO:0006906)',
  8.929816660000783e-05,
  19.18840579710145,
  178.9036694469556,
  ['STX12', 'SAMD9L', 'SAMD9', 'SYTL2'],
  0.013841215823001215,
  0,
  0],
 [4,
  'cellular response to interferon-gamma (GO:0071346)',
  0.0002013774170245257,
  15.402173913043478,
  131.07757831135987,
  ['MT2A', 'GBP2', 'CX3CL1', 'ICAM1'],
  0.016898612263532807,
  0,
  0],
 [5,
  'positive regulation of cytokine production (GO:0001819)',
  0.00021793610119514502,
  10.198966408268733,
  85.9906

### Two class comparison

##### NHBE

In [8]:
cols_healthy = dt.get_columns('NHBE', 'healthy')
cols_cov2 = dt.get_columns('NHBE', 'sars-cov2')

labels = [0] * len(cols_healthy) + [1] * len(cols_cov2)

In [9]:
nhbe_clf = xgb.XGBClassifier().fit(filtered_data_01.T[cols_healthy + cols_cov2].T, labels)



In [10]:
nhbe_features = [x[1] for x in list(sorted(zip(nhbe_clf.feature_importances_, filtered_data_01.columns), reverse = True))]

In [11]:
ea.getEnrichment(nhbe_features[:50], 'GO_Biological_Process_2018')['GO_Biological_Process_2018']

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'regulation of nucleic acid-templated transcription (GO:1903506)',
  2.720042607545891e-15,
  18.489919354838708,
  620.1172977521386,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  2.7651500324713486e-13,
  0,
  0],
 [2,
  'regulation of cellular macromolecule biosynthetic process (GO:2000112)',
  5.3175962162910554e-15,
  17.74398450244698,
  583.2049359306333,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  2.7651500324713486e-13,
  0,
  0],
 [3,
  'regulation of gene expression (GO:0010468)',
  2.1264607740085488e-12,
  11.398314215096013,
  306.347500141178,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',

##### A549

In [12]:
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

labels = [0] * len(cols_healthy_A549) + [1] * len(cols_cov2_A549)

a549_clf = xgb.XGBClassifier().fit(filtered_data_05.T[cols_healthy_A549 + cols_cov2_A549].T, labels)

a549_features = [x[1] for x in list(sorted(zip(a549_clf.feature_importances_, filtered_data_01.columns), reverse = True))]
ea.getEnrichment(a549_features[:50], 'GO_Biological_Process_2018')['GO_Biological_Process_2018']

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'regulation of nucleic acid-templated transcription (GO:1903506)',
  2.720042607545891e-15,
  18.489919354838708,
  620.1172977521386,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  2.7385620513898934e-13,
  0,
  0],
 [2,
  'regulation of cellular macromolecule biosynthetic process (GO:2000112)',
  5.3175962162910554e-15,
  17.74398450244698,
  583.2049359306333,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',
   'ZNF34',
   'ZNF419',
   'ZNF606',
   'ZNF549',
   'ZNF229',
   'ZNF239',
   'ZNF525',
   'ZNF544',
   'ZNF841',
   'ZNF235',
   'ZNF136',
   'ZNF773',
   'ZNF322',
   'ZFP28'],
  2.7385620513898934e-13,
  0,
  0],
 [3,
  'regulation of gene expression (GO:0010468)',
  2.1264607740085488e-12,
  11.398314215096013,
  306.347500141178,
  ['ZNF28',
   'ZNF583',
   'ZNF274',
   'ZNF160',