In [2]:
import sys
sys.path.append("../")

import dataInterpreter as dt
import enrichmentAnalysis as ea
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'

import xgboost as xgb
from sklearn.metrics import mean_squared_error

path = "C:\\Users\\Pedro\\Documents\\BicPAMS\\bicpams_5.1\\data\\latecovid\\"

In [2]:
filtered_data_01 = pd.read_csv(path + 'data-p01.csv', index_col = 0, sep = '\t').T
filtered_data_05 = pd.read_csv(path + 'data-p05.csv', index_col = 0, sep = '\t').T

In [3]:
classes = {'NHBE': {}, 'A549': {}, 'Calu3': {}, 'Biopsy': {}}
i = 0
for c in filtered_data_01.index:
    info = dt.get_info_from_name(c)
    if info['Condition'] not in classes[info['Cell Type']]:
        classes[info['Cell Type']][info['Condition']] = i
        i += 1
        
y = []
for c in filtered_data_01.index:
    info = dt.get_info_from_name(c)
    y.append(classes[info['Cell Type']][info['Condition']])
    
y_names = [' '] * 13
for c_type in classes:
    for cond in classes[c_type]:
        y_names[classes[c_type][cond]] = c_type + ' ' + cond


In [6]:
parameters = {
    'seed': 42, 
    'use_label_encoder': False,
    'booster': 'gbtree',
    'eta': 0.3,
    'gamma': 0,
    'alpha': 0,
    'n_estimators': 400,
    'eval_metric': 'mlogloss'
}

stats = dt.apply_loocv(filtered_data_01.values, np.array(y), xgb.XGBClassifier(**parameters))
stats

{'accuracy': 0.8333333333333334,
 'importances': array([1.51567477e-02, 1.87414878e-03, 2.25780616e-04, 1.53541223e-02,
        1.07414615e-03, 8.30417431e-03, 1.03555782e-03, 9.25054644e-03,
        2.27620066e-05, 2.35013421e-03, 3.03968835e-05, 3.90527527e-03,
        1.69028337e-02, 1.14295912e-03, 7.40122368e-02, 4.49351856e-04,
        2.07753115e-04, 7.52164859e-04, 2.17970493e-03, 1.02165082e-03,
        2.08289449e-04, 7.29739627e-04, 1.95741645e-02, 2.88464549e-03,
        0.00000000e+00, 1.92097387e-03, 1.28293403e-06, 1.17236435e-03,
        3.48453163e-04, 2.28825638e-04, 6.84369476e-03, 0.00000000e+00,
        2.59701949e-04, 2.11082198e-03, 9.35280095e-03, 0.00000000e+00,
        1.12821759e-05, 6.25204036e-04, 1.77100986e-04, 3.07144733e-03,
        1.88006312e-04, 1.69814823e-04, 9.93971402e-04, 4.14471061e-02,
        0.00000000e+00, 4.96382590e-06, 8.79292670e-04, 0.00000000e+00,
        2.55225588e-05, 9.36857860e-04, 0.00000000e+00, 1.93885027e-04,
        1.729243

In [7]:
clf = xgb.XGBClassifier(**parameters).fit(filtered_data_01, y)

list(sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True))

[(0.17600389, 'IL1A'),
 (0.08085336, 'NKAIN1'),
 (0.040582884, 'FSIP1'),
 (0.033085812, 'SAMD9L'),
 (0.025526734, 'EBI3'),
 (0.024465673, 'IFI16'),
 (0.024256032, 'CXCL1'),
 (0.023557836, 'NRTN'),
 (0.02271954, 'UBE2L6'),
 (0.020057864, 'FHL5'),
 (0.018765423, 'DCLK1'),
 (0.016079819, 'PHF11'),
 (0.0154207535, 'THEMIS2'),
 (0.015012367, 'GBP2'),
 (0.014706367, 'GFPT2'),
 (0.014257988, 'HCN4'),
 (0.012703476, 'HERC5'),
 (0.012473829, 'TNFRSF9'),
 (0.012232057, 'PDZK1IP1'),
 (0.011772433, 'SSH1'),
 (0.011768914, 'CFB'),
 (0.0113923615, 'CASP4'),
 (0.011376115, 'CLDN16'),
 (0.011316258, 'MOCOS'),
 (0.011273014, 'NMNAT1'),
 (0.011028717, 'SPATA21'),
 (0.010886526, 'KRT35'),
 (0.010886447, 'GCH1'),
 (0.010847612, 'KYNU'),
 (0.010838783, 'ICAM1'),
 (0.010381575, 'IFIT3'),
 (0.01029181, 'TLR2'),
 (0.010231779, 'FBLIM1'),
 (0.010216653, 'FHDC1'),
 (0.0100672785, 'CCBE1'),
 (0.0100054415, 'STX12'),
 (0.009745724, 'MT2A'),
 (0.009466907, 'SAA1'),
 (0.009324187, 'ZBTB25'),
 (0.00850817, 'CX3CL1')

In [17]:
selected_genes = [x[1] for x in list(sorted(zip(clf.feature_importances_, filtered_data_01.columns), reverse = True)) if x[0] > 0]
selected_genes

['IL1A',
 'NKAIN1',
 'FSIP1',
 'SAMD9L',
 'EBI3',
 'IFI16',
 'CXCL1',
 'NRTN',
 'UBE2L6',
 'FHL5',
 'DCLK1',
 'PHF11',
 'THEMIS2',
 'GBP2',
 'GFPT2',
 'HCN4',
 'HERC5',
 'TNFRSF9',
 'PDZK1IP1',
 'SSH1',
 'CFB',
 'CASP4',
 'CLDN16',
 'MOCOS',
 'NMNAT1',
 'SPATA21',
 'KRT35',
 'GCH1',
 'KYNU',
 'ICAM1',
 'IFIT3',
 'TLR2',
 'FBLIM1',
 'FHDC1',
 'CCBE1',
 'STX12',
 'MT2A',
 'SAA1',
 'ZBTB25',
 'CX3CL1',
 'C22orf24',
 'RGS17',
 'TRANK1',
 'GRID1',
 'PTPRH',
 'IFITM1',
 'IFI44',
 'SAMD9',
 'FAM49A',
 'SYTL2',
 'SLC6A4',
 'ADD2',
 'PCK2',
 'CXCL5',
 'ANKRD26P1',
 'CATSPER2P1',
 'MCAM',
 'IRF2BPL',
 'IGFBP1',
 'PPP1R16B',
 'ARHGEF4',
 'ZNF862',
 'LOC254896',
 'PITPNA-AS1',
 'MOBP',
 'IRF7',
 'IFIT2',
 'CCL20',
 'GPSM3',
 'MYPN',
 'HBEGF',
 'TRABD2A',
 'ADORA2A',
 'ZNF239',
 'CCL5',
 'PLIN2',
 'JUN',
 'BMPER',
 'TFF1',
 'STC2',
 'FOXD1',
 'RPL36A',
 'MAP1B',
 'NMI',
 'PARP10',
 'C3AR1',
 'MB',
 'CCL2',
 'XAF1',
 'HSF2BP',
 'REEP2',
 'PI3',
 'DAW1',
 'TCTE3']

In [18]:
results = ea.getEnrichment(selected_genes, 'GO_Biological_Process_2021')['GO_Biological_Process_2021']
results

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'response to interferon-gamma (GO:0034341)',
  1.3336127778476142e-10,
  29.579950289975145,
  672.5877056730069,
  ['IFITM1',
   'GCH1',
   'CCL20',
   'KYNU',
   'CCL5',
   'CCL2',
   'GBP2',
   'CX3CL1',
   'TLR2'],
  1.19625066172931e-07,
  0,
  0],
 [2,
  'cytokine-mediated signaling pathway (GO:0019221)',
  5.636790875266018e-10,
  7.581696779261587,
  161.46387855314202,
  ['IFITM1',
   'CCL20',
   'TNFRSF9',
   'EBI3',
   'CXCL1',
   'CX3CL1',
   'CXCL5',
   'IFIT3',
   'IFIT2',
   'ICAM1',
   'IL1A',
   'MT2A',
   'CCL5',
   'IRF7',
   'SAA1',
   'CCL2',
   'GBP2',
   'XAF1'],
  2.528100707556809e-07,
  0,
  0],
 [3,
  'cellular response to interferon-gamma (GO:0071346)',
  5.5515596247830145e-09,
  18.71281512605042,
  355.7154008147536,
  ['MT2A', 'CCL20', 'CCL5', 'IRF7', 'CCL2', 'GBP2', 'CX3CL1', 'TLR2', 'ICAM1'],
  1.6599163278101214e-06,
  0,
  0],
 [4,
  'neutrophil chemotaxis (GO:0030593)',
  3.8032671024636885e-08,
  25.34227330779055,
  432.9681850211753,
  ['C

In [8]:
import json

with open('results_xGBoost.json', 'w') as file:
     #file.write(json.dumps(results)) # use `json.loads` to do the reverse

NameError: name 'results' is not defined

In [9]:
import json

with open('results_xGBoost.json') as file:
    results = json.load(file)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [20]:
dataset = {'p-value': [], 'Score': []}
index = []

for term in results:
    index += [term[1]]
    dataset['p-value'] += [term[6]]
    dataset['Score'] += [term[4]]
enrichment_dataset = pd.DataFrame(dataset, index = index)

In [21]:
pd.set_option("display.max_rows", None)
selection = enrichment_dataset[enrichment_dataset['p-value'] < 0.01].sort_values('Score', ascending = False).head(25)

selection['p-value'] = selection['p-value'].map(lambda x: '%.2E' % x)
selection['Score'] = selection['Score'].map(lambda x: '%.2f' % x)

#selection.to_csv('xGBoost_table.csv')
selection

Unnamed: 0,p-value,Score
ISG15-protein conjugation (GO:0032020),0.00761,869.12
macrophage chemotaxis (GO:0048246),0.0012,783.48
response to interferon-gamma (GO:0034341),1.2e-07,672.59
nicotinamide nucleotide biosynthetic process (GO:0019359),0.00989,666.41
regulation of natural killer cell chemotaxis (GO:2000501),0.00989,666.41
macrophage migration (GO:1905517),0.002,548.36
eosinophil migration (GO:0072677),0.00201,495.85
eosinophil chemotaxis (GO:0048245),0.00201,495.85
lymphocyte migration (GO:0072676),8.21e-05,435.11
neutrophil chemotaxis (GO:0030593),8.53e-06,432.97


### Two class comparison

##### NHBE

In [22]:
cols_nhbe_healthy = dt.get_columns('NHBE', 'healthy')
cols_nhbe_cov2 = dt.get_columns('NHBE', 'sars-cov2')

labels_nhbe = [0] * len(cols_nhbe_healthy) + [1] * len(cols_nhbe_cov2)

data_nhbe = dt.get_data('NHBE', 'healthy', 'sars-cov2')

filtered_data_NHBE = dt.get_p_values('mannwhitneyu', data_nhbe, cols_nhbe_healthy, cols_nhbe_cov2)
filtered_data_NHBE.drop(['p-value'], axis = 1, inplace = True)

len(filtered_data_NHBE.index)

912

In [23]:
parameters = {
    'seed': 42, 
    'use_label_encoder': False,
    'booster': 'gbtree',
    'eta': 0.3,
    'gamma': 0,
    'alpha': 0,
    'n_estimators': 200,
    'eval_metric': 'mlogloss',
}

stats_NHBE = dt.apply_loocv(filtered_data_NHBE.T.values, np.array(labels_nhbe), xgb.XGBClassifier(**parameters))
stats_NHBE

{'accuracy': 0.7,
 'importances': array([0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 

In [24]:
nhbe_clf = xgb.XGBClassifier(**parameters).fit(filtered_data_NHBE.T, labels_nhbe)

list(sorted(zip(nhbe_clf.feature_importances_, filtered_data_NHBE.T.columns), reverse = True))

[(0.42095545, 'RNF223'),
 (0.21817274, 'CLCN6'),
 (0.17672779, 'PDZK1IP1'),
 (0.17444858, 'S100A9'),
 (0.009695429, 'HIF1A-AS2'),
 (0.0, 'ZSWIM4'),
 (0.0, 'ZP3'),
 (0.0, 'ZNRF3-AS1'),
 (0.0, 'ZNF792'),
 (0.0, 'ZNF726'),
 (0.0, 'ZNF697'),
 (0.0, 'ZNF676'),
 (0.0, 'ZNF625'),
 (0.0, 'ZNF578'),
 (0.0, 'ZNF483'),
 (0.0, 'ZNF439'),
 (0.0, 'ZNF319'),
 (0.0, 'ZNF300P1'),
 (0.0, 'ZNF275'),
 (0.0, 'ZNF252P-AS1'),
 (0.0, 'ZNF169'),
 (0.0, 'ZMYND12'),
 (0.0, 'ZG16B'),
 (0.0, 'ZG16'),
 (0.0, 'ZFP57'),
 (0.0, 'ZFP42'),
 (0.0, 'ZDHHC23'),
 (0.0, 'ZC3H12C'),
 (0.0, 'ZC3H12A'),
 (0.0, 'YBX2'),
 (0.0, 'XDH'),
 (0.0, 'XAF1'),
 (0.0, 'WWC1'),
 (0.0, 'WTAPP1'),
 (0.0, 'WNT4'),
 (0.0, 'WNT16'),
 (0.0, 'WNT11'),
 (0.0, 'WNK3'),
 (0.0, 'WNK2'),
 (0.0, 'WFDC5'),
 (0.0, 'WDR86-AS1'),
 (0.0, 'WAS'),
 (0.0, 'VNN3'),
 (0.0, 'VNN2'),
 (0.0, 'VNN1'),
 (0.0, 'VMO1'),
 (0.0, 'VEGFA'),
 (0.0, 'VAV1'),
 (0.0, 'USP54'),
 (0.0, 'USP35'),
 (0.0, 'USP18'),
 (0.0, 'USB1'),
 (0.0, 'UPK1A'),
 (0.0, 'UNKL'),
 (0.0, 'UGDH-AS1'),

In [25]:
selected_genes_nhbe = [x[1] for x in list(sorted(zip(nhbe_clf.feature_importances_, filtered_data_NHBE.T.columns), reverse = True)) if x[0] > 0]
selected_genes_nhbe

['RNF223', 'CLCN6', 'PDZK1IP1', 'S100A9', 'HIF1A-AS2']

In [26]:
results_nhbe = ea.getEnrichment(selected_genes_nhbe, 'GO_Biological_Process_2021')['GO_Biological_Process_2021']
results_nhbe

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'regulation of integrin biosynthetic process (GO:0045113)',
  0.0012494733811348967,
  1249.4375,
  8352.531058286915,
  ['S100A9'],
  0.01320794324536269,
  0,
  0],
 [2,
  'peptidyl-cysteine S-nitrosylation (GO:0018119)',
  0.0012494733811348967,
  1249.4375,
  8352.531058286915,
  ['S100A9'],
  0.01320794324536269,
  0,
  0],
 [3,
  'chronic inflammatory response (GO:0002544)',
  0.0012494733811348967,
  1249.4375,
  8352.531058286915,
  ['S100A9'],
  0.01320794324536269,
  0,
  0],
 [4,
  'astrocyte development (GO:0014002)',
  0.0014992198216206949,
  999.5,
  6499.5590198897235,
  ['S100A9'],
  0.01320794324536269,
  0,
  0],
 [5,
  'regulation of macromolecule biosynthetic process (GO:0010556)',
  0.0014992198216206949,
  999.5,
  6499.5590198897235,
  ['S100A9'],
  0.01320794324536269,
  0,
  0],
 [6,
  'astrocyte differentiation (GO:0048708)',
  0.0017489165149326186,
  832.875,
  5287.722499803602,
  ['S100A9'],
  0.01320794324536269,
  0,
  0],
 [7,
  'leukocyte aggre

In [27]:
import json

with open('results_xGBoost_NHBE.json', 'w') as file:
     #file.write(json.dumps(results_nhbe)) # use `json.loads` to do the reverse

In [8]:
import json

with open('results_xGBoost_NHBE.json') as file:
    results_nhbe = json.load(file)

In [28]:
dataset = {'p-value': [], 'Score': [], 'Value': []}
index_nhbe = []

cols_nhbe_healthy = dt.get_columns('NHBE', 'healthy')
cols_nhbe_cov2 = dt.get_columns('NHBE', 'sars-cov2')

data_nhbe = dt.get_data('NHBE', 'healthy', 'sars-cov2')

for term in results_nhbe:
    index_nhbe += [term[1]]
    dataset['p-value'] += [term[6]]
    dataset['Score'] += [term[4]]
    
    genes = term[5]
    avg_sub = np.mean(data_nhbe.loc[genes, cols_nhbe_healthy].values, axis = 1) - np.mean(data_nhbe.loc[genes, cols_nhbe_cov2].values, axis = 1)

    downs = 0
    ups = 0

    for e in avg_sub:
        if e > 0:
            downs += 1
        elif e < 0:
            ups += 1

    dataset['Value'] += ['%d up, %d down' % (ups, downs)]
    
enrichment_nhbe_dataset = pd.DataFrame(dataset, index = index_nhbe)

In [32]:
pd.set_option("display.max_rows", None)
selection = enrichment_nhbe_dataset[enrichment_nhbe_dataset['p-value'] < 0.05].sort_values('Score', ascending = False)#.head(25)[['p-value', 'Score']]

selection['p-value'] = selection['p-value'].map(lambda x: '%.2E' % x)
selection['Score'] = selection['Score'].map(lambda x: '%.2f' % x)

#selection.to_csv('NHBE_xGBoost_table.csv')
selection

Unnamed: 0,p-value,Score,Value
regulation of integrin biosynthetic process (GO:0045113),0.0132,8352.53,"1 up, 0 down"
chronic inflammatory response (GO:0002544),0.0132,8352.53,"1 up, 0 down"
peptidyl-cysteine S-nitrosylation (GO:0018119),0.0132,8352.53,"1 up, 0 down"
astrocyte development (GO:0014002),0.0132,6499.56,"1 up, 0 down"
regulation of macromolecule biosynthetic process (GO:0010556),0.0132,6499.56,"1 up, 0 down"
astrocyte differentiation (GO:0048708),0.0132,5287.72,"1 up, 0 down"
leukocyte aggregation (GO:0070486),0.0132,4436.86,"1 up, 0 down"
peptidyl-cysteine modification (GO:0018198),0.0132,3808.55,"1 up, 0 down"
defense response to fungus (GO:0050832),0.0305,1111.12,"1 up, 0 down"
glial cell development (GO:0021782),0.0305,1006.18,"1 up, 0 down"


##### A549

In [33]:
cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

labels_a549 = [0] * len(cols_healthy_A549) + [1] * len(cols_cov2_A549)

data_a549 = dt.get_data('A549', 'healthy', 'sars-cov2')

filtered_data_a549 = dt.get_p_values('mannwhitneyu', data_a549, cols_healthy_A549, cols_cov2_A549, limit = 0.01)
filtered_data_a549.drop(['p-value'], axis = 1, inplace = True)

len(filtered_data_a549)

5528

In [72]:
parameters = {
    'seed': 42, 
    'use_label_encoder': False,
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'gamma': 0,
    'alpha': 0,
    'n_estimators': 300,
    'eval_metric': 'mlogloss',
}

stats_a549 = dt.apply_loocv(filtered_data_a549.T.values, np.array(labels_a549), xgb.XGBClassifier(**parameters))
stats_a549

{'accuracy': 0.7894736842105263,
 'importances': array([0.00914982, 0.        , 0.02022538, ..., 0.        , 0.        ,
        0.        ])}

In [69]:
a549_clf = xgb.XGBClassifier(**parameters).fit(filtered_data_a549.T, labels_a549)

list(sorted(zip(a549_clf.feature_importances_, filtered_data_a549.T.columns), reverse = True))

[(0.22577429, 'GBP1'),
 (0.0697644, 'DMC1'),
 (0.06970843, 'IL15RA'),
 (0.069486104, 'CYB5R2'),
 (0.06559767, 'TNFRSF9'),
 (0.058563143, 'NKAIN1'),
 (0.044358764, 'C3AR1'),
 (0.04350054, 'NALCN'),
 (0.043478355, 'SLC2A1'),
 (0.043475352, 'TGFBR3L'),
 (0.04291595, 'CD274'),
 (0.040262137, 'THEMIS2'),
 (0.040077094, 'FLJ37201'),
 (0.03792867, 'LOC374443'),
 (0.036975276, 'RAB3B'),
 (0.0360612, 'RHBDL2'),
 (0.014611983, 'DPEP1'),
 (0.014435125, 'GPR87'),
 (0.003025485, 'ST6GALNAC3'),
 (0.0, 'ZZZ3'),
 (0.0, 'ZYX'),
 (0.0, 'ZYG11B'),
 (0.0, 'ZYG11A'),
 (0.0, 'ZXDC'),
 (0.0, 'ZXDB'),
 (0.0, 'ZXDA'),
 (0.0, 'ZSWIM4'),
 (0.0, 'ZSWIM3'),
 (0.0, 'ZSCAN9'),
 (0.0, 'ZSCAN32'),
 (0.0, 'ZSCAN31'),
 (0.0, 'ZSCAN29'),
 (0.0, 'ZSCAN26'),
 (0.0, 'ZSCAN21'),
 (0.0, 'ZSCAN20'),
 (0.0, 'ZSCAN12'),
 (0.0, 'ZRANB3'),
 (0.0, 'ZRANB2-AS2'),
 (0.0, 'ZRANB1'),
 (0.0, 'ZNRF2'),
 (0.0, 'ZNHIT6'),
 (0.0, 'ZNHIT3'),
 (0.0, 'ZNFX1'),
 (0.0, 'ZNF92'),
 (0.0, 'ZNF879'),
 (0.0, 'ZNF860'),
 (0.0, 'ZNF853'),
 (0.0, 'ZNF85

In [70]:
selected_genes_a549 = [x[1] for x in list(sorted(zip(a549_clf.feature_importances_, filtered_data_a549.T.columns), reverse = True)) if x[0] > 0]
selected_genes_a549

['GBP1',
 'DMC1',
 'IL15RA',
 'CYB5R2',
 'TNFRSF9',
 'NKAIN1',
 'C3AR1',
 'NALCN',
 'SLC2A1',
 'TGFBR3L',
 'CD274',
 'THEMIS2',
 'FLJ37201',
 'LOC374443',
 'RAB3B',
 'RHBDL2',
 'DPEP1',
 'GPR87',
 'ST6GALNAC3']

In [71]:
results_a549 = ea.getEnrichment(selected_genes_a549, 'GO_Biological_Process_2021')['GO_Biological_Process_2021']
results_a549

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'glucose import across plasma membrane (GO:0098708)',
  0.00474139160939957,
  277.4583333333333,
  1484.7973498875915,
  ['SLC2A1'],
  0.06779953883889153,
  0,
  0],
 [2,
  'disaccharide biosynthetic process (GO:0046351)',
  0.00474139160939957,
  277.4583333333333,
  1484.7973498875915,
  ['SLC2A1'],
  0.06779953883889153,
  0,
  0],
 [3,
  'double-strand break repair via synthesis-dependent strand annealing (GO:0045003)',
  0.00474139160939957,
  277.4583333333333,
  1484.7973498875915,
  ['DMC1'],
  0.06779953883889153,
  0,
  0],
 [4,
  'regulation of resting membrane potential (GO:0060075)',
  0.005687115738983334,
  221.95555555555555,
  1147.4107993095174,
  ['NALCN'],
  0.06779953883889153,
  0,
  0],
 [5,
  'negative regulation of CD8-positive, alpha-beta T cell activation (GO:2001186)',
  0.005687115738983334,
  221.95555555555555,
  1147.4107993095174,
  ['CD274'],
  0.06779953883889153,
  0,
  0],
 [6,
  'glycosylceramide metabolic process (GO:0006677)',
  0.005687

In [38]:
import json

with open('results_xGBoost_A549.json', 'w') as file:
     #file.write(json.dumps(results_a549)) # use `json.loads` to do the reverse

In [18]:
import json

with open('results_xGBoost_A549.json') as file:
    results_a549 = json.load(file)

In [39]:
dataset = {'p-value': [], 'Score': [], 'Value': []}
index_a549 = []

cols_healthy_A549 = dt.get_columns('A549', 'healthy')
cols_cov2_A549 = dt.get_columns('A549', 'sars-cov2')

data_a549 = dt.get_data('A549', 'healthy', 'sars-cov2')

for term in results_a549:
    index_a549 += [term[1]]
    dataset['p-value'] += [term[6]]
    dataset['Score'] += [term[4]]
    
    genes = term[5]
    avg_sub = np.mean(data_a549.loc[genes, cols_healthy_A549].values, axis = 1) - np.mean(data_a549.loc[genes, cols_cov2_A549].values, axis = 1)

    downs = 0
    ups = 0

    for e in avg_sub:
        if e > 0:
            downs += 1
        elif e < 0:
            ups += 1

    dataset['Value'] += ['%d up, %d down' % (ups, downs)]
enrichment_a549_dataset = pd.DataFrame(dataset, index = index_a549)

In [42]:
pd.set_option("display.max_rows", None)
selection = enrichment_a549_dataset[enrichment_a549_dataset['p-value'] < 0.05].sort_values('Score', ascending = False).head(25)[['p-value', 'Score']]

selection['p-value'] = selection['p-value'].map(lambda x: '%.2E' % x)
selection['Score'] = selection['Score'].map(lambda x: '%.2f' % x)

#selection.to_csv('A549_xGBoost_table.csv')
selection

Unnamed: 0,p-value,Score
negative regulation of substrate adhesion-dependent cell spreading (GO:1900025),0.0149,3304.67
negative regulation of cell morphogenesis involved in differentiation (GO:0010771),0.0149,3304.67
protein localization to vacuole (GO:0072665),0.0149,3012.37
regulation of lymphocyte activation (GO:0051249),0.0149,2764.28
negative regulation of T cell receptor signaling pathway (GO:0050860),0.0149,2062.22
regulation of protein localization to cell periphery (GO:1904375),0.0149,1935.63
negative regulation of protein localization to plasma membrane (GO:1903077),0.0149,1822.54
negative regulation of protein localization to cell periphery (GO:1904376),0.0149,1822.54
negative regulation of interleukin-2 production (GO:0032703),0.0149,1720.94
negative regulation of antigen receptor-mediated signaling pathway (GO:0050858),0.0149,1470.2


##### Calu3

In [49]:
cols_healthy_Calu3 = dt.get_columns('Calu3', 'healthy')
cols_cov2_Calu3 = dt.get_columns('Calu3', 'sars-cov2')

labels_calu3 = [0] * len(cols_healthy_Calu3) + [1] * len(cols_cov2_Calu3)

data_calu3 = dt.get_data('Calu3', 'healthy', 'sars-cov2')

filtered_data_calu3 = dt.get_p_values('mannwhitneyu', data_calu3, cols_healthy_Calu3, cols_cov2_Calu3)
filtered_data_calu3.drop(['p-value'], axis = 1, inplace = True)

len(filtered_data_calu3.index)

1014

In [86]:
parameters = {
    'seed': 42, 
    'use_label_encoder': False,
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'gamma': 0,
    'alpha': 0,
    'n_estimators': 100,
    'eval_metric': 'mlogloss',
}

stats_calu3 = dt.apply_loocv(filtered_data_calu3.T.values, np.array(labels_calu3), xgb.XGBClassifier(**parameters))
stats_calu3

{'accuracy': 0.0, 'importances': array([nan, nan, nan, ..., nan, nan, nan])}

In [85]:
calu3_clf = xgb.XGBClassifier(**parameters).fit(filtered_data_calu3.T, labels_calu3)

list(sorted(zip(calu3_clf.feature_importances_, filtered_data_calu3.T.columns), reverse = True))

  return all_features / all_features.sum()


[(nan, 'ISG15'),
 (nan, 'MIR200A'),
 (nan, 'TNFRSF9'),
 (nan, 'CA6'),
 (nan, 'SLC2A5'),
 (nan, 'NPPA-AS1'),
 (nan, 'TNFRSF8'),
 (nan, 'AGMAT'),
 (nan, 'LOC100506801'),
 (nan, 'NUDC'),
 (nan, 'THEMIS2'),
 (nan, 'PTAFR'),
 (nan, 'RAB42'),
 (nan, 'DNALI1'),
 (nan, 'EDN2'),
 (nan, 'RIMKLA'),
 (nan, 'DMBX1'),
 (nan, 'SLC5A9'),
 (nan, 'FAM151A'),
 (nan, 'DHCR24'),
 (nan, 'PCSK9'),
 (nan, 'JUN'),
 (nan, 'PDE4B'),
 (nan, 'GADD45A'),
 (nan, 'CTH'),
 (nan, 'CRYZ'),
 (nan, 'ST6GALNAC5'),
 (nan, 'DNAJB4'),
 (nan, 'PTGFR'),
 (nan, 'IFI44L'),
 (nan, 'IFI44'),
 (nan, 'GBP1'),
 (nan, 'GBP4'),
 (nan, 'GBP5'),
 (nan, 'GBP6'),
 (nan, 'GBP1P1'),
 (nan, 'GFI1'),
 (nan, 'F3'),
 (nan, 'PALMD'),
 (nan, 'VCAM1'),
 (nan, 'OLFM3'),
 (nan, 'VAV3'),
 (nan, 'GNAT2'),
 (nan, 'CSF1'),
 (nan, 'UBL4B'),
 (nan, 'SPAG17'),
 (nan, 'TBX15'),
 (nan, 'TXNIP'),
 (nan, 'PDZK1'),
 (nan, 'ANXA9'),
 (nan, 'TUFT1'),
 (nan, 'NPR1'),
 (nan, 'KCNN3'),
 (nan, 'FDPS'),
 (nan, 'MSTO1'),
 (nan, 'IFI16'),
 (nan, 'APCS'),
 (nan, 'VSIG8'),


In [55]:
xgb.XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [36]:
selected_genes_a549 = [x[1] for x in list(sorted(zip(a549_clf.feature_importances_, filtered_data_a549.T.columns), reverse = True)) if x[0] > 0]
selected_genes_a549

['GBP1', 'TNFRSF9', 'NKAIN1', 'THEMIS2']

In [127]:
results_calu3 = ea.getEnrichment(calu3_features[:92], 'GO_Biological_Process_2021')['GO_Biological_Process_2021']
results_calu3

Order of returned results is: Rank, Term name, P-value, Z-score, Combined score, Overlapping genes, Adjusted p-value, Old p-value, Old adjusted p-value


[[1,
  'negative regulation of receptor signaling pathway via JAK-STAT (GO:0046426)',
  5.050773095191729e-05,
  51.58599827139153,
  510.3600974147166,
  ['SOCS1', 'BCL3', 'PARP14'],
  0.042224463075802854,
  0,
  0],
 [2,
  'defense response to symbiont (GO:0140546)',
  0.000270058490856001,
  9.557133198106829,
  78.52974007666207,
  ['ZBP1', 'IFIH1', 'STAT2', 'EIF2AK2', 'IFIT2'],
  0.08445343429239259,
  0,
  0],
 [3,
  'defense response to virus (GO:0051607)',
  0.0003732114055396177,
  8.88110632183908,
  70.10181849122506,
  ['ZBP1', 'IFIH1', 'STAT2', 'EIF2AK2', 'IFIT2'],
  0.08445343429239259,
  0,
  0],
 [4,
  'B cell proliferation (GO:0042100)',
  0.0004241446308301244,
  23.106160402944596,
  179.42941098047973,
  ['CD79A', 'CD70', 'RASGRP1'],
  0.08445343429239259,
  0,
  0],
 [5,
  'B cell activation (GO:0042113)',
  0.0006387233646400902,
  11.126262626262626,
  81.8452230993444,
  ['CD79A', 'CD70', 'NFAM1', 'RASGRP1'],
  0.08445343429239259,
  0,
  0],
 [6,
  'regulation

In [128]:
import json

with open('results_xGBoost_Calu3.json', 'w') as file:
     file.write(json.dumps(results_calu3)) # use `json.loads` to do the reverse

In [3]:
import json

with open('results_xGBoost_Calu3.json') as file:
    results_calu3 = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'results_xGBoost_Calu3.json'

In [1]:
dataset = {'p-value': [], 'Score': [], 'Value': []}
index_calu3 = []

cols_healthy_Calu3 = dt.get_columns('Calu3', 'healthy')
cols_cov2_Calu3 = dt.get_columns('Calu3', 'sars-cov2')

data_calu3 = dt.get_data('Calu3', 'healthy', 'sars-cov2')

for term in results_calu3:
    index_calu3 += [term[1]]
    dataset['p-value'] += [term[6]]
    dataset['Score'] += [term[4]]
    
    genes = term[5]
    avg_sub = np.mean(data_calu3.loc[genes, cols_healthy_Calu3].values, axis = 1) - np.mean(data_calu3.loc[genes, cols_cov2_Calu3].values, axis = 1)

    downs = 0
    ups = 0

    for e in avg_sub:
        if e > 0:
            downs += 1
        elif e < 0:
            ups += 1

    dataset['Value'] += ['%d up, %d down' % (ups, downs)]
    
enrichment_calu3_dataset = pd.DataFrame(dataset, index = index_calu3)

NameError: name 'dt' is not defined

In [130]:
pd.set_option("display.max_rows", None)
selection = enrichment_calu3_dataset[enrichment_calu3_dataset['p-value'] < 0.05].sort_values('Score', ascending = False).head(25)[['p-value', 'Score']]

selection['p-value'] = selection['p-value'].map(lambda x: '%.2E' % x)
selection['Score'] = selection['Score'].map(lambda x: '%.2f' % x)

#selection.to_csv('Calu3_xGBoost_table.csv')
selection

Unnamed: 0,p-value,Score
negative regulation of receptor signaling pathway via JAK-STAT (GO:0046426),0.0422,510.36
