In [5]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import scipy
import itertools
import pickle 
from statsmodels.stats.multitest import fdrcorrection as fdr

This script performs differential gene expression analysis on data corresponding to different respiratory diseases. Genes that are not statistically significant are filtered out, both to prepare the next step of the analysis and to reduce the size of the network.

In [2]:
# Data folder located under the current working directory; the trailing slash
# is kept because later cells build file names by plain string concatenation.
path = '{}/normalized data/'.format(os.getcwd())
# Show which files are available in that folder.
os.listdir(path)

['respiratory_data.csv',
 'covid_data.csv',
 'genes_of_interest.csv',
 'respiratory_conditions_degs.pkl',
 'respiratory_metadata.csv',
 'covid_metadata.csv']

## DEGs Function

In [39]:
def calculate_degs(mapped_omics, sample, combo_func = 'mean', test_type = 'student_t'):
    """Run per-gene two-sample tests for every ordered pair of condition groups.

    Parameters
    ----------
    mapped_omics : pandas.DataFrame
        Expression table with a ``genes`` column and one column per sample.
        Sample columns are selected by the index labels of the sample table,
        and each row is tested independently.
    sample : str or path-like
        Path to a comma-separated sample table. Its first column is used as
        the index (sample identifiers) and it must contain a ``condition``
        column that assigns every sample to a group.
    combo_func : {'mean', 'gmean', 'median'}
        How the samples of one group are collapsed into a single value per
        row before the fold change is taken.
    test_type : {'student_t', 'mann_u'}
        ``'student_t'`` runs ``scipy.stats.ttest_ind`` with
        ``equal_var=False``; ``'mann_u'`` runs ``scipy.stats.mannwhitneyu``.

    Returns
    -------
    dict or None
        Maps each ordered pair ``(i, j)`` with ``i != j`` to a DataFrame with
        the columns ``genes``, ``statistic``, ``pvalue`` and ``log2fc``.
        ``statistic``/``pvalue`` come from testing group ``i`` against group
        ``j``; ``log2fc`` is ``log2(combined_j / combined_i)``.
        ``None`` is returned (after printing a message) when
        ``combo_func='gmean'`` and a compared group contains a zero value.

    Raises
    ------
    ValueError
        If ``combo_func`` or ``test_type`` is not one of the supported names.
    """
    # Kept function-local, as in the original cell, so the definition does not
    # rely on imports made in other cells.
    import itertools
    from scipy.stats import gmean
    from scipy.stats import ttest_ind
    from scipy.stats import mannwhitneyu

    # Row-wise reducers: every row of a group's matrix becomes one number.
    combiners = {
        'mean': lambda values: np.mean(values, axis=1),
        'median': lambda values: np.median(values, axis=1),
        'gmean': lambda values: gmean(values, axis=1),
    }
    if combo_func not in combiners:
        raise ValueError(
            "combo_func must be one of {}, got {!r}".format(sorted(combiners), combo_func))
    if test_type not in ('student_t', 'mann_u'):
        raise ValueError(
            "test_type must be 'student_t' or 'mann_u', got {!r}".format(test_type))

    # Read the table named by the `sample` argument (the previous code read an
    # undefined `sample_file` name and only worked with leftover kernel state).
    samples = pd.read_csv(sample, sep=',', header=0, index_col=0, dtype=str)
    groups = np.unique(samples['condition'])
    print('Found condition groups', groups)

    # Each group takes part in many pairs, so its expression matrix and its
    # combined per-row value are computed once and reused.
    expression = {}
    combined = {}
    out = dict()
    for (i, j) in itertools.product(groups, groups):
        if i == j:
            continue
        print('calculating degs for ({},{})'.format(i,j))

        # Expression values of all samples belonging to each condition.
        for group in (i, j):
            if group not in expression:
                group_samples = samples[samples['condition'] == group].index
                expression[group] = mapped_omics[group_samples].astype('float64').values
        geneexp_i = expression[i]
        geneexp_j = expression[j]

        # The geometric mean is refused as soon as any value is zero.
        if combo_func == 'gmean' and (
                np.count_nonzero(geneexp_i) != geneexp_i.size
                or np.count_nonzero(geneexp_j) != geneexp_j.size):
            print('Genes with zero counts encountered. Cannot calculate geometric mean')
            return None

        for group in (i, j):
            if group not in combined:
                combined[group] = combiners[combo_func](expression[group])

        # Log fold change of group j relative to group i.
        log2fc = np.log2(combined[j] / combined[i])

        # Hypothesis tests, one per row.
        n_genes = geneexp_i.shape[0]
        if test_type == 'student_t':
            # A single vectorised call over axis=1 replaces the per-row loop.
            statistic, pvalue = ttest_ind(geneexp_i, geneexp_j, axis=1, equal_var = False)
            test_statistic = np.asarray(statistic, dtype=float)
            pval = np.asarray(pvalue, dtype=float)
        else:
            test_statistic = np.zeros(n_genes)
            pval = np.zeros(n_genes)
            for idx in range(n_genes):
                # `alternative` is stated explicitly so the p-value is
                # two-sided regardless of the installed SciPy's default.
                test_statistic[idx], pval[idx] = mannwhitneyu(
                    geneexp_i[idx,:], geneexp_j[idx,:], alternative='two-sided')

        out[(i,j)] = pd.DataFrame({'genes': mapped_omics.genes,
                                   'statistic': test_statistic,
                                   'pvalue': pval,
                                   'log2fc': log2fc})
    return out

In [89]:

# NOTE(review): `temp` is not defined in any cell visible here -- it appears to
# be leftover kernel state; confirm which DEG result it is meant to hold.
pneumonia_pvalues = temp[('healthy_ctrl', 'Pneumonia')].pvalue
# `l` is kept as the name because the following cell reads `l[0]`.
l = fdr(pneumonia_pvalues, alpha=0.001)

In [90]:
# Count the non-zero entries of the first element returned by the FDR call.
rejected_mask = l[0]
np.count_nonzero(rejected_mask)

3350

## GSE157240 - Respiratory Disease

Each condition is compared with healthy controls, and statistically significant genes are extracted after applying FDR correction with alpha = 0.0001 (note: the gene-selection cell further below currently passes alpha = 0.01).

Alpha is set low because a very large number of hypothesis tests is conducted (approximately 20,000).

In [91]:
# Expression table is loaded into memory; the metadata is handed over as a
# file path because calculate_degs reads that table itself.
resp_meta = '{}respiratory_metadata.csv'.format(path)
resp_df = pd.read_csv('{}respiratory_data.csv'.format(path))
resp_degs = calculate_degs(resp_df, resp_meta, test_type='student_t')

Found condition groups ['Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus'
 'Dengue' 'Influenza' 'Parainfluenza_RespiratorySyncytial' 'Pneumonia'
 'Rhinovirus' 'healthy_ctrl']
calculating degs for (Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus,Dengue)
calculating degs for (Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus,Influenza)
calculating degs for (Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus,Parainfluenza_RespiratorySyncytial)
calculating degs for (Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus,Pneumonia)
calculating degs for (Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus,Rhinovirus)
calculating degs for (Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus,healthy_ctrl)
calculating degs for (Dengue,Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus)
calculating degs for (Dengue,Influenza)
calculating degs for (Dengue,Parainfluenza_

In [134]:
# Save the DEG results to disk; the context manager ensures the file handle
# is flushed and closed even if pickling fails part-way.
with open(path+'respiratory_conditions_degs.pkl','wb') as degs_file:
    pickle.dump(resp_degs, degs_file)

In [41]:
# Load the COVID table from the same data folder.
covid_df = pd.read_csv('{}covid_data.csv'.format(path))

In [38]:
# Keep only the comparisons whose first group is the healthy control.
keys_oi = [x for x in resp_degs.keys() if x[0] == 'healthy_ctrl']
if not keys_oi:
    # Without this guard the intersection below fails with an opaque TypeError.
    raise ValueError("no ('healthy_ctrl', <condition>) comparisons found in resp_degs")

genes_oi = []
for key in keys_oi:
    # First element of the FDR result: boolean mask of rejected hypotheses.
    significant_idx = np.asarray(fdr(resp_degs[key].pvalue, alpha = 0.01)[0], dtype=bool)
    # The mask is applied positionally, so this does not depend on the row
    # labels of resp_df.
    genes_oi.append(set(resp_df['genes'].values[significant_idx]))

# Genes significant in every comparison and also present in the COVID table.
genes_oi = set.intersection(*genes_oi).intersection(covid_df['genes'])
# Sorted so the saved list is identical between runs; iterating a set of
# strings gives no stable order across interpreter sessions.
genes_oi = sorted(genes_oi, key=str)

In [40]:
# Write the selected genes as a one-column CSV without the row index.
genes_oi_series = pd.Series(genes_oi)
genes_oi_series.to_csv('{}genes_of_interest.csv'.format(path), index=False)