In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import mygene

In [132]:
# Base folder for every file read or written below: the kernel's current working directory.
path = os.getcwd()

In [154]:
# Display the names of the entries in the 'raw data' folder under the working directory.
os.listdir(path+'/raw data')

['GSE157240_series_matrix.txt',
 'GSE157240_sbst1_norm_RVI_Tsalik_020420.txt',
 'GSE172114_series_matrix.txt',
 'GSE172114_rsem_gene_count_matrix_TMM_69samples.csv']

# Data Transformation

## GSE157240 - Respiratory Disease

In [149]:
# Build the respiratory (GSE157240) expression table and save it as CSV.
normalized_dir = path + '/normalized data'
# DataFrame.to_csv does not create missing folders, so make sure the target exists
# before the first write (keeps Restart & Run All working on a fresh checkout).
os.makedirs(normalized_dir, exist_ok=True)

resp_df = (
    pd.read_csv(path+'/raw data/GSE157240_sbst1_norm_RVI_Tsalik_020420.txt', sep=' ')
    .rename(columns={"gene_symbol": "genes"})
    # Keep the gene column plus every column whose name contains "DU".
    .pipe(lambda frame: frame[['genes'] + [name for name in frame.columns if "DU" in name]])
)
resp_df.to_csv(normalized_dir + '/respiratory_data.csv', index=False)

In [139]:
# Build the respiratory (GSE157240) sample metadata table and save it as CSV.
normalized_dir = path + '/normalized data'
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(normalized_dir, exist_ok=True)

# The file is read without a header and transposed, so each original row becomes a
# column and the first transposed row carries the field names.
resp_meta = pd.read_csv(path+'/raw data/GSE157240_series_matrix.txt', delimiter='\t', header=None).T
# Assign the names as a flat list. Passing a list that holds a single array
# (what `list(df.iloc[[0]].values)` produces) makes pandas build a one-level
# MultiIndex, which previously had to be undone by a CSV write/re-read.
resp_meta.columns = resp_meta.iloc[0].tolist()
resp_meta = resp_meta.drop(index=0)

# Drop every field between the second and the second-to-last by label, then cut
# the trailing column by position, and give the survivors short names.
resp_meta = (
    resp_meta
    .drop(columns=resp_meta.columns.to_list()[2:-2])
    .iloc[:, :-1]
    .rename(columns={'Sample_title': 'sample',
                     'Sample_geo_accession': 'geo_sample',
                     'Sample_characteristics_ch1': 'condition'})
    # Row 0 was removed above; restart the index at 0.
    .reset_index(drop=True)
)

# Keep only the text that follows "status: " in each condition value.
# A value without that prefix raises IndexError, so malformed rows fail loudly.
resp_meta['condition'] = resp_meta['condition'].map(lambda value: value.split("status: ")[1])
resp_meta.to_csv(normalized_dir + '/respiratory_metadata.csv', index=False)

In [191]:
# Replace the verbose condition descriptions with short labels.
# Each description appears once in the table (the Dengue rule used to be applied twice).
# Values without an entry here are left unchanged.
# NOTE(review): 'Pn' is kept as the label for Metapneumovirus — confirm it is the intended name.
condition_labels = {
    'DNA virus infected patient (Adenovirus, Cytomegalovirus, Ebstein-Barr virus, Herpes Simplex virus)':
        'Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus',
    'Dengue virus infected patient': 'Dengue',
    'Entero/Rhinovirus infected patient': 'Rhinovirus',
    'Influenza virus infected patient': 'Influenza',
    'Metapneumovirus infected patient': 'Pn',
}
# Exact-match replacement of whole cell values; no key is also a target label,
# so the result does not depend on the order of the entries.
resp_meta['condition'] = resp_meta['condition'].replace(condition_labels)

In [192]:
np.unique(resp_meta.condition)

array(['Adenovirus_Cytomegalovirus_Ebstein-Barr virus_Herpes Simplex virus',
       'Dengue virus infected patient',
       'Entero/Rhinovirus infected patient',
       'Influenza virus infected patient',
       'Metapneumovirus infected patient', 'healthy_ctrl',
       'other respiratory RNA virus infected patient (Parainfluenza virus and Respiratory Syncytial virus)'],
      dtype=object)

## GSE172114 - COVID19

In [187]:
# Build the COVID-19 (GSE172114) expression table and save it as CSV.
normalized_dir = path + '/normalized data'
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(normalized_dir, exist_ok=True)

# The first CSV column has no header, so pandas names it "Unnamed: 0".
covid_df = pd.read_csv(path+'/raw data/GSE172114_rsem_gene_count_matrix_TMM_69samples.csv')
covid_df = covid_df.rename(columns={"Unnamed: 0": "genes"})

# Keep everything after the FIRST underscore of each row label.
# Assumes labels look like "<id>_<symbol>" — TODO confirm against the raw file.
# Splitting only once keeps labels whose second part itself contains underscores
# intact ("A_B_C" -> "B_C"); a plain split("_")[1] would cut that down to "B".
covid_df['genes'] = covid_df['genes'].map(lambda gene_label: gene_label.split("_", 1)[1])
covid_df.to_csv(normalized_dir + '/covid_data.csv', index=False)

In [185]:
# Build the COVID-19 (GSE172114) sample metadata table and save it as CSV.
normalized_dir = path + '/normalized data'
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(normalized_dir, exist_ok=True)
metadata_csv = normalized_dir + '/covid_metadata.csv'

# The file is read without a header and transposed, so each original row becomes a
# column and the first transposed row carries the field names.
covid_meta = pd.read_csv(path+'/raw data/GSE172114_series_matrix.txt', delimiter="\t", header=None).T
covid_meta.columns = list(covid_meta.iloc[[0]].values)

# Remove the row that held the field names, cut the trailing column by position,
# and give the fields used later short names.
covid_meta = (
    covid_meta
    .drop(index=0)
    .iloc[:, :-1]
    .rename(columns={'Sample_title': 'sample',
                     'Sample_geo_accession': 'geo_sample',
                     'Sample_characteristics_ch1': 'condition'})
)

# The table is written and read back before the condition column is edited.
# NOTE(review): the re-read presumably normalises the header (read_csv returns a flat
# column index and renames repeated column names) — confirm before removing it.
covid_meta.to_csv(metadata_csv, index=False)
covid_meta = pd.read_csv(metadata_csv)

# Keep only the text that follows the first ": " in each condition value.
# A value without ": " raises IndexError, so malformed rows fail loudly.
covid_meta['condition'] = covid_meta['condition'].map(lambda value: value.split(": ")[1])
covid_meta.to_csv(metadata_csv, index=False)