In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import openpyxl

In [2]:
def set_up_structure_for_output_files():
    datasets = pd.read_table('../data/UR_analysis/file_names.txt', header = None)
    URs = pd.read_table('../data/UR_analysis/URs.txt', header = None)[0]
    activity = pd.read_table('../data/UR_analysis/activity.txt', header = None)
    Datasets_Inf = pd.read_table('../data/UR_analysis/Datasets_Inf.txt', sep = ',', header = None)
    Datasets_Noninf = pd.read_table('../data/UR_analysis/Datasets_Noninf.txt', sep = ',', header = None)
    
    activity.loc[21] = 'duplicate'
    Datasets_Inf.index = URs
    Datasets_Inf.columns = np.array(datasets)[activity == 'yes']
    Datasets_Noninf.index = URs
    Datasets_Noninf.columns = np.array(datasets)[activity == 'no']
    return Datasets_Inf, Datasets_Noninf


In [3]:
def preprocess_the_input(URs_all_diseases):
    # Subset to only P1 and only those URs that are significant in at least 1 disease
    URs_all_diseases = URs_all_diseases[URs_all_diseases['SP'] == '1.6']
    IMID_count = list()
    for i in range(len(URs_all_diseases)):
        IMID_count.append(sum(URs_all_diseases.iloc[i,8:] < 0.05))
    URs_all_diseases = URs_all_diseases[np.array(IMID_count) > 0]

    URs_all_diseases = URs_all_diseases.sort_values(by = 'UC_active')
    return URs_all_diseases


In [4]:
def logFC_analysis(Datasets_Inf, Datasets_Noninf, path_DEGs):
    ## logFC_analysis
    translation = pd.read_table('../data/UR_analysis/gene_info.txt', sep = ',')
    URs = pd.read_table('../data/UR_analysis/URs.txt', header = None)[0]
    i = 0
    logFC_list = list()
    for dataset in Datasets_Inf.columns:
        data = pd.read_table(path_DEGs + dataset)
        if i in (6,12,15,16):
            if i == 15:
                data = data.rename(columns = {'ORF': 'ENTREZ_GENE_ID'})
            trans = translation[translation['Symbol'].isin(URs)][['GeneID','Symbol']]
            data = trans.merge(data, left_on = 'GeneID', right_on ='ENTREZ_GENE_ID')[['Symbol', 'logFC']]
            data = data.rename(columns = {'Symbol': 'Gene.symbol'})
        else:
            if i in (7,20):
                data = data.rename(columns = {'ORF': 'Gene.symbol'})
            if i == 8:
                data = data.rename(columns = {'Gene.Symbol': 'Gene.symbol'})
            if i == 9:
                data = data.rename(columns = {'GENE_SYMBOL': 'Gene.symbol'})
            if i == 13:
                data = data.rename(columns = {'ID': 'Gene.symbol'})
            data = data[data['adj.P.Val'] < 0.05]
            data = data[data['Gene.symbol'].isin(URs)].drop_duplicates('Gene.symbol')[['Gene.symbol', 'logFC']]

        data = data.rename(columns = {'logFC': dataset})
        data.index = data['Gene.symbol']
        data.pop('Gene.symbol')    
        logFC_list.append(data)
        i = i+1

    logFC_Inf = pd.concat(logFC_list, axis = 1)

    i = 0
    logFC_list = list()
    for dataset in Datasets_Noninf.columns:

        data = pd.read_table(path_DEGs + dataset)
        if i == 3:
            data = data.rename(columns = {'ORF': 'Gene.symbol'})
        if i == 8:
            data = data.rename(columns = {'GENE_NAME': 'Gene.symbol'})
        data = data[data['adj.P.Val'] < 0.05]
        data = data[data['Gene.symbol'].isin(URs)].drop_duplicates('Gene.symbol')[['Gene.symbol', 'logFC']]
        data = data.rename(columns = {'logFC': dataset})
        data.index = data['Gene.symbol']
        data.pop('Gene.symbol')
        logFC_list.append(data)
        i = i+1   
    logFC_Noninf = pd.concat(logFC_list, axis = 1)


    Datasets_Noninf_logFC = Datasets_Noninf.copy()
    Datasets_Inf_logFC = Datasets_Inf.copy()

    i = 0
    for dataset in Datasets_Noninf_logFC.columns:
        Datasets_Noninf_logFC.loc[Datasets_Noninf_logFC[dataset]>0,dataset] =\
        logFC_Noninf[Datasets_Noninf_logFC[dataset]>0][dataset]

    for dataset in Datasets_Inf_logFC.columns:
        if i != 12:
            Datasets_Inf_logFC.loc[Datasets_Inf_logFC[dataset]>0,dataset] =\
            logFC_Inf[Datasets_Inf_logFC[dataset]>0][dataset]
        i = i+1



    summary_logFC = pd.concat((Datasets_Inf_logFC, Datasets_Noninf_logFC), axis = 1)
    summary_logFC = summary_logFC.fillna(0)
    summary_logFC = summary_logFC[summary_logFC.sum(axis = 1) != 0]
    summary_logFC = summary_logFC.loc[summary_logFC.index.isin(['AR', 'ESR2', 'FAS', 'IFNG', 'IL1A', 'IL1B', 'TLR3', 'TNF'])]

    return summary_logFC

In [5]:
def zScore_analysis(Datasets_Inf, Datasets_Noninf, path_z_scores):
    ## z score analysis
    z_scores_Inf = list()
    i = 0
    for dataset in Datasets_Inf.columns:
        if i != 12:
            a = pd.read_table(path_z_scores + dataset, sep = ',', index_col = 0)    
            a.columns = [dataset]
            z_scores_Inf.append(a)
        i = i+1
    z_scores_Inf = pd.concat(z_scores_Inf, axis = 1)


    z_scores_Noninf = list()
    for dataset in Datasets_Noninf.columns:
        a = pd.read_table(path_z_scores + dataset, sep = ',', index_col = 0)    
        a.columns = [dataset]
        z_scores_Noninf.append(a)
    z_scores_Noninf = pd.concat(z_scores_Noninf, axis = 1)



    Datasets_Noninf_z_score = Datasets_Noninf.copy()
    Datasets_Inf_z_score = Datasets_Inf.copy()
    i = 0
    for dataset in Datasets_Noninf_z_score.columns:
        Datasets_Noninf_z_score.loc[Datasets_Noninf_z_score[dataset]>0,dataset] =\
        z_scores_Noninf[Datasets_Noninf_z_score[dataset]>0][dataset]

    for dataset in Datasets_Inf_z_score.columns:
        if i != 12:
            Datasets_Inf_z_score.loc[Datasets_Inf_z_score[dataset]>0,dataset] =\
            z_scores_Inf[Datasets_Inf_z_score[dataset]>0][dataset]
        i = i+1

    summary_zScores = pd.concat((Datasets_Inf_z_score, Datasets_Noninf_z_score), axis = 1)
    summary_zScores = summary_zScores.fillna(0)
    summary_zScores = summary_zScores[summary_zScores.sum(axis = 1) != 0]
    summary_zScores = summary_zScores.loc[summary_zScores.index.isin(['AR', 'ESR2', 'FAS', 'IFNG', 'IL1A', 'IL1B', 'TLR3', 'TNF'])]
    
    return summary_zScores

In [6]:
## Input
URs_all_diseases = pd.read_table('../data/UR_analysis/UR_predictions_IMIDs_disease_Pvals.txt', sep = ',')
path_DEGs = '../data/AllDEGfilesMovedToOneFolder/'
path_z_scores = '../data/UR_analysis/z_scores/ '

In [7]:
## Output paths
path_Data_S15 = '../data/UR_analysis/Data S15.xlsx'
path_URs_logFC = '../data/UR_analysis/UR_IMID_summary_logFC.csv'
path_URs_zScore = '../data/UR_analysis/UR_IMID_summary_z.csv'

In [8]:
## Preprocess the data

URs_all_diseases = preprocess_the_input(URs_all_diseases)


In [9]:

Datasets_Inf, Datasets_Noninf = set_up_structure_for_output_files()

In [10]:
## logFC_analysis

summary_logFC = logFC_analysis(Datasets_Inf, Datasets_Noninf, path_DEGs)
#summary_logFC.to_csv(path_URs_logFC)
summary_logFC

  logFC_Noninf[Datasets_Noninf_logFC[dataset]>0][dataset]
  logFC_Inf[Datasets_Inf_logFC[dataset]>0][dataset]


Unnamed: 0_level_0,GSE16161_Skin AD 9 vs contol 9.csv,GSE32924_Skin_AD 13 VS_control8.csv,GSE16879_colon_CD.csv,GSE179285_ascending descending colon_CD.csv,GSE16879_ileum_cd.csv,GSE179285_Terminal ileum_CD.csv,GSE81071_DLE_vs_control.csv,GSE148810_juvenile myositis_skin_1.csv,GSE32591_glomer_vs_contol_LN.csv,GSE181318_skin_psoriatic 3 vs control3.csv,...,GSE112943_subacute cutaneous lupus.csv,GSE32924_uninflamed 12 vs HC 8_AD.csv,GSE179285_unflamed ascending descending colon 72 vs control 12_CD.csv,GSE75214_inactive_vs_normal_CD_16_11_ileum.csv,GSE148810_Nonlesional skin 6 vs HC 8_JM.csv,GSE14905_psoriasis_non lesion skin 28 vs control 21.csv,GSE75214_inactive_vs_normal_23_11_UC_colon.csv,GSE11223_Uninflamed 66 vs HC 69_UC.csv,GSE179285_inactive 32 vs contol 31_UC.csv,GSE66413_Pancreatic lymph nodes 13_ T1D vs healthy 3.csv
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AR,-2.59,-0.895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.14385,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
ESR2,0.335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.085699,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
FAS,0.0729,0.0,0.0,0.0,0.0,0.372,0.535947,0.0,0.459452,0.0,...,0.0,0.0,0.0,0.324532,0.0,0.0,0.0,0,0.0,0.0
IFNG,0.0773,0.0,0.0,0.892278,2.676384,0.0,0.705981,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
IL1A,-0.584,0.0,3.505568,0.586541,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
IL1B,-1.22,0.0,3.769798,2.985478,3.329109,2.75,0.27716,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.187414,0,0.0,0.0
TLR3,0.421,0.0,0.0,0.0,0.0,0.0,0.981494,0.0,0.43672,0.0,...,0.764861,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
TNF,-0.262,0.0,0.973007,1.05167,0.0,0.0,0.0136,0.0,0.0,1.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.232489,0,0.0,0.0


In [11]:
## z score analysis

summary_zScores = zScore_analysis(Datasets_Inf, Datasets_Noninf, path_z_scores)
#summary_zScores.to_csv(path_URs_zScores)
summary_zScores

  z_scores_Noninf[Datasets_Noninf_z_score[dataset]>0][dataset]
  z_scores_Inf[Datasets_Inf_z_score[dataset]>0][dataset]


Unnamed: 0_level_0,GSE16161_Skin AD 9 vs contol 9.csv,GSE32924_Skin_AD 13 VS_control8.csv,GSE16879_colon_CD.csv,GSE179285_ascending descending colon_CD.csv,GSE16879_ileum_cd.csv,GSE179285_Terminal ileum_CD.csv,GSE81071_DLE_vs_control.csv,GSE148810_juvenile myositis_skin_1.csv,GSE32591_glomer_vs_contol_LN.csv,GSE181318_skin_psoriatic 3 vs control3.csv,...,GSE112943_subacute cutaneous lupus.csv,GSE32924_uninflamed 12 vs HC 8_AD.csv,GSE179285_unflamed ascending descending colon 72 vs control 12_CD.csv,GSE75214_inactive_vs_normal_CD_16_11_ileum.csv,GSE148810_Nonlesional skin 6 vs HC 8_JM.csv,GSE14905_psoriasis_non lesion skin 28 vs control 21.csv,GSE75214_inactive_vs_normal_23_11_UC_colon.csv,GSE11223_Uninflamed 66 vs HC 69_UC.csv,GSE179285_inactive 32 vs contol 31_UC.csv,GSE66413_Pancreatic lymph nodes 13_ T1D vs healthy 3.csv
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AR,-0.271,-0.945,1.46,2.88,0.225,2.175,0.0,0.0,-2.266,1.055,...,-2.443,0.0,0.0,2.09,0.0,-1.523,1.736,0,0.0,0.0
ESR2,1.202,0.403,0.01,0.456,0.598,1.959,0.0,0.0,-2.238,2.68,...,0.0,0.418,0.0,0.386,0.0,-2.626,0.58,0,0.0,0.0
FAS,2.429,1.794,0.93,-0.145,1.275,0.488,0.0,1.131,1.756,2.454,...,0.713,0.0,0.0,-0.38,0.0,2.067,-1.587,0,0.0,0.0
IFNG,5.833,5.635,8.45,9.68,5.023,7.89,4.424,7.433,6.974,10.957,...,8.473,0.0,0.0,3.936,0.0,-3.844,4.32,0,0.0,0.0
IL1A,3.339,0.0,5.99,6.807,4.861,4.401,0.0,0.0,-0.303,6.145,...,3.354,0.0,0.0,3.993,0.0,-5.195,4.622,0,2.791,0.0
IL1B,2.448,0.0,7.397,7.352,5.1,5.348,1.997,2.411,0.861,8.413,...,0.0,0.0,0.0,4.462,0.0,0.0,5.109,0,2.398,0.0
TLR3,3.498,0.0,3.326,3.474,1.709,0.0,0.0,2.219,2.119,5.66,...,2.227,0.0,0.0,0.0,0.0,0.0,3.805,0,0.0,0.0
TNF,4.915,5.59,8.744,9.632,7.611,6.497,2.89,3.874,1.524,11.197,...,6.435,0.0,0.0,4.838,0.0,-6.93,3.753,0,2.852,0.0
