In [1]:
import gzip
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
# Variant-disease association table (gzip-compressed, tab-separated); the cells
# below read its 'diseaseName' and 'snpId' columns.
variant_disease = pd.read_csv(
    "../SAR/all_variant_disease_associations.tsv.gz",
    sep='\t',
    compression='gzip',
)

In [3]:
# Find GWAS genes in the database.
# Exploratory pass: collect every row whose disease name mentions "rheumatoid" or
# "Rheumatoid" so the candidate names can be reviewed before narrowing the filter.
# A single vectorised match replaces the row-by-row loop and cannot record the
# same row twice; rows with a missing name are skipped.
RA = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Rr]heumatoid', regex=True, na=False)
].tolist()

    

In [4]:
# Distinct disease names captured by the current RA selection.
np.unique(variant_disease.loc[RA, 'diseaseName'])

array(['Early Rheumatoid Arthritis', 'Juvenile rheumatoid arthritis',
       'Polyarticular Juvenile Idiopathic Arthritis, Rheumatoid Factor Negative',
       'Progression of rheumatoid arthritis',
       'Progressive pseudorheumatoid dysplasia', 'Rheumatoid Arthritis',
       'Rheumatoid Arthritis, Systemic Juvenile',
       'Rheumatoid Factor Measurement', 'Rheumatoid Nodule',
       'Seropositive rheumatoid arthritis'], dtype=object)

In [5]:
# Narrow the selection: keep names containing "Rheumatoid Arthritis" but drop the
# juvenile and early forms (names containing "Juvenile" or "Early").
# Vectorised boolean masks replace the row-by-row loop; rows with a missing name
# are skipped.
RA = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Rheumatoid Arthritis', regex=False, na=False)
    & ~variant_disease['diseaseName'].str.contains('Juvenile', regex=False, na=False)
    & ~variant_disease['diseaseName'].str.contains('Early', regex=False, na=False)
].tolist()

In [6]:
# Confirm that only the intended disease names remain in the RA selection.
np.unique(variant_disease.loc[RA, 'diseaseName'])

array(['Rheumatoid Arthritis'], dtype=object)

In [7]:
# Map the selected RA rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[RA, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test. The original loop rebuilt np.array(snpId) and scanned
# it once per mapping row, which is quadratic in the table sizes. Missing SNP ids
# are dropped so that a missing value never counts as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()


In [8]:
# Unique gene symbols mapped from the selected RA variants: the RA GWAS gene set.
GWAS_RA = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_RA)

777

In [9]:
# Display the RA GWAS gene symbols.
GWAS_RA

array(['AARS2', 'ABCB1', 'ABCC4', 'ABCF1', 'ABCG2', 'ACOXL', 'ACR',
       'ACTA2', 'ADAD1', 'ADAMTS12', 'ADH7', 'ADIPOQ', 'ADIPOQ-AS1',
       'ADIPOR1', 'ADIPOR2', 'ADORA2A', 'ADORA2A-AS1', 'AFF1', 'AFF3',
       'AGBL2', 'AGER', 'AGPAT1', 'AHR', 'AHRR', 'AHSA1', 'AIF1', 'AIFM2',
       'AIRE', 'ALPK1', 'AMPD1', 'ANGPT2', 'ANKRD55', 'ANO8', 'ANXA3',
       'AP4B1-AS1', 'APOC1', 'APOM', 'ARHGEF7', 'ARID5B', 'ARL15',
       'ARPC1A', 'ART3', 'ASB15', 'ASB15-AS1', 'ASCC1', 'ATAT1', 'ATF6B',
       'ATG5', 'ATIC', 'ATN1', 'ATP6V1G2', 'ATP6V1G2-DDX39B', 'B9D2',
       'BACH2', 'BAG6', 'BANK1', 'BCL2', 'BCL2L15', 'BLK', 'BLOC1S5',
       'BLOC1S5-TXNDC5', 'BRPF1', 'BTNL2', 'C1orf141', 'C2', 'C2-AS1',
       'C3orf67', 'C3orf67-AS1', 'C5', 'C5-OT1', 'C6orf15', 'C6orf47',
       'CA1', 'CAMK1', 'CARD8', 'CARD9', 'CASC15', 'CASP10', 'CASP5',
       'CASP7', 'CASTOR1', 'CCDC116', 'CCHCR1', 'CCL21', 'CCL27', 'CCL4',
       'CCN4', 'CCNG2', 'CCR3', 'CCR6', 'CD160', 'CD226', 'CD244',
       'CD24

In [10]:
# Find GWAS genes in the database.
# Exploratory pass: every row whose disease name mentions "Crohn" or "crohn".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
CD = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Cc]rohn', regex=True, na=False)
].tolist()

In [11]:
# Distinct disease names captured by the current CD selection.
np.unique(variant_disease.loc[CD, 'diseaseName'])

array(['Crohn Disease', "Crohn's disease of large bowel",
       "Gastrointestinal Crohn's disease", "Pediatric Crohn's disease",
       "Perianal Crohn's disease", "Ulcerative colitis or Crohn's"],
      dtype=object)

In [12]:
# Narrow the selection to names containing the literal "Crohn Disease".
# Vectorised substring match instead of a row-by-row loop; missing names are skipped.
CD = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Crohn Disease', regex=False, na=False)
].tolist()


In [13]:
# Confirm which disease names remain in the CD selection.
np.unique(variant_disease.loc[CD, 'diseaseName'])

array(['Crohn Disease'], dtype=object)

In [14]:
# Map the selected CD rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[CD, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [15]:
# Unique gene symbols mapped from the selected CD variants: the CD GWAS gene set.
GWAS_CD = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_CD)

515

In [16]:
# Find GWAS genes in the database.
# Exploratory pass: every row whose disease name mentions "lupus" or "Lupus".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
DLE = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Ll]upus', regex=True, na=False)
].tolist()

In [17]:
# Distinct disease names captured by the current DLE selection.
np.unique(variant_disease.loc[DLE, 'diseaseName'])

array(['Chilblain lupus 1', 'Chilblain lupus erythematosus',
       'Lupus Erythematosus', 'Lupus Erythematosus, Cutaneous',
       'Lupus Erythematosus, Discoid',
       'Lupus Erythematosus, Subacute Cutaneous',
       'Lupus Erythematosus, Systemic', 'Lupus Nephritis',
       'Lupus Vulgaris', 'Lupus anticoagulant disorder',
       'Neonatal Systemic lupus erythematosus',
       'Neuropsychiatric Systemic Lupus Erythematosus', 'lupus cutaneous'],
      dtype=object)

In [18]:
# Narrow the selection to names mentioning "Discoid" or "discoid".
# One vectorised match replaces the two independent `if`s, so a row can no longer
# be appended twice; rows with a missing name are skipped.
DLE = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Dd]iscoid', regex=True, na=False)
].tolist()


In [19]:
# Confirm which disease names remain in the DLE selection.
np.unique(variant_disease.loc[DLE, 'diseaseName'])

array(['Lupus Erythematosus, Discoid'], dtype=object)

In [20]:
# Map the selected DLE rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[DLE, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [21]:
# Unique gene symbols mapped from the selected DLE variants: the DLE GWAS gene set.
GWAS_DLE = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_DLE)

40

In [22]:
# Exploratory pass: every row whose disease name mentions Sjogren in any of the
# four spellings checked before (S/s, o/ö). One vectorised match replaces the
# four independent `if`s, so a row cannot be recorded more than once; rows with
# a missing name are skipped.
SjS = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Ss]j[oö]gren', regex=True, na=False)
].tolist()

In [23]:
# Distinct disease names captured by the current SjS selection.
np.unique(variant_disease.loc[SjS, 'diseaseName'])

array(['Marinesco-Sjogren syndrome', "Primary Sjögren's syndrome",
       "Sjogren's Syndrome", 'Sjogren-Larsson Syndrome'], dtype=object)

In [24]:
# Narrow the selection to the possessive form ("Sjogren's" / "Sjögren's"), which
# excludes the hyphenated syndromes seen in the exploratory listing.
# One vectorised match replaces the two `if`s; rows with a missing name are skipped.
SjS = variant_disease.index[
    variant_disease['diseaseName'].str.contains("Sj[oö]gren's", regex=True, na=False)
].tolist()


In [25]:
# Confirm which disease names remain in the SjS selection.
np.unique(variant_disease.loc[SjS, 'diseaseName'])

array(["Primary Sjögren's syndrome", "Sjogren's Syndrome"], dtype=object)

In [26]:
# Map the selected SjS rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[SjS, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [27]:
# Unique gene symbols mapped from the selected SjS variants: the SjS GWAS gene set.
GWAS_SjS = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_SjS)

49

In [28]:
# Exploratory pass: every row whose disease name mentions "Diabetes" or "diabetes".
# One vectorised match replaces the row-by-row loop and cannot record a row twice
# (several names contain both spellings); rows with a missing name are skipped.
Diabetes = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Dd]iabetes', regex=True, na=False)
].tolist()


In [29]:
# Distinct disease names captured by the current Diabetes selection.
np.unique(variant_disease.loc[Diabetes, 'diseaseName'])

array(['Autosomal Dominant Neurohypophyseal Diabetes Insipidus',
       'Central Diabetes Insipidus', 'Complications of Diabetes Mellitus',
       'Congenital Nephrogenic Diabetes Insipidus',
       'Developmental Delay, Epilepsy, and Neonatal Diabetes', 'Diabetes',
       'Diabetes Insipidus', 'Diabetes Mellitus',
       'Diabetes Mellitus, Insulin-Dependent',
       'Diabetes Mellitus, Insulin-Resistant, with Acanthosis Nigricans',
       'Diabetes Mellitus, Ketosis-Prone',
       'Diabetes Mellitus, Neonatal, with Congenital Hypothyroidism',
       'Diabetes Mellitus, Non-Insulin-Dependent',
       'Diabetes Mellitus, Transient Neonatal, 1', 'Diabetes in children',
       'Diabetes in youth',
       'Diabetes mellitus autosomal dominant type II (disorder)',
       'Diabetes, Autoimmune',
       'Diabetes-deafness syndrome maternally transmitted (disorder)',
       'Fibrocalculous pancreatic diabetes', 'Gestational Diabetes',
       'Insulin-dependent diabetes mellitus secretory diar

In [30]:
# Narrow the selection to names containing any of the four substrings below.
# The masks are OR-ed together, so a row matching more than one term is kept
# once; rows with a missing name are skipped.
DIABETES_TERMS = (
    'Diabetes Mellitus, Insulin-Dependent',
    'Diabetes Mellitus, Ketosis-Prone',
    'Diabetes, Autoimmune',
    'Neonatal insulin-dependent diabetes mellitus',
)
Diabetes = variant_disease.index[
    np.logical_or.reduce([
        variant_disease['diseaseName'].str.contains(term, regex=False, na=False)
        for term in DIABETES_TERMS
    ])
].tolist()

In [31]:
# Confirm which disease names remain in the Diabetes selection.
np.unique(variant_disease.loc[Diabetes, 'diseaseName'])

array(['Diabetes Mellitus, Insulin-Dependent',
       'Diabetes Mellitus, Ketosis-Prone', 'Diabetes, Autoimmune',
       'Neonatal insulin-dependent diabetes mellitus'], dtype=object)

In [32]:
# Map the selected Diabetes rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[Diabetes, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [33]:
# Unique gene symbols mapped from the selected Diabetes variants: the Diabetes GWAS gene set.
GWAS_Diabetes = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_Diabetes)

485

In [34]:
# Exploratory pass: every row whose disease name mentions "Atopic" or "atopic".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
AD = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Aa]topic', regex=True, na=False)
].tolist()


In [35]:
# Distinct disease names captured by the current AD selection.
np.unique(variant_disease.loc[AD, 'diseaseName'])

array(['Adult atopic dermatitis', 'Atopic IgE-mediated allergic disorder',
       'Atopic disorders', 'Dermatitis, Atopic', 'Dermatitis, Atopic, 2'],
      dtype=object)

In [36]:
# Narrow the selection to names containing the literal "Dermatitis, Atopic".
# Vectorised substring match instead of a row-by-row loop; missing names are skipped.
AD = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Dermatitis, Atopic', regex=False, na=False)
].tolist()


In [37]:
# Confirm which disease names remain in the AD selection.
np.unique(variant_disease.loc[AD, 'diseaseName'])

array(['Dermatitis, Atopic', 'Dermatitis, Atopic, 2'], dtype=object)

In [38]:
# Map the selected AD rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[AD, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [39]:
# Unique gene symbols mapped from the selected AD variants: the AD GWAS gene set.
GWAS_AD = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_AD)

143

In [40]:
# Exploratory pass: every row whose disease name mentions "Lupus" or "lupus".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
COL = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Ll]upus', regex=True, na=False)
].tolist()

In [41]:
# Distinct disease names captured by the current COL selection.
np.unique(variant_disease.loc[COL, 'diseaseName'])

array(['Chilblain lupus 1', 'Chilblain lupus erythematosus',
       'Lupus Erythematosus', 'Lupus Erythematosus, Cutaneous',
       'Lupus Erythematosus, Discoid',
       'Lupus Erythematosus, Subacute Cutaneous',
       'Lupus Erythematosus, Systemic', 'Lupus Nephritis',
       'Lupus Vulgaris', 'Lupus anticoagulant disorder',
       'Neonatal Systemic lupus erythematosus',
       'Neuropsychiatric Systemic Lupus Erythematosus', 'lupus cutaneous'],
      dtype=object)

In [42]:
# Narrow the selection to names containing either of the two substrings below.
# The masks are OR-ed, so a row matching both is kept once; rows with a missing
# name are skipped.
COL = variant_disease.index[
    variant_disease['diseaseName'].str.contains(
        'Neonatal Systemic lupus erythematosus', regex=False, na=False)
    | variant_disease['diseaseName'].str.contains(
        'Neuropsychiatric Systemic Lupus Erythematosus', regex=False, na=False)
].tolist()

In [43]:
# Confirm which disease names remain in the COL selection.
np.unique(variant_disease.loc[COL, 'diseaseName'])

array(['Neonatal Systemic lupus erythematosus',
       'Neuropsychiatric Systemic Lupus Erythematosus'], dtype=object)

In [44]:
# Map the selected COL rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[COL, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [45]:
# Unique gene symbols mapped from the selected COL variants: the COL GWAS gene set.
GWAS_COL = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_COL)

3

In [46]:
# Exploratory pass: every row whose disease name mentions "Myositis" or "myositis".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
JM = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Mm]yositis', regex=True, na=False)
].tolist()

In [47]:
# Distinct disease names captured by the current JM selection.
np.unique(variant_disease.loc[JM, 'diseaseName'])

array(['Adult type dermatomyositis', 'Dermatomyositis',
       'Dermatomyositis, Childhood Type',
       'Eosinophilic myositis (disorder)',
       'Inclusion Body Myositis (disorder)', 'Myositis', 'Polymyositis'],
      dtype=object)

In [48]:
# Narrow the selection: keep names containing "Adult type dermatomyositis" or
# "Dermatomyositis", plus names containing "Myositis" that do not contain
# "disorder". The three conditions are OR-ed into one mask, so a row matching
# several of them is kept once; rows with a missing name are skipped.
JM = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Adult type dermatomyositis', regex=False, na=False)
    | variant_disease['diseaseName'].str.contains('Dermatomyositis', regex=False, na=False)
    | (
        variant_disease['diseaseName'].str.contains('Myositis', regex=False, na=False)
        & ~variant_disease['diseaseName'].str.contains('disorder', regex=False, na=False)
    )
].tolist()

In [49]:
# Confirm which disease names remain in the JM selection.
np.unique(variant_disease.loc[JM, 'diseaseName'])

array(['Adult type dermatomyositis', 'Dermatomyositis',
       'Dermatomyositis, Childhood Type', 'Myositis'], dtype=object)

In [50]:
# Map the selected JM rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[JM, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [51]:
# Unique gene symbols mapped from the selected JM variants: the JM GWAS gene set.
GWAS_JM = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_JM)

46

In [52]:
# Exploratory pass: every row whose disease name mentions "Lupus" or "lupus".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
lupus = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Ll]upus', regex=True, na=False)
].tolist()

In [53]:
# Distinct disease names captured by the current lupus selection.
np.unique(variant_disease.loc[lupus, 'diseaseName'])

array(['Chilblain lupus 1', 'Chilblain lupus erythematosus',
       'Lupus Erythematosus', 'Lupus Erythematosus, Cutaneous',
       'Lupus Erythematosus, Discoid',
       'Lupus Erythematosus, Subacute Cutaneous',
       'Lupus Erythematosus, Systemic', 'Lupus Nephritis',
       'Lupus Vulgaris', 'Lupus anticoagulant disorder',
       'Neonatal Systemic lupus erythematosus',
       'Neuropsychiatric Systemic Lupus Erythematosus', 'lupus cutaneous'],
      dtype=object)

In [54]:
# Narrow the selection to names containing the literal "Lupus Erythematosus"
# (case-sensitive, so lower-case "lupus erythematosus" names are excluded).
# Vectorised substring match instead of a row-by-row loop; missing names are skipped.
lupus = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Lupus Erythematosus', regex=False, na=False)
].tolist()


In [55]:
# Confirm which disease names remain in the lupus selection.
np.unique(variant_disease.loc[lupus, 'diseaseName'])

array(['Lupus Erythematosus', 'Lupus Erythematosus, Cutaneous',
       'Lupus Erythematosus, Discoid',
       'Lupus Erythematosus, Subacute Cutaneous',
       'Lupus Erythematosus, Systemic',
       'Neuropsychiatric Systemic Lupus Erythematosus'], dtype=object)

In [56]:
# Map the selected lupus rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[lupus, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [57]:
# Unique gene symbols mapped from the selected lupus variants: the lupus GWAS gene set.
GWAS_lupus = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_lupus)

626

In [58]:
# Exploratory pass: every row whose disease name mentions "Lupus" or "lupus".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
LN = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Ll]upus', regex=True, na=False)
].tolist()

In [59]:
# Distinct disease names captured by the current LN selection.
np.unique(variant_disease.loc[LN, 'diseaseName'])

array(['Chilblain lupus 1', 'Chilblain lupus erythematosus',
       'Lupus Erythematosus', 'Lupus Erythematosus, Cutaneous',
       'Lupus Erythematosus, Discoid',
       'Lupus Erythematosus, Subacute Cutaneous',
       'Lupus Erythematosus, Systemic', 'Lupus Nephritis',
       'Lupus Vulgaris', 'Lupus anticoagulant disorder',
       'Neonatal Systemic lupus erythematosus',
       'Neuropsychiatric Systemic Lupus Erythematosus', 'lupus cutaneous'],
      dtype=object)

In [60]:
# Narrow the selection to names containing the literal "Lupus Nephritis".
# Vectorised substring match instead of a row-by-row loop; missing names are skipped.
LN = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Lupus Nephritis', regex=False, na=False)
].tolist()


In [61]:
# Confirm which disease names remain in the LN selection.
np.unique(variant_disease.loc[LN, 'diseaseName'])

array(['Lupus Nephritis'], dtype=object)

In [62]:
# Map the selected LN rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[LN, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [63]:
# Unique gene symbols mapped from the selected LN variants: the LN GWAS gene set.
GWAS_LN = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_LN)

53

In [64]:
# Exploratory pass: every row whose disease name mentions "Psoriasis" or "psoriasis".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
P = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Pp]soriasis', regex=True, na=False)
].tolist()

In [65]:
# Distinct disease names captured by the current P selection.
np.unique(variant_disease.loc[P, 'diseaseName'])

array(['Chronic small plaque psoriasis',
       'Chronic stable plaque psoriasis', 'Familial psoriasis',
       'Generalized pustular psoriasis', 'Guttate psoriasis', 'Psoriasis',
       'Psoriasis vulgaris', 'Pustular psoriasis'], dtype=object)

In [66]:
# Narrow the selection: names containing capitalised "Psoriasis" but not "vulgaris".
# Vectorised boolean masks instead of a row-by-row loop; missing names are skipped.
P = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Psoriasis', regex=False, na=False)
    & ~variant_disease['diseaseName'].str.contains('vulgaris', regex=False, na=False)
].tolist()


In [67]:
# Confirm which disease names remain in the P selection.
np.unique(variant_disease.loc[P, 'diseaseName'])

array(['Psoriasis'], dtype=object)

In [68]:
# Map the selected P rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[P, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [69]:
# Unique gene symbols mapped from the selected P variants: the P GWAS gene set.
GWAS_P = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_P)

416

In [70]:
# Exploratory pass: every row whose disease name mentions "lupus" or "Lupus".
# One vectorised match replaces the row-by-row loop and cannot record a row twice;
# rows with a missing name are skipped.
SCLE = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Ll]upus', regex=True, na=False)
].tolist()

In [71]:
# Distinct disease names captured by the current SCLE selection.
np.unique(variant_disease.loc[SCLE, 'diseaseName'])

array(['Chilblain lupus 1', 'Chilblain lupus erythematosus',
       'Lupus Erythematosus', 'Lupus Erythematosus, Cutaneous',
       'Lupus Erythematosus, Discoid',
       'Lupus Erythematosus, Subacute Cutaneous',
       'Lupus Erythematosus, Systemic', 'Lupus Nephritis',
       'Lupus Vulgaris', 'Lupus anticoagulant disorder',
       'Neonatal Systemic lupus erythematosus',
       'Neuropsychiatric Systemic Lupus Erythematosus', 'lupus cutaneous'],
      dtype=object)

In [72]:
# Narrow the selection to names containing the literal
# "Lupus Erythematosus, Subacute Cutaneous".
# Vectorised substring match instead of a row-by-row loop; missing names are skipped.
SCLE = variant_disease.index[
    variant_disease['diseaseName'].str.contains(
        'Lupus Erythematosus, Subacute Cutaneous', regex=False, na=False)
].tolist()


In [73]:
# Confirm which disease names remain in the SCLE selection.
np.unique(variant_disease.loc[SCLE, 'diseaseName'])

array(['Lupus Erythematosus, Subacute Cutaneous'], dtype=object)

In [74]:
# Map the selected SCLE rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[SCLE, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [75]:
# Unique gene symbols mapped from the selected SCLE variants: the SCLE GWAS gene set.
GWAS_SCLE = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_SCLE)

1

In [76]:
# Exploratory pass: every row whose disease name mentions "scleroderma" or
# "Scleroderma". One vectorised match replaces the row-by-row loop and cannot
# record a row twice; rows with a missing name are skipped.
SyS = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Ss]cleroderma', regex=True, na=False)
].tolist()

In [77]:
# Distinct disease names captured by the current SyS selection.
np.unique(variant_disease.loc[SyS, 'diseaseName'])

array(['Diffuse Scleroderma', 'Scleroderma', 'Scleroderma, Limited',
       'Systemic Scleroderma'], dtype=object)

In [78]:
# Narrow the selection to names containing the literal "Systemic Scleroderma".
# Vectorised substring match instead of a row-by-row loop; missing names are skipped.
SyS = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Systemic Scleroderma', regex=False, na=False)
].tolist()

In [79]:
# Confirm which disease names remain in the SyS selection.
np.unique(variant_disease.loc[SyS, 'diseaseName'])

array(['Systemic Scleroderma'], dtype=object)

In [80]:
# Map the selected SyS rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[SyS, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [81]:
# Unique gene symbols mapped from the selected SyS variants: the SyS GWAS gene set.
GWAS_SyS = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_SyS)

171

In [82]:
# Exploratory pass: every row whose disease name mentions "Ulcerative" or
# "ulcerative". One vectorised match replaces the row-by-row loop and cannot
# record a row twice; rows with a missing name are skipped.
UC = variant_disease.index[
    variant_disease['diseaseName'].str.contains('[Uu]lcerative', regex=True, na=False)
].tolist()

In [83]:
# Distinct disease names captured by the current UC selection.
np.unique(variant_disease.loc[UC, 'diseaseName'])

array(['Ulcerative Colitis', "Ulcerative colitis or Crohn's"],
      dtype=object)

In [84]:
# Narrow the selection to names containing the literal "Ulcerative Colitis"
# (case-sensitive, which excludes "Ulcerative colitis or Crohn's").
# Vectorised substring match instead of a row-by-row loop; missing names are skipped.
UC = variant_disease.index[
    variant_disease['diseaseName'].str.contains('Ulcerative Colitis', regex=False, na=False)
].tolist()


In [85]:
# Confirm which disease names remain in the UC selection.
np.unique(variant_disease.loc[UC, 'diseaseName'])

array(['Ulcerative Colitis'], dtype=object)

In [86]:
# Map the selected UC rows to gene symbols via their SNP ids.
snpId = variant_disease.loc[UC, 'snpId']
variant_gene = pd.read_csv("../SAR/variant_to_gene_mappings.tsv.gz", compression='gzip', sep='\t')
# Vectorised membership test instead of scanning np.array(snpId) once per mapping
# row (quadratic). Missing SNP ids are dropped so they never count as a match.
chosen = variant_gene.index[variant_gene['snpId'].isin(snpId.dropna())].tolist()

In [87]:
# Unique gene symbols mapped from the selected UC variants: the UC GWAS gene set.
GWAS_UC = np.unique(variant_gene.loc[chosen, 'geneSymbol'])
len(GWAS_UC)

452

In [88]:
# First version of the per-dataset GWAS gene sets, using the narrow SCLE, COL and
# DLE sets for the lupus-type datasets. Entries are paired with `datasets` by position.
# NOTE: the following cell reassigns GWAS_list, so when the notebook is run top to
# bottom this version is replaced before it is used.
GWAS_list = [
    GWAS_UC, GWAS_UC, GWAS_UC, GWAS_lupus, GWAS_SCLE, GWAS_COL, GWAS_JM, GWAS_JM, GWAS_P, GWAS_AD,
    GWAS_CD, GWAS_CD, GWAS_SjS, GWAS_CD, GWAS_CD, GWAS_CD, GWAS_UC, GWAS_UC, GWAS_P, GWAS_RA,
    GWAS_LN, GWAS_AD, GWAS_AD, GWAS_SjS, GWAS_RA, GWAS_Diabetes, GWAS_CD, GWAS_UC, GWAS_DLE,
    GWAS_SCLE, GWAS_SyS, GWAS_SyS,
]

In [89]:
# Per-dataset GWAS gene sets: GWAS_list[i] is tested against datasets[i], so the
# order here must follow the order of the dataset files. Every lupus-type dataset
# uses the broad GWAS_lupus set.
GWAS_list = [
    GWAS_UC, GWAS_UC, GWAS_UC, GWAS_lupus, GWAS_lupus, GWAS_lupus, GWAS_JM, GWAS_JM, GWAS_P, GWAS_AD,
    GWAS_CD, GWAS_CD, GWAS_SjS, GWAS_CD, GWAS_CD, GWAS_CD, GWAS_UC, GWAS_UC, GWAS_P, GWAS_RA,
    GWAS_LN, GWAS_AD, GWAS_AD, GWAS_SjS, GWAS_RA, GWAS_Diabetes, GWAS_CD, GWAS_UC, GWAS_lupus,
    GWAS_lupus, GWAS_SyS, GWAS_SyS,
]

In [109]:
# Show the dataset file names; GWAS_list[i] is paired with datasets[i] by position.
# NOTE(review): `datasets` is assigned by the `!ls` cell that appears after this
# one, so on a fresh kernel that cell has to run first.
datasets

['GSE11223_UC_colon_inactive.csv',
 'GSE11223_UC_descending_colon.csv',
 'GSE11223_UC_sigmoid_colon.csv',
 'GSE112943_lupus_kidney.csv',
 'GSE112943_lupus_subacute_cutaneous.csv',
 'GSE148810_cSLE_skin.csv',
 'GSE148810_JM_skin.csv',
 'GSE148810_JM_skin_inactive.csv',
 'GSE14905_PSO_skin_inactive.csv',
 'GSE16161_AD_skin.csv',
 'GSE16879_CD_colon.csv',
 'GSE16879_CD_ileum.csv',
 'GSE176510_SS_kerato_conjunctivitis.csv',
 'GSE179285_CD_ascending_colon.csv',
 'GSE179285_CD_ascending_descending_colon_inactive.csv',
 'GSE179285_CD_terminal_ileum.csv',
 'GSE179285_UC_ascending_descending_colon_inactive.csv',
 'GSE179285_UC_sigmoid_colon.csv',
 'GSE181318_PSO_skin.csv',
 'GSE1919_RA_synovium.csv',
 'GSE32591_LN_glomeruli_compartment.csv',
 'GSE32924_AD_skin.csv',
 'GSE32924_AD_skin_inactive.csv',
 'GSE40568_SS_salivary_gland.csv',
 'GSE55235_RA_synovium.csv',
 'GSE66413_at_risk_T1D_pancreatic_lymph_nodes.csv',
 'GSE75214_CD_ileum_inactive.csv',
 'GSE75214_UC_colon_inactive.csv',
 'GSE81071_D

In [90]:
# List the per-dataset gene tables. GWAS_list is matched to this list by position,
# so the enrichment results depend on the order in which `ls` returns the files.
datasets = !ls background_genes_for_Martin/

In [103]:
# Pathway tables; the analysis reads their 'IngenuityCanonicalPathways' and
# 'AllMolecules' columns, and 'subclusters' from the tab-separated tree file.
# Alternative input: pd.read_csv('TreeStructure_nodes2_AID_noblood_all_pathways.txt', sep = '\t')
pathways_all = pd.read_csv(filepath_or_buffer='PathInfo_AID_noblood.csv')

# Alternative input: pd.read_csv('TreeStructure_nodes2_AID_noblood.txt', sep = '\t')
pathways_cluster1 = pd.read_csv(
    filepath_or_buffer='TreeStructure_nodes2_CLUSTER2_AID_noblood.txt',
    sep='\t',
)


In [95]:



# GWAS enrichment among pathway genes: one one-sided Fisher exact test per dataset.
#
# Pathway selection. This cell used to assign `pathways` twice in a row -- first the
# rows of `pathways_all` whose pathway is listed in `pathways_cluster1` (labelled
# "Cluster 1"), then the rows that are NOT listed (labelled "Cluster 2") -- so only
# the second assignment ever took effect. That effective behaviour is kept, but as
# one explicit switch instead of a dead assignment.
# NOTE(review): `pathways_cluster1` is read from a file with CLUSTER2 in its name;
# confirm which cluster the unlisted pathways really correspond to.
USE_LISTED_PATHWAYS = False  # True selects the pathways listed in pathways_cluster1
pathway_is_listed = pathways_all['IngenuityCanonicalPathways'].isin(
    pathways_cluster1['IngenuityCanonicalPathways'])
pathways = pathways_all.loc[
    pathway_is_listed if USE_LISTED_PATHWAYS else ~pathway_is_listed
].reset_index(drop=True)

# Genes annotated to any selected pathway ('AllMolecules' is a comma-separated
# string). This does not depend on the dataset, so it is computed once instead of
# once per dataset.
pathway_genes = np.unique(np.concatenate(
    [str(molecules).split(',') for molecules in pathways['AllMolecules']]))

odds_list = list()
pval_list = list()
names = list()
DEGs_count = list()
# GWAS_list[i] is paired with datasets[i] purely by position.
for i, dataset_file in enumerate(datasets):

    genes = pd.read_csv('background_genes_for_Martin/' + str(dataset_file))
    # The test universe for a dataset is its genes flagged isDE == 1.
    background_genes = np.unique(genes.loc[genes['isDE'] == 1, 'AllMeasuredGenes'])

    # "DEGs" here are the universe genes that belong to a selected pathway.
    DEGs = np.intersect1d(pathway_genes, background_genes)
    GWAS = np.unique(GWAS_list[i])

    # 2x2 contingency table: rows = in / not in a selected pathway,
    # columns = GWAS gene / not a GWAS gene.
    nonDEGs = list(set(background_genes) - set(DEGs))
    table11 = len(np.intersect1d(GWAS, DEGs))
    table12 = len(DEGs) - table11
    table21 = len(np.intersect1d(GWAS, nonDEGs))
    table22 = len(background_genes) - (table21 + table12 + table11)
    table = pd.DataFrame([[table11, table12], [table21, table22]])
    table.index = ('DEGs', 'nonDEGs')
    table.columns = ('GWAS', 'nonGWAS')

    # One-sided test: is the GWAS fraction higher inside the pathway genes?
    odds, pval = stats.fisher_exact(table, alternative = 'greater')
    odds_list.append(odds)
    pval_list.append(pval)
    names.append(dataset_file.split('.')[0])
    # Despite the list's name, this stores the size of the test universe.
    DEGs_count.append(int(len(background_genes)))
    

In [96]:
# One row per dataset. Building the frame column-wise keeps the gene counts as
# integers; stacking the three lists as rows and transposing coerced them to float.
GWAS_summary = pd.DataFrame(
    {'odds': odds_list, 'pval': pval_list, 'background_genes_count': DEGs_count},
    index=names,
)

In [97]:
# Most significant enrichments first.
GWAS_summary.sort_values(by='pval', ascending=True)

Unnamed: 0,odds,pval,background_genes_count
GSE112943_lupus_kidney,3.231743,1.171217e-22,11467.0
GSE112943_lupus_subacute_cutaneous,3.238647,1.233324e-17,7529.0
GSE16879_CD_colon,3.762721,1.663461e-14,3864.0
GSE179285_UC_sigmoid_colon,3.462475,2.434899e-14,6901.0
GSE179285_CD_ascending_colon,3.093969,2.791519e-13,5829.0
GSE55235_RA_synovium,3.583401,3.846921e-12,2282.0
GSE179285_CD_terminal_ileum,4.050067,2.878465e-11,3358.0
GSE11223_UC_sigmoid_colon,3.345693,5.721294e-11,6229.0
GSE148810_cSLE_skin,7.222222,2.209685e-10,584.0
GSE1919_RA_synovium,7.05726,5.69736e-07,406.0


In [114]:
#GWAS_summary.to_csv('GWAS_summary_AID_cluster1_oneSided.csv', index = True, header = True)

In [104]:
# GWAS enrichment per dataset AND per pathway subcluster: one one-sided Fisher exact
# test for every (dataset, subcluster) pair. Results are collected as one list of
# per-subcluster values per dataset, in the sorted order of the subcluster ids.
subcluster_ids = np.unique(pathways_cluster1['subclusters'])

# Genes annotated to each subcluster ('AllMolecules' is a comma-separated string).
# These do not depend on the dataset, so they are computed once up front.
subcluster_genes = {}
for j in subcluster_ids:
    subcluster_molecules = pathways_cluster1.loc[pathways_cluster1['subclusters'] == j, 'AllMolecules']
    subcluster_genes[j] = np.unique(np.concatenate(
        [str(molecules).split(',') for molecules in subcluster_molecules]))

odds_list2 = list()
pval_list2 = list()
names = list()
# GWAS_list[i] is paired with datasets[i] purely by position.
for i, dataset_file in enumerate(datasets):

    genes = pd.read_csv('background_genes_for_Martin/' + str(dataset_file))
    # The test universe for a dataset is its genes flagged isDE == 1.
    background_genes = np.unique(genes.loc[genes['isDE'] == 1, 'AllMeasuredGenes'])
    GWAS = np.unique(GWAS_list[i])

    # Per-dataset accumulators get their own names so that the global
    # odds_list / pval_list filled by the per-dataset cell are not overwritten.
    dataset_odds = list()
    dataset_pvals = list()

    for j in subcluster_ids:
        # "DEGs" here are the universe genes that belong to subcluster j.
        DEGs = np.intersect1d(subcluster_genes[j], background_genes)

        # 2x2 contingency table: rows = in / not in the subcluster,
        # columns = GWAS gene / not a GWAS gene.
        nonDEGs = list(set(background_genes) - set(DEGs))
        table11 = len(np.intersect1d(GWAS, DEGs))
        table12 = len(DEGs) - table11
        table21 = len(np.intersect1d(GWAS, nonDEGs))
        table22 = len(background_genes) - (table21 + table12 + table11)
        table = pd.DataFrame([[table11, table12], [table21, table22]])
        table.index = ('DEGs', 'nonDEGs')
        table.columns = ('GWAS', 'nonGWAS')
        odds, pval = stats.fisher_exact(table, alternative = 'greater')
        dataset_odds.append(odds)
        dataset_pvals.append(pval)
    odds_list2.append(dataset_odds)
    pval_list2.append(dataset_pvals)
    names.append(dataset_file.split('.')[0])


# Interpretation. The test is one-sided (alternative='greater'):
# pval < 0.05 means the GWAS genes are significantly over-represented among the
# subcluster genes (the odds ratio is then > 1).
# Under-representation (odds < 1) is NOT tested by this alternative; such cases
# simply get a large p-value. An odds ratio of inf or NaN comes from a zero cell
# in the 2x2 table.

In [105]:
# Sorted distinct subcluster ids; these define the column order of the result tables.
np.unique(pathways_cluster1.loc[:, 'subclusters'])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])

In [107]:
# Odds ratios: one row per dataset, one column per subcluster. The column labels
# come from the same sorted subcluster ids the enrichment loop iterates over,
# instead of assuming exactly 13 ids numbered 1..13.
Odds = pd.DataFrame(odds_list2, index = names, columns = np.unique(pathways_cluster1['subclusters']))
Odds

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
GSE11223_UC_colon_inactive,0.0,inf,0.0,,0.0,,0.0,,0.0,,,,
GSE11223_UC_descending_colon,0.0,1.508948,1.666102,0.0,3.633244,9.70398,7.894525,9.841975,3.324037,0.0,0.0,0.0,8.199588
GSE11223_UC_sigmoid_colon,0.0,1.795671,0.573238,0.0,3.755394,7.970628,7.580808,8.932551,2.991453,0.0,1.51775,1.386971,9.827419
GSE112943_lupus_kidney,0.0,1.763782,0.495177,1.229553,3.449156,8.642683,3.921458,12.91744,3.175739,0.0,1.661706,0.0,7.166559
GSE112943_lupus_subacute_cutaneous,0.0,1.566637,1.329444,1.513412,3.151134,8.299694,4.209498,11.726468,2.797442,0.0,2.056593,0.0,7.994518
GSE148810_cSLE_skin,0.0,3.445946,1.023529,,3.512315,7.149362,1.489796,2.645217,2.899814,0.0,0.0,0.0,
GSE148810_JM_skin,,0.0,0.0,,inf,0.0,0.0,0.0,0.0,,,,
GSE148810_JM_skin_inactive,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
GSE14905_PSO_skin_inactive,0.0,1.847074,0.717002,0.0,2.926456,8.861237,2.087454,6.728198,3.45608,0.0,4.890063,0.0,10.772093
GSE16161_AD_skin,0.0,0.827131,0.0,0.0,2.059069,5.237949,4.246048,35.62706,3.944157,0.0,3.040014,0.0,10.327957


In [None]:
#Odds.to_csv('Odds_summary_AID_cluster1_oneSided.csv', index = True, header = True)

In [108]:
# P-values: one row per dataset, one column per subcluster. The column labels
# come from the same sorted subcluster ids the enrichment loop iterates over,
# instead of assuming exactly 13 ids numbered 1..13.
Pval = pd.DataFrame(pval_list2, index = names, columns = np.unique(pathways_cluster1['subclusters']))
Pval

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
GSE11223_UC_colon_inactive,1.0,0.153846,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
GSE11223_UC_descending_colon,1.0,0.122388,0.348895,1.0,1.960427e-07,1.284692e-09,2.925774e-06,0.114943,0.008260076,1.0,1.0,1.0,0.132788
GSE11223_UC_sigmoid_colon,1.0,0.011893,0.826043,1.0,1.501099e-10,1.271943e-10,4.683024e-08,0.02737,0.005338156,1.0,0.491415,0.52182,0.023466
GSE112943_lupus_kidney,1.0,0.000486,0.909743,0.564527,1.9722029999999997e-19,1.668346e-24,7.292251e-06,0.000125,1.412339e-05,1.0,0.348848,1.0,0.04092
GSE112943_lupus_subacute_cutaneous,1.0,0.012717,0.334843,0.495161,1.968367e-13,9.589318999999999e-20,1.28757e-05,0.000888,0.0005405892,1.0,0.26722,1.0,0.035572
GSE148810_cSLE_skin,1.0,0.000526,0.644829,1.0,4.905888e-05,1.247308e-09,0.3621798,0.048012,0.1202333,1.0,1.0,1.0,1.0
GSE148810_JM_skin,1.0,1.0,1.0,1.0,0.1374705,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
GSE148810_JM_skin_inactive,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
GSE14905_PSO_skin_inactive,1.0,0.029836,0.754697,1.0,1.899562e-05,1.089947e-09,0.1867286,0.154171,0.003952115,1.0,0.200143,1.0,0.105589
GSE16161_AD_skin,1.0,0.729422,1.0,1.0,0.008739057,0.000273558,0.008329304,0.000139,0.003013417,1.0,0.285679,1.0,0.097845


In [224]:
#Pval.to_csv('Pval_summary_AID_cluster1_oneSided.csv', index = True, header = True)

In [110]:
# Sanity check of the one-sided Fisher exact test on a small hand-made 2x2 table.
table = pd.DataFrame(np.array([[10, 5],
                               [20, 40]]))
stats.fisher_exact(table, alternative='greater')

(4.0, 0.02026285317016011)

In [111]:
# Show the current 2x2 contingency table.
table

Unnamed: 0,0,1
0,10,5
1,20,40


In [118]:
# Same proportions as a 10/5/20/40 table but with every count doubled: the odds
# ratio is unchanged while the p-value shrinks with the larger sample.
table = pd.DataFrame(np.array([[20, 10],
                               [40, 80]]))
stats.fisher_exact(table, alternative='greater')

(4.0, 0.000960157963979419)

In [119]:
# Show the current 2x2 contingency table.
table

Unnamed: 0,0,1
0,20,10
1,40,80
