In [1]:
import pandas as pd
import numpy as np
from pronto import Ontology

## Create a table with Disease concept ID, disease ontology ID, and names

In [2]:
# open human disease ontology obo file
do = Ontology('../data/HumanDO.obo')

In [3]:
# open CREEDS disease signature file
creeds = pd.read_csv(../data/Diseases/disease_signatures-v1.0.csv',
                    sep=',', header=0, index_col=0)

In [4]:
creeds.head(5)

Unnamed: 0_level_0,cell_type,ctrl_ids,curator,disease_name,do_id,geo_id,organism,pert_ids,platform,umls_cui,version
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
dz:328,Muscle - Striated (Skeletal) (MMHCC),GSM4372|GSM4373|GSM4374|GSM4375|GSM4376,Joel.Dudley,Duchenne muscular dystrophy,DOID:11723,GSE466,mouse,GSM4377|GSM4378|GSM4379|GSM4380|GSM4381,GPL81,C0013264,1.0
dz:325,Ganglioneuroblastoma,GSM282582|GSM282583|GSM282587|GSM282588|GSM282589,Joel.Dudley,Nicotine addiction,,GSE11208,human,GSM282584|GSM282585|GSM282586|GSM282590|GSM282...,GPL570,C0028043,1.0
dz:324,Nose,GSM286649|GSM286650|GSM286651|GSM286655|GSM286...,Joel.Dudley,Rhinovirus infection,,GSE11348,human,GSM286646|GSM286647|GSM286648|GSM286652|GSM286...,GPL570,C0276447,1.0
dz:323,Myocardial tissue,GSM82393|GSM82394|GSM82395|GSM82396|GSM82397|G...,Joel.Dudley,dilated cardiomyopathy,DOID:12930,GSE3586,human,GSM82408|GSM82409|GSM82410|GSM82411|GSM82412|G...,GPL3050,C0007193,1.0
dz:322,Peripheral blood mononuclear cell,GSM66671|GSM66672|GSM66673|GSM66674|GSM66675|G...,Joel.Dudley,bacterial infectious disease,DOID:104,GSE3026,human,GSM66617|GSM66618|GSM66619|GSM66620|GSM66621|G...,GPL8300,C0004623,1.0


### creating the function to get the related diseases from Disease ontology 

In [5]:
# get disease parent function
def get_parent_ids(term_name):
    term = do[term_name]
    ancestors = list(term.superclasses())
    ancestor_ids = []
    for a in ancestors:
        ancestor_ids.append(a.id)
    return ancestor_ids

In [6]:
# get the concept id of the disease function
def get_concept_ids(term_name):
    term = do[term_name]
    concept_ids = []
    for xref in list(term.xrefs):
        if 'UMLS_CUI' in str(xref):
            ugly = str(xref).split(':')[1]
            ugly = ugly[:-2]
            concept_ids.append(ugly)
    return concept_ids

In [7]:
# get the disease name function
def get_disease_name(term_name):
    term = do[term_name]
    disease = term.name
    return disease

In [8]:
# get the off-spring of the disease function
def get_offSpring_ids(term_name):
    term = do[term_name]
    offSprings = list(term.subclasses())
    offSpring_ids = []
    for a in offSprings:
        offSpring_ids.append(a.id)
    return offSpring_ids

In [9]:
# remove null data
DOID =creeds['do_id']
fil = DOID.notna()
DOID = DOID[fil]

### create a dictionary from CREEDS data (DOIDs and dz-ids)

In [10]:
doid_dic = DOID.to_dict()

In [11]:
dz_dic = {v: k for k, v in doid_dic.items()}

### creating a table for each group (disease parents, offsprings) with Identifiers and names

### Offsprings

In [12]:
off_results = pd.DataFrame()
for ID in DOID:
    table = pd.DataFrame()
    conc = []
    doid = []
    con = []
    dz_id = []
    term_name = do[ID]
    term_name = term_name.id
    offspring_names = get_offSpring_ids(term_name)
    offspring_names
    for offspring_names_ in offspring_names:
        conc = get_concept_ids(offspring_names_)
        if len(conc) < 1:
            con.append('nan')
            doid.append(offspring_names_)
            dz_id.append(dz_dic[ID])
        else:
            if len(conc) > 1 :
                for e in conc:
                    con.append(e)
                    doid.append(offspring_names_)
                    dz_id.append(dz_dic[ID])
            if len(conc) == 1:
                con.append(conc)
                doid.append(offspring_names_)
                dz_id.append(dz_dic[ID])
    table['disease id'] = dz_id
    table['concepts'] = con
    table['doid'] = doid
    
    term_name= term_name.replace (':', '-')
    off_results = pd.concat([off_results, table])
off_results = off_results.reset_index(drop=True)

In [13]:
off_results

Unnamed: 0,disease id,concepts,doid
0,dz:72,[C0013264],DOID:11723
1,dz:678,[C0007193],DOID:12930
2,dz:678,,DOID:0110423
3,dz:678,,DOID:0110424
4,dz:678,,DOID:0110425
...,...,...,...
10112,dz:655,[C1332315],DOID:8428
10113,dz:655,,DOID:0050870
10114,dz:655,[C0154061],DOID:8826
10115,dz:655,[C0851140],DOID:8991


In [14]:
# removing null rows with concepts
off_results = off_results[off_results.concepts != 'nan']

In [15]:
# create the disease names list 
# removing the '\r' at the end of the names
diseases = []
for dis in off_results['doid']:
    diseases.append(get_disease_name(dis).rstrip())

In [16]:
# adding the disease names column to the table
off_results['diseases'] = diseases
# adjusting the index
off_results = off_results.reset_index()
off_results = off_results.drop(columns = ['index'])
# removing ('[]') from the concept IDs
for l in range (len(off_results['concepts'])):
    if len (off_results['concepts'][l] ) == 1:
        off_results['concepts'][l] = off_results['concepts'][l][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  off_results['diseases'] = diseases


In [17]:
off_results

Unnamed: 0,disease id,concepts,doid,diseases
0,dz:72,C0013264,DOID:11723,Duchenne muscular dystrophy
1,dz:678,C0007193,DOID:12930,dilated cardiomyopathy
2,dz:678,C0877208,DOID:9997,peripartum cardiomyopathy
3,dz:414,C0004623,DOID:104,bacterial infectious disease
4,dz:414,C0264743,DOID:1586,rheumatic fever
...,...,...,...,...
5341,dz:655,C0861352,DOID:3010,lobular neoplasia
5342,dz:655,C1332315,DOID:8428,breast apocrine carcinoma in situ
5343,dz:655,C0154061,DOID:8826,colon carcinoma in situ
5344,dz:655,C0851140,DOID:8991,cervix uteri carcinoma in situ


###  Ancestors

In [18]:
ancestors_results = pd.DataFrame()
for ID in DOID:
    table = pd.DataFrame()
    conc = []
    doid = []
    con = []
    dz_id = []
    term_name = do[ID]
    term_name = term_name.id
    ancestor_names = get_parent_ids(term_name)
    ancestor_names
    for ancestor_name_ in ancestor_names:
        conc = get_concept_ids(ancestor_name_)
        if len(conc) < 1:
            con.append('nan')
            doid.append(ancestor_name_)
            dz_id.append(dz_dic[ID])
        else:
            if len(conc) > 1 :
                for e in conc:
                    con.append(e)
                    doid.append(ancestor_name_)
                    dz_id.append(dz_dic[ID])
            if len(conc) == 1:
                con.append(conc)
                doid.append(ancestor_name_)
                dz_id.append(dz_dic[ID])
    table['disease id'] = dz_id
    table['concepts'] = con
    table['doid'] = doid
    term_name= term_name.replace (':', '-')
    ancestors_results = pd.concat([ancestors_results, table])
ancestors_results = ancestors_results.reset_index(drop=True)

In [19]:
ancestors_results = ancestors_results[ancestors_results.concepts != 'nan']

In [20]:
ancestors_results

Unnamed: 0,disease id,concepts,doid
0,dz:72,[C0013264],DOID:11723
1,dz:72,[C0026850],DOID:9884
2,dz:72,[C0026848],DOID:423
5,dz:72,[C0026857],DOID:17
7,dz:72,[C0012634],DOID:4
...,...,...,...
5356,dz:656,C0017185,DOID:3119
5358,dz:656,[C0006826],DOID:162
5360,dz:656,[C0012634],DOID:4
5361,dz:655,[C0007099],DOID:8719


In [21]:
# create the disease names list 
# removing the '\r' at the end of the names
diseases = []
for dis in ancestors_results['doid']:
    diseases.append(get_disease_name(dis).rstrip())

In [22]:
# adding the disease names column to the table
ancestors_results['diseases'] = diseases
# adjusting the index
ancestors_results = ancestors_results.reset_index()
ancestors_results = ancestors_results.drop(columns = ['index'])
# removing ('[]') from the concept IDs
for l in range (len(ancestors_results['concepts'])):
    if len (ancestors_results['concepts'][l] ) == 1:
        ancestors_results['concepts'][l] = ancestors_results['concepts'][l][0]

In [23]:
ancestors_results

Unnamed: 0,disease id,concepts,doid,diseases
0,dz:72,C0013264,DOID:11723,Duchenne muscular dystrophy
1,dz:72,C0026850,DOID:9884,muscular dystrophy
2,dz:72,C0026848,DOID:423,myopathy
3,dz:72,C0026857,DOID:17,musculoskeletal system disease
4,dz:72,C0012634,DOID:4,disease
...,...,...,...,...
4248,dz:656,C0017185,DOID:3119,gastrointestinal system cancer
4249,dz:656,C0006826,DOID:162,cancer
4250,dz:656,C0012634,DOID:4,disease
4251,dz:655,C0007099,DOID:8719,in situ carcinoma


In [24]:
# concatenat both tables in one
total_table = pd.concat([ancestors_results, off_results])
total_table = total_table.drop_duplicates()
total_table = total_table.reset_index()
total_table = total_table.drop(columns = ['index'])

In [25]:
total_table #all disease IDs with names

Unnamed: 0,disease id,concepts,doid,diseases
0,dz:72,C0013264,DOID:11723,Duchenne muscular dystrophy
1,dz:72,C0026850,DOID:9884,muscular dystrophy
2,dz:72,C0026848,DOID:423,myopathy
3,dz:72,C0026857,DOID:17,musculoskeletal system disease
4,dz:72,C0012634,DOID:4,disease
...,...,...,...,...
2865,dz:655,C0861352,DOID:3010,lobular neoplasia
2866,dz:655,C1332315,DOID:8428,breast apocrine carcinoma in situ
2867,dz:655,C0154061,DOID:8826,colon carcinoma in situ
2868,dz:655,C0851140,DOID:8991,cervix uteri carcinoma in situ


### disease ID with corresponding concept ID table

In [26]:
concept_dz = pd.DataFrame()
concept_dz['disease id'] = total_table['disease id']
concept_dz['concept id'] = total_table['concepts']
concept_dz

Unnamed: 0,disease id,concept id
0,dz:72,C0013264
1,dz:72,C0026850
2,dz:72,C0026848
3,dz:72,C0026857
4,dz:72,C0012634
...,...,...
2865,dz:655,C0861352
2866,dz:655,C1332315
2867,dz:655,C0154061
2868,dz:655,C0851140


In [27]:
concept_dz.drop_duplicates(inplace=True)

In [28]:
concept_dz

Unnamed: 0,disease id,concept id
0,dz:72,C0013264
1,dz:72,C0026850
2,dz:72,C0026848
3,dz:72,C0026857
4,dz:72,C0012634
...,...,...
2865,dz:655,C0861352
2866,dz:655,C1332315
2867,dz:655,C0154061
2868,dz:655,C0851140


In [29]:
concept_dz['Indicator'] = 1
concept_dz.head()

Unnamed: 0,disease id,concept id,Indicator
0,dz:72,C0013264,1
1,dz:72,C0026850,1
2,dz:72,C0026848,1
3,dz:72,C0026857,1
4,dz:72,C0012634,1


In [30]:
df = pd.pivot_table(concept_dz, index='concept id', columns='disease id', values='Indicator')
df[df.isna()] = 0
df = df.astype(int)
df

disease id,dz:101,dz:1011,dz:1012,dz:102,dz:1036,dz:104,dz:1045,dz:1047,dz:1048,dz:1049,...,dz:939,dz:943,dz:948,dz:95,dz:955,dz:966,dz:979,dz:98,dz:983,dz:984
concept id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001247,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001308,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C4273988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C4288544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C4289586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C4289808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
total_results.to_csv('../data/results//DOIDs_CONCEPTs_diseases.csv')

In [32]:
total_table.to_csv('../data/results/All_DOIDs_CONCEPTs_diseases.csv')

In [33]:
df.to_csv('../data/results/Concept_dz.csv')