In [1]:
import pandas as pd

### Get list of symptom nodes

In [2]:
# read list of symptom nodes from hetionet
commit = 'a7036a37302973b15ab949aab4056d9bc062910e'
url = 'https://raw.githubusercontent.com/dhimmel/mesh/' + commit + '/data/symptoms.tsv'
symptoms_df = pd.read_table(url)
symptoms_df = symptoms_df.rename(columns={'mesh_id':'mesh_code', 'mesh_name':'mesh_term'})
symptoms_df = symptoms_df.assign(mesh_code = symptoms_df.mesh_code.apply(lambda x: 'MESH:' + x))
symptoms_df.head(2)

Unnamed: 0,mesh_code,mesh_term,in_hsdn
0,MESH:D000006,"Abdomen, Acute",1
1,MESH:D000270,Adie Syndrome,0


In [3]:
# read list of mesh symptoms in edges from hetionet
commit = '60d611892bf387b5b23c5f2e2e3bc472cfce85f3'
url = 'https://raw.githubusercontent.com/dhimmel/medline/' + commit + '/data/disease-symptom-cooccurrence.tsv'
edges_df = pd.read_table(url)
edges_df = edges_df.rename(columns={'mesh_id':'mesh_code', 'mesh_name':'mesh_term'})
edges_df = (edges_df.assign(mesh_code = edges_df.mesh_code.apply(lambda x: 'MESH:' + x)))
edges_df.head(2)

Unnamed: 0,doid_code,doid_name,mesh_code,mesh_term,cooccurrence,expected,enrichment,odds_ratio,p_fisher
0,DOID:10652,Alzheimer's disease,MESH:D004314,Down Syndrome,800,35.619601,22.459544,39.918352,0.0
1,DOID:10652,Alzheimer's disease,MESH:D008569,Memory Disorders,1593,76.580532,20.801631,41.885877,0.0


In [4]:
# are all in the total list of symptoms? yes
(
        len(symptoms_df.mesh_code.unique()), 
        len(edges_df.mesh_code.unique()), 
        len(set(list(symptoms_df.mesh_code)) & set(list(edges_df.mesh_code)))
)

(438, 426, 426)

In [5]:
# normalize hetionet symptom nodes from mesh to hp
# load hp2mesh from HPO
path = "/home/nuria/workspace/repurposing/rephetio-su/orphanet/data/"
hp2mesh_df = pd.read_table("{}hp-hp2mesh-mappings.tsv".format(path))
hp2mesh_df.head(2)

Unnamed: 0,hp_code,hp_term,mesh_code
0,HP:0009829,Phocomelia,MESH:D004480
1,HP:0012622,Chronic kidney disease,MESH:D051436


In [6]:
symptoms_df = symptoms_df.merge(hp2mesh_df, on='mesh_code', how='left')
symptoms_df.head(5)

Unnamed: 0,mesh_code,mesh_term,in_hsdn,hp_code,hp_term
0,MESH:D000006,"Abdomen, Acute",1,,
1,MESH:D000270,Adie Syndrome,0,,
2,MESH:D000326,Adrenoleukodystrophy,0,,
3,MESH:D000334,Aerophagy,1,,
4,MESH:D000370,Ageusia,1,HP:0000224,Decreased taste sensation


In [7]:
symptoms_df = (symptoms_df
    .assign(normalized = symptoms_df
        [["hp_code", "mesh_code"]]
        .apply(lambda row: row["hp_code"] if not pd.isnull(row["hp_code"]) else row["mesh_code"], axis = 1)
    )
)
symptoms_df.head()

Unnamed: 0,mesh_code,mesh_term,in_hsdn,hp_code,hp_term,normalized
0,MESH:D000006,"Abdomen, Acute",1,,,MESH:D000006
1,MESH:D000270,Adie Syndrome,0,,,MESH:D000270
2,MESH:D000326,Adrenoleukodystrophy,0,,,MESH:D000326
3,MESH:D000334,Aerophagy,1,,,MESH:D000334
4,MESH:D000370,Ageusia,1,HP:0000224,Decreased taste sensation,HP:0000224


In [8]:
# alternative way of normalizing ids in one column
symptoms_df['alternative'] = (symptoms_df[["hp_code","mesh_code"]]
                              .apply(lambda x: x.hp_code if not pd.isnull(x.hp_code) else x.mesh_code, axis=1)
                             
                             )
symptoms_df.head(2)

Unnamed: 0,mesh_code,mesh_term,in_hsdn,hp_code,hp_term,normalized,alternative
0,MESH:D000006,"Abdomen, Acute",1,,,MESH:D000006,MESH:D000006
1,MESH:D000270,Adie Syndrome,0,,,MESH:D000270,MESH:D000270


In [9]:
(symptoms_df["alternative"] == symptoms_df["normalized"]).all()

True

In [10]:
# read list of symptoms from orphanet
url = '/home/nuria/workspace/repurposing/rephetio-su/orphanet/data/orphanet-mondo-disease-symptom.tsv'
orpha_df = pd.read_table(url)
orpha_df.head(2)

Unnamed: 0,orphanet_code,orphanet_term,hp_code,hp_term
0,DOID:10923,Sickle cell anemia,HP:0004870,Chronic hemolytic anemia
1,DOID:10923,Sickle cell anemia,HP:0001878,Hemolytic anemia


In [11]:
# merge all symptom sets
hetionet_df = symptoms_df[['normalized', 'mesh_term']]
orphanet_df = orpha_df[['hp_code', 'hp_term']]
hetionet_df = hetionet_df.rename(columns={'normalized':'symptom_id', 'mesh_term':'symptom_name'})
orphanet_df = orphanet_df.rename(columns={'hp_code':'symptom_id', 'hp_term':'symptom_name'})
all_df = pd.concat([hetionet_df, orphanet_df])
all_symptoms_df = all_df.drop_duplicates()
print('#symptoms: {}'.format(len(all_symptoms_df)))
all_symptoms_df.head()

#symptoms: 4997


Unnamed: 0,symptom_id,symptom_name
0,MESH:D000006,"Abdomen, Acute"
1,MESH:D000270,Adie Syndrome
2,MESH:D000326,Adrenoleukodystrophy
3,MESH:D000334,Aerophagy
4,HP:0000224,Ageusia


In [12]:
all_symptoms_df.to_csv('data/symptoms.tsv', sep='\t', index=False, header=True)

### Normalize symptoms in symptom edges from hetionet

In [13]:
edges_df.doid_code.nunique(), edges_df.mesh_code.nunique()

(133, 426)

In [14]:
# map
edges_df = (edges_df
                .merge(hp2mesh_df, on = 'mesh_code', how = 'left')
           )
edges_df.doid_code.nunique(), edges_df.mesh_code.nunique()

(133, 426)

In [15]:
# integrate
edges_df = (edges_df
                .assign(symptom_id = 
                        edges_df[["hp_code","mesh_code"]]
                            .apply(lambda row: row.hp_code if not pd.isnull(row.hp_code) else row.mesh_code, axis = 1))
           )

In [16]:
edges_df.head()

Unnamed: 0,doid_code,doid_name,mesh_code,mesh_term,cooccurrence,expected,enrichment,odds_ratio,p_fisher,hp_code,hp_term,symptom_id
0,DOID:10652,Alzheimer's disease,MESH:D004314,Down Syndrome,800,35.619601,22.459544,39.918352,0.0,,,MESH:D004314
1,DOID:10652,Alzheimer's disease,MESH:D008569,Memory Disorders,1593,76.580532,20.801631,41.885877,0.0,HP:0002354,Memory impairment,HP:0002354
2,DOID:10652,Alzheimer's disease,MESH:D011595,Psychomotor Agitation,334,15.235665,21.922247,35.277329,0.0,,,MESH:D011595
3,DOID:10652,Alzheimer's disease,MESH:D000647,Amnesia,307,14.061215,21.833106,34.890099,4.277452e-314,,,MESH:D000647
4,DOID:10652,Alzheimer's disease,MESH:D006816,Huntington Disease,255,12.130614,21.021195,32.630035,8.215868e-256,,,MESH:D006816


In [17]:
edges_df = (edges_df
               .assign(symptom_name =
                           edges_df[['mesh_term', 'hp_term']]
                               .apply(lambda row: row.hp_term if not pd.isnull(row.hp_term) else row.mesh_term, axis = 1)
                      )
           )

In [18]:
edges_df.head()

Unnamed: 0,doid_code,doid_name,mesh_code,mesh_term,cooccurrence,expected,enrichment,odds_ratio,p_fisher,hp_code,hp_term,symptom_id,symptom_name
0,DOID:10652,Alzheimer's disease,MESH:D004314,Down Syndrome,800,35.619601,22.459544,39.918352,0.0,,,MESH:D004314,Down Syndrome
1,DOID:10652,Alzheimer's disease,MESH:D008569,Memory Disorders,1593,76.580532,20.801631,41.885877,0.0,HP:0002354,Memory impairment,HP:0002354,Memory impairment
2,DOID:10652,Alzheimer's disease,MESH:D011595,Psychomotor Agitation,334,15.235665,21.922247,35.277329,0.0,,,MESH:D011595,Psychomotor Agitation
3,DOID:10652,Alzheimer's disease,MESH:D000647,Amnesia,307,14.061215,21.833106,34.890099,4.277452e-314,,,MESH:D000647,Amnesia
4,DOID:10652,Alzheimer's disease,MESH:D006816,Huntington Disease,255,12.130614,21.021195,32.630035,8.215868e-256,,,MESH:D006816,Huntington Disease


In [19]:
edges_df = (
    edges_df
    [['doid_code', 'doid_name', 'symptom_id', 
      'symptom_name', 'cooccurrence', 'expected', 
      'enrichment', 'odds_ratio', 'p_fisher']]
           )

In [20]:
edges_df.head()

Unnamed: 0,doid_code,doid_name,symptom_id,symptom_name,cooccurrence,expected,enrichment,odds_ratio,p_fisher
0,DOID:10652,Alzheimer's disease,MESH:D004314,Down Syndrome,800,35.619601,22.459544,39.918352,0.0
1,DOID:10652,Alzheimer's disease,HP:0002354,Memory impairment,1593,76.580532,20.801631,41.885877,0.0
2,DOID:10652,Alzheimer's disease,MESH:D011595,Psychomotor Agitation,334,15.235665,21.922247,35.277329,0.0
3,DOID:10652,Alzheimer's disease,MESH:D000647,Amnesia,307,14.061215,21.833106,34.890099,4.277452e-314
4,DOID:10652,Alzheimer's disease,MESH:D006816,Huntington Disease,255,12.130614,21.021195,32.630035,8.215868e-256


In [21]:
# save
edges_df.to_csv('data/medline_edges.tsv', sep='\t', index=False, header=True)

In [22]:
edges_df.doid_code.nunique(), edges_df.symptom_id.nunique()

(133, 521)