In [103]:
import pandas as pd
import os
import gzip
import io
import json
import xml.etree.ElementTree as ET

## Symptoms

####### Next, we created a symptom set of 438 MeSH terms by finding all descendants of D012816 (Signs and Symptoms)  (mesh notebook). [https://thinklab.com/discussion/mining-knowledge-from-medline-articles-and-their-indexed-mesh-terms/67]

In [104]:
# Read MESH symptoms input used in Daniels cooccurrence dis-sym data mining (MESH 2015)
# source: https://github.com/dhimmel/medline/blob/gh-pages/symptoms.ipynb
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'
symptom_df = pd.read_table(url)
# len(symptom_df.mesh_id.unique()) # 438
symptom_df['mesh_id'] = 'MESH:' + symptom_df['mesh_id'].astype(str)
symptom_df.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,MESH:D000006,"Abdomen, Acute",1
1,MESH:D000270,Adie Syndrome,0


In [121]:
# Read NGLY1 MESH symptoms
path = '/home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/ngly1/data/'
ngly1_sym_df = pd.read_table('{}manual-curation/ngly1-mesh.tsv'.format(path), header=None)
ngly1_sym_df.columns = ['mesh_id', 'mesh_name']
ngly1_sym_df

Unnamed: 0,mesh_id,mesh_name
0,MESH:D009123,Muscle Hypotonia
1,MESH:C562827,Alacrima
2,MESH:D003248,Constipation
3,MESH:D012021,Abnormal Deep Tendon Reflex
4,MESH:D008831,Microcephaly
5,MESH:D005317,IUGR (intrauterine growth restriction)
6,MESH:D065906,Hyperlactatemia
7,MESH:D012640,Seizures
8,MESH:D001072,Apraxias
9,MESH:D003320,Corneal Ulcer


In [123]:
# NGLY1 symptoms not included in Daniel's symptoms input list
mesh_sym_set = set(list(symptom_df.mesh_id))
ngly1_sym_set = set(list(ngly1_sym_df.mesh_id))
intersection = (mesh_sym_set & ngly1_sym_set)
ngly1_new_set = (ngly1_sym_set - intersection)
print('The number of common symptoms is: {}.\nThe number of symptoms from NGLY1 not included in Daniel"s mining is: {}'.format(len(intersection), len(ngly1_new_set)))
data = {'mesh_id': list(ngly1_new_set)}
ngly1_new_df = pd.DataFrame.from_dict(data)
ngly1_new_df = ngly1_new_df.merge(ngly1_sym_df, how='left', on='mesh_id')
print('which are those that are not childs from "signs or symptoms": {}'.format(ngly1_new_df))

The number of common symptoms is: 6.
The number of symptoms from NGLY1 not included in Daniel"s mining is: 9
which are those that are not childs from "signs or symptoms":         mesh_id                               mesh_name
0  MESH:C562827                                Alacrima
1  MESH:D007567                       Neonatal Jaundice
2  MESH:D012600                               Scoliosis
3  MESH:D005317  IUGR (intrauterine growth restriction)
4  MESH:D013285                              Strabismus
5  MESH:D008831                            Microcephaly
6  MESH:D003320                           Corneal Ulcer
7  MESH:D017043                              Chalazions
8  MESH:D008103                          Liver Fibrosis


In [107]:
# Read MESH terms 2017 release
# Read MeSH xml release
xml_path = os.path.join('download', 'desc2017.gz')
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

In [108]:
# Extract mesh terms
term_dicts = list()
for descriptor in root:
    for concept in descriptor.findall('ConceptList/Concept'):
        for term in concept.findall('TermList/Term'):
            term_dict = {
                'DescriptorUI': descriptor.findtext('DescriptorUI'),
                'ConceptUI': concept.findtext('ConceptUI'),
                'TermUI': term.findtext('TermUI'),
                'TermName': term.findtext('String')
            }
            term_dict.update(concept.attrib)
            term_dict.update(term.attrib)
            term_dicts.append(term_dict)

columns = ['DescriptorUI', 'ConceptUI', 'PreferredConceptYN', 'TermUI', 'TermName',
           'ConceptPreferredTermYN', 'IsPermutedTermYN', 'LexicalTag', 'RecordPreferredTermYN']
term_df = pd.DataFrame(term_dicts)[columns]
term_df.to_csv('data/mesh2017-descriptor-terms.tsv', index=False, sep='\t')
term_df.head(2)

Unnamed: 0,DescriptorUI,ConceptUI,PreferredConceptYN,TermUI,TermName,ConceptPreferredTermYN,IsPermutedTermYN,LexicalTag,RecordPreferredTermYN
0,D000001,M0000001,Y,T000002,Calcimycin,Y,N,NON,Y
1,D000001,M0353609,N,T000001,A-23187,Y,N,LAB,N


In [109]:
# Test whether MeSH term names are unique
len(term_df) == len(set(term_df.TermName))

True

In [110]:
# Parse MeSH xml release
terms = list()

for elem in root:
    term = dict()
    term['mesh_id'] = elem.findtext('DescriptorUI')
    term['mesh_name'] = elem.findtext('DescriptorName/String')
    term['semantic_types'] = list({x.text for x in elem.findall(
        'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI')})
    term['tree_numbers'] = [x.text for x in elem.findall('TreeNumberList/TreeNumber')]
    terms.append(term)

In [111]:
path = os.path.join('data', 'mesh2017.json')
with open(path, 'w') as write_file:
    json.dump(terms, write_file, indent=2)

In [112]:
# Read mesh
path = os.path.join('data', 'mesh2017.json')
with open(path) as read_file:
    mesh = json.load(read_file)

mesh_df = pd.DataFrame.from_dict(mesh)[['mesh_id', 'mesh_name']]
mesh_df.to_csv('data/mesh2017-terms.tsv', sep='\t', index=False)

In [113]:
# Extract (mesh_id, mesh_tree_number) pairs
rows = []
for term in mesh:
    mesh_id = term['mesh_id']
    mesh_name = term['mesh_name']
    for tree_number in term['tree_numbers']:
        rows.append([mesh_id, mesh_name, tree_number])

tn_df = pd.DataFrame(rows, columns=['mesh_id', 'mesh_name', 'mesh_tree_number'])
tn_df.to_csv('data/mesh2017-tree-numbers.tsv', sep='\t', index=False)

In [114]:
# len(mesh_df.mesh_id.unique()) # 28472
mesh_df.head(2)

Unnamed: 0,mesh_id,mesh_name
0,D000001,Calcimycin
1,D000002,Temefos


In [115]:
mesh_df['mesh_id'] = 'MESH:' + mesh_df['mesh_id'].astype(str)
mesh_df.head(2)

Unnamed: 0,mesh_id,mesh_name
0,MESH:D000001,Calcimycin
1,MESH:D000002,Temefos


In [116]:
# different terms mesh 2015-2017
mesh2017_df = mesh_df
# Read MeSH 2015 terms to MeSH names
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
mesh2015_df = pd.read_table(url)
mesh2015_df['mesh_id'] = 'MESH:' + mesh2015_df['mesh_id'].astype(str)
mesh2015_set = set(list(mesh2015_df.mesh_id))
mesh2017_set = set(list(mesh2017_df.mesh_id))
intersection = (mesh2015_set & mesh2017_set)
mesh2017_new_set = (mesh2017_set - intersection)
print('New entries from the new release: {}'.format(len(mesh2017_new_set)))

New entries from the new release: 1068


In [120]:
# NGLY1 symptoms not included in MESH 2015
mesh_sym_set = set(list(mesh2015_df.mesh_id))
intersection = (mesh_sym_set & ngly1_sym_set)
ngly1_new_set = (ngly1_sym_set - intersection)
print('The number of common symptoms is: {}.\nThe number of symptoms from NGLY1 not included in MESH 2015 is: {}, which is: {}'.format(len(intersection), len(ngly1_new_set), ngly1_new_set))


The number of common symptoms is: 14.
The number of symptoms from NGLY1 not included in MESH 2015 is: 1, which is: {'MESH:C562827'}


In [118]:
# NGLY1 symptoms not included in MESH 2017
mesh_sym_set = set(list(mesh2017_df.mesh_id))
intersection = (mesh_sym_set & ngly1_sym_set)
ngly1_new_set = (ngly1_sym_set - intersection)
print('The number of common symptoms is: {}.\nThe number of symptoms from NGLY1 not included in MESH 2017 is: {}, which is: {}'.format(len(intersection), len(ngly1_new_set), ngly1_new_set))

The number of common symptoms is: 14.
The number of symptoms from NGLY1 not included in MESH 2017 is: 1, which is: {'MESH:C562827'}


In [119]:
# different symptoms terms 2015-2017