In [29]:
import pandas as pd
import os
import gzip
import io
import json
import xml.etree.ElementTree as ET

## Symptoms

####### Next, we created a symptom set of 438 MeSH terms by finding all descendants of D012816 (Signs and Symptoms)  (mesh notebook). [https://thinklab.com/discussion/mining-knowledge-from-medline-articles-and-their-indexed-mesh-terms/67]

In [8]:
# Read MESH symptoms input used in Daniels cooccurrence dis-sym data mining (MESH 2015)
# source: https://github.com/dhimmel/medline/blob/gh-pages/symptoms.ipynb
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'
symptom_df = pd.read_table(url)
# len(symptom_df.mesh_id.unique()) # 438
symptom_df.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,D000006,"Abdomen, Acute",1
1,D000270,Adie Syndrome,0


In [9]:
# Read NGLY1 MESH symptoms
path = '/home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/ngly1/data/'
ngly1_sym_df = pd.read_table('{}manual-curation/ngly1-mesh.tsv'.format(path), header=None)
ngly1_sym_df.columns = ['mesh_id', 'mesh_name']
ngly1_sym_df.head(2)

Unnamed: 0,mesh_id,mesh_name
0,MESH:D009123,Muscle Hypotonia
1,MESH:C562827,Alacrima


In [25]:
#  NGLY1 symptoms not included in Daniel's symptoms input list
mesh_sym_set = set(list(symptom_df.mesh_id))
ngly1_sym_set = set(list(ngly1_sym_df.mesh_id))
intersection = (mesh_sym_set & ngly1_sym_set)
ngly1_new_set = (ngly1_sym_set - intersection)
print('The number of common symptoms is: {}.\nThe number of symptoms from NGLY1 not included in Daniel"s mining is: {}'.format(len(intersection), len(ngly1_new_set)))

The number of common symptoms is: 0.
The number of symptoms from NGLY1 not included in Daniel"s mining is: 15


In [30]:
# Read MESH terms 2017 release
# Read MeSH xml release
xml_path = os.path.join('download', 'desc2017.gz')
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

In [33]:
# Extract mesh terms
term_dicts = list()
for descriptor in root:
    for concept in descriptor.findall('ConceptList/Concept'):
        for term in concept.findall('TermList/Term'):
            term_dict = {
                'DescriptorUI': descriptor.findtext('DescriptorUI'),
                'ConceptUI': concept.findtext('ConceptUI'),
                'TermUI': term.findtext('TermUI'),
                'TermName': term.findtext('String')
            }
            term_dict.update(concept.attrib)
            term_dict.update(term.attrib)
            term_dicts.append(term_dict)

columns = ['DescriptorUI', 'ConceptUI', 'PreferredConceptYN', 'TermUI', 'TermName',
           'ConceptPreferredTermYN', 'IsPermutedTermYN', 'LexicalTag', 'PrintFlagYN', 'RecordPreferredTermYN']
term_df = pd.DataFrame(term_dicts)[columns]
term_df.to_csv('data/mesh2017-descriptor-terms.tsv', index=False, sep='\t')

KeyError: "['PrintFlagYN'] not in index"

In [None]:
# Test whether MeSH term names are unique
len(term_df) == len(set(term_df.TermName))

In [34]:
# Parse MeSH xml release
terms = list()

for elem in root:
    term = dict()
    term['mesh_id'] = elem.findtext('DescriptorUI')
    term['mesh_name'] = elem.findtext('DescriptorName/String')
    term['semantic_types'] = list({x.text for x in elem.findall(
        'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI')})
    term['tree_numbers'] = [x.text for x in elem.findall('TreeNumberList/TreeNumber')]
    terms.append(term)

In [36]:
path = os.path.join('data', 'mesh2017.json')
with open(path, 'w') as write_file:
    json.dump(terms, write_file, indent=2)

In [38]:
# Read mesh
path = os.path.join('data', 'mesh2017.json')
with open(path) as read_file:
    mesh = json.load(read_file)

mesh_df = pd.DataFrame.from_dict(mesh)[['mesh_id', 'mesh_name']]
mesh_df.to_csv('data/mesh2017-terms.tsv', sep='\t', index=False)

In [39]:
# Extract (mesh_id, mesh_tree_number) pairs
rows = []
for term in mesh:
    mesh_id = term['mesh_id']
    mesh_name = term['mesh_name']
    for tree_number in term['tree_numbers']:
        rows.append([mesh_id, mesh_name, tree_number])

tn_df = pd.DataFrame(rows, columns=['mesh_id', 'mesh_name', 'mesh_tree_number'])
tn_df.to_csv('data/mesh2017-tree-numbers.tsv', sep='\t', index=False)