In [1]:
from xml.etree import cElementTree as elemtree
from datetime import date

"""
Use this to parse XML from MeSH (Medical Subject Headings). More information 
on the format at: http://www.ncbi.nlm.nih.gov/mesh
End users will primarily want to call the `parse_mesh` function and do something
with the output.
"""

def parse_mesh(filename):
    """Parse a mesh file, successively generating
    `DescriptorRecord` instance for subsequent processing."""
    for _evt, elem in elemtree.iterparse(filename):
        if elem.tag == 'DescriptorRecord':
            yield DescriptorRecord.from_xml_elem(elem)

def date_from_mesh_xml(xml_elem):
    year = xml_elem.find('./Year').text
    month = xml_elem.find('./Month').text
    day = xml_elem.find('./Day').text
    return date(int(year), int(month), int(day))

class PharmacologicalAction(object):
    """A pharmacological action, denoting the effects of a MeSH descriptor."""
    
    def __init__(self, descriptor_ui):
        self.descriptor_ui = descriptor_ui
    
    @classmethod
    def from_xml_elem(cls, elem):
        descriptor_ui = elem.find('./DescriptorReferredTo/DescriptorUI')
        return cls(descriptor_ui)

class SlotsToNoneMixin(object):
    def __init__(self, **kwargs):
        for attr in self.__slots__:
            setattr(self, attr, kwargs.get(attr, None))
    
    def __repr__(self):
        attrib_repr = ', '.join(u'%s=%r' % (attr, getattr(self, attr)) for attr in self.__slots__)
        return self.__class__.__name__ + '(' + attrib_repr + ')'

class Term(SlotsToNoneMixin):
    """A term from within a MeSH concept."""

    __slots__ = ('term_ui', 'string', 'is_concept_preferred', 'is_record_preferred',
      'is_permuted', 'lexical_tag', 'date_created', 'thesaurus_list')
    
    @classmethod
    def from_xml_elem(cls, elem):
        term = cls()
        term.is_concept_preferred = elem.get('ConceptPreferredTermYN', None) == 'Y'
        term.is_record_preferred = elem.get('RecordPreferredTermYN', None) == 'Y'
        term.is_permuted = elem.get('IsPermutedTermYN', None) == 'Y'
        term.lexical_tag = elem.get('LexicalTag')
        for child_elem in elem:
            if child_elem.tag == 'TermUI':
                term.term_ui = child_elem.text
            elif child_elem.tag == 'String':
                term.string = child_elem.text
                #term.name = [th_elem.text for th_elem in child_elem]
            elif child_elem.tag == 'DateCreated':
                term.date_created = date_from_mesh_xml(child_elem)
            elif child_elem.tag == 'ThesaurusIDlist':
                term.thesaurus_list = [th_elem.text for th_elem in child_elem]
        return term

class SemanticType(SlotsToNoneMixin):
    __slots__ = ('ui', 'name')
    
    @classmethod
    def from_xml_elem(cls, elem):
        sem_type = cls()
        for child_elem in elem:
            if child_elem.tag == 'SemanticTypeUI':
                sem_type.ui = child_elem.text
            elif child_elem.tag == 'SemanticTypeName':
                sem_type.name = child_elem.text

class Concept(SlotsToNoneMixin):
    """A concept within a MeSH Descriptor."""
    __slots__ = ( 'ui', 'name', 'is_preferred', 'umls_ui', 'casn1_name', 'registry_num', 
      'scope_note', 'sem_types', 'terms')
    
    @classmethod
    def from_xml_elem(cls, elem):
        concept = cls()
        concept.is_preferred = elem.get('PreferredConceptYN', None) == 'Y'
        for child_elem in elem:
            if child_elem.tag == 'ConceptUI':
                concept.ui = child_elem.text
            elif child_elem.tag == 'ConceptName':
                concept.name = child_elem.find('./String').text
            elif child_elem.tag == 'ConceptUMLSUI':
                concept.umls_ui
            elif child_elem.tag == 'CASN1Name':
                concept.casn1_name = child_elem.text
            elif child_elem.tag == 'RegistryNumber':
                concept.registry_num = child_elem.text
            elif child_elem.tag == 'ScopeNote':
                concept.scope_note = child_elem.text
            elif child_elem.tag == 'SemanticTypeList':
                concept.sem_types = [SemanticType.from_xml_elem(st_elem)
                  for st_elem in child_elem.findall('SemanticType')]
            elif child_elem.tag == 'TermList':
                concept.terms = [Term.from_xml_elem(term_elem)
                  for term_elem in child_elem.findall('Term')]
        return concept

class DescriptorRecord(SlotsToNoneMixin):
    "A MeSH Descriptor Record."""
    
    __slots__ = ('ui', 'name', 'date_created', 'date_revised', 'pharm_actions', 
      'tree_numbers', 'concepts')
    
    @classmethod
    def from_xml_elem(cls, elem):
        rec = cls()
        for child_elem in elem:
            if child_elem.tag == 'DescriptorUI':
                rec.ui = child_elem.text
            elif child_elem.tag == 'DescriptorName':
                rec.name = child_elem.find('./String').text
            elif child_elem.tag == 'DateCreated':
                rec.date_created = date_from_mesh_xml(child_elem)
            elif child_elem.tag == 'DateRevised':
                rec.date_revised = date_from_mesh_xml(child_elem)
            elif child_elem.tag == 'TreeNumberList':
                rec.tree_numbers = [tn_elem.text
                  for tn_elem in child_elem.findall('TreeNumber')]
            elif child_elem.tag == 'ConceptList':
                rec.concepts = [Concept.from_xml_elem(c_elem) 
                  for c_elem in child_elem.findall('Concept')]
            elif child_elem.tag == 'PharmacologicalActionList':
                rec.pharm_actions = [PharmacologicalAction.from_xml_elem(pa_elem) 
                  for pa_elem in child_elem.findall('PharmacologicalAction')]
        return rec

In [37]:
#data=parse_mesh('C:\\Users\\Richard.Geoghegan\\Documents\\mdm_db\\edw\\dev\\input\\desc2017.xml')
data=parse_mesh('C:\\Users\\Richard.Geoghegan\\Downloads\\desc2023.xml')
#D002292=[item for item in data if item.ui=='D002292']

In [32]:
curr=next(data)
curr

StopIteration: 

In [31]:
#counter=1
curr=[next(data)]

def xml_to_list(counter):
    try:
        while True:
            counter +=1
            if (counter % 1000==0):
                print('Progress report...', counter, )
                
            curr=[next(data)]
    except StopIteration:
        pass
    finally:
        return counter

count=xml_to_list(0)
count


Progress report... 1000
Progress report... 2000
Progress report... 3000
Progress report... 4000
Progress report... 5000
Progress report... 6000
Progress report... 7000
Progress report... 8000
Progress report... 9000
Progress report... 10000
Progress report... 11000
Progress report... 12000
Progress report... 13000
Progress report... 14000
Progress report... 15000
Progress report... 16000
Progress report... 17000
Progress report... 18000
Progress report... 19000
Progress report... 20000
Progress report... 21000
Progress report... 22000
Progress report... 23000
Progress report... 24000
Progress report... 25000
Progress report... 26000
Progress report... 27000
Progress report... 28000
Progress report... 29000
Progress report... 30000


30452

In [16]:
#curr=[next(data)]
#curr
ui, name, conc = [[item.ui, item.name, item.concepts, item.tree_numbers or 'EMPTY'] for item in curr ]

ValueError: not enough values to unpack (expected 3, got 1)

In [3]:
treetops = ['F','C']
data=parse_mesh('C:\\Users\\Richard.Geoghegan\\Downloads\\desc2023.xml')

res=[]
#curr=[next(data)]

def xml_to_list(counter):
    try:
        while True:
            curr=[next(data)]
            counter +=1
            #ui, name, conc = [[item.ui, item.name, item.concepts, item.tree_numbers or 'EMPTY'] for item in curr ][0]
            for i, val in enumerate([[item.ui, item.name, item.concepts, item.tree_numbers] 
                                     for item in curr 
                                     if any(s for s in str(item.tree_numbers or 'EMPTY') if any(xs in s for xs in treetops))]):
                #print('start: ', i, conc)
                ui=val[0]
                #print(ui)
                name=val[1]
                conc=val[2]
                #print(conc)
                #tree=val[2]
                tree = [item for item in val[3] if any(s for s in item if any(xs in s for xs in treetops))]
                #top = min([s.split('.')[0] for s in tree])
                top = list(set(item.split('.')[0] for item in tree))
                #print(top)
                for j in range(len(conc)):
                    #print('next: ',i,j)
                    for k in range(len(conc[j].terms)):
                        #print(i,j,k)
                        #print(counter, top, ui, name, conc[j].terms[k].term_ui, conc[j].terms[k].string )
                        res.append([top, ui, name, conc[j].terms[k].term_ui, conc[j].terms[k].string])
                        #curr=[next(data)]
                        if (counter % 1000==0):
                            print('Progress report...', counter, i,j,k)
    except StopIteration:
        print(ui)
        print(name)
        print(conc)
        print(tree)
        print(top)
        pass
    finally:
        return res

res=xml_to_list(0)

import csv
import os

data_dir = 'C:\\Users\\Richard.Geoghegan\\Documents\\NLP\\MeSH'
data_file='mesh_terms_treetop_test'
data_file = os.path.join(data_dir, data_file+'.csv')
data_file

with open(data_file, 'w', newline='') as f:
    wr = csv.writer(f, delimiter ='|')
    wr.writerows(res)

Progress report... 5000 0 0 0
Progress report... 5000 0 0 1
Progress report... 5000 0 0 2
Progress report... 5000 0 0 3
Progress report... 5000 0 0 4
Progress report... 5000 0 0 5
Progress report... 5000 0 0 6
Progress report... 5000 0 0 7
Progress report... 5000 0 0 8
Progress report... 5000 0 0 9
Progress report... 5000 0 1 0
Progress report... 5000 0 1 1
Progress report... 5000 0 1 2
Progress report... 5000 0 1 3
Progress report... 5000 0 1 4
Progress report... 5000 0 1 5
Progress report... 5000 0 1 6
Progress report... 5000 0 2 0
Progress report... 5000 0 2 1
Progress report... 5000 0 2 2
Progress report... 5000 0 2 3
Progress report... 5000 0 2 4
Progress report... 5000 0 2 5
Progress report... 5000 0 2 6
Progress report... 5000 0 3 0
Progress report... 5000 0 3 1
Progress report... 5000 0 3 2
Progress report... 5000 0 3 3
Progress report... 5000 0 3 4
Progress report... 5000 0 3 5
Progress report... 5000 0 4 0
Progress report... 5000 0 4 1
Progress report... 5000 0 4 2
Progress r

In [10]:
#Glioblastoma
#[item for item in res if item[1]=='D005909']
#Non-alcoholic Fatty Liver Disease
[item for item in res if item[1]=='D065626']


[[['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747320',
  'Non-alcoholic Fatty Liver Disease'],
 [['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747320',
  'Non alcoholic Fatty Liver Disease'],
 [['C06'], 'D065626', 'Non-alcoholic Fatty Liver Disease', 'T747417', 'NAFLD'],
 [['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747319',
  'Nonalcoholic Fatty Liver Disease'],
 [['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747321',
  'Fatty Liver, Nonalcoholic'],
 [['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747321',
  'Fatty Livers, Nonalcoholic'],
 [['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747321',
  'Liver, Nonalcoholic Fatty'],
 [['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747321',
  'Livers, Nonalcoholic Fatty'],
 [['C06'],
  'D065626',
  'Non-alcoholic Fatty Liver Disease',
  'T747321',
  'Nonalcoholic Fatty Liver'],
 [['C06'],
  'D065626',
  'Non-a

In [4]:
ui, name, conc = [[item.ui, item.name, item.concepts, item.tree_numbers or 'EMPTY'] for item in next(data) ]

NameError: name 'data' is not defined

In [629]:
[item[-1].replace(',','').split() for item in res if item[1]=='D000163']

[['Acquired', 'Immunodeficiency', 'Syndrome'],
 ['Immunologic', 'Deficiency', 'Syndrome', 'Acquired'],
 ['Acquired', 'Immune', 'Deficiency', 'Syndrome'],
 ['Acquired', 'Immuno-Deficiency', 'Syndrome'],
 ['Acquired', 'Immuno', 'Deficiency', 'Syndrome'],
 ['Acquired', 'Immuno-Deficiency', 'Syndromes'],
 ['Immuno-Deficiency', 'Syndrome', 'Acquired'],
 ['Immuno-Deficiency', 'Syndromes', 'Acquired'],
 ['Syndrome', 'Acquired', 'Immuno-Deficiency'],
 ['Syndromes', 'Acquired', 'Immuno-Deficiency'],
 ['Immunodeficiency', 'Syndrome', 'Acquired'],
 ['Acquired', 'Immunodeficiency', 'Syndromes'],
 ['Immunodeficiency', 'Syndromes', 'Acquired'],
 ['Syndrome', 'Acquired', 'Immunodeficiency'],
 ['Syndromes', 'Acquired', 'Immunodeficiency'],
 ['AIDS']]

In [567]:
xss=[item[-1].replace(',','').split() for item in res if item[1]=='D002292']
list(set().union(*xss))


['Collecting',
 'Nephroid',
 'Tumor',
 'Duct',
 'Hypernephroid',
 'Cell',
 'Kidneys',
 'the',
 'Renal',
 'Sarcomatoid',
 'Kidney',
 'Chromophil',
 'Adenocarcinoma',
 'Carcinomas',
 'Hypernephromas',
 'of',
 'Carcinoma',
 'Chromophobe',
 'Of',
 'Clear',
 'Cancers',
 'Adenocarcinomas',
 'Cancer',
 '(Kidney)',
 'Grawitz',
 'Papillary',
 'Hypernephroma']

In [538]:
dict = {}
for elem in res:
    if elem[1] not in dict:
        dict[elem[1]] = ''
    dict[elem[1]].append(elem[2:])

dict

#for key in dict:
#  dict[key] = [sum(i) for i in zip(*dict[key])]



{'D000006': [['Abdomen, Acute', 'T000013', 'Abdomen, Acute'],
  ['Abdomen, Acute', 'T000013', 'Abdomens, Acute'],
  ['Abdomen, Acute', 'T000013', 'Acute Abdomen'],
  ['Abdomen, Acute', 'T000013', 'Acute Abdomens']],
 'D000007': [['Abdominal Injuries', 'T000015', 'Abdominal Injuries'],
  ['Abdominal Injuries', 'T000014', 'Injuries, Abdominal'],
  ['Abdominal Injuries', 'T000014', 'Abdominal Injury'],
  ['Abdominal Injuries', 'T000014', 'Injury, Abdominal']],
 'D000008': [['Abdominal Neoplasms', 'T000016', 'Abdominal Neoplasms'],
  ['Abdominal Neoplasms', 'T000016', 'Abdominal Neoplasm'],
  ['Abdominal Neoplasms', 'T000016', 'Neoplasm, Abdominal'],
  ['Abdominal Neoplasms', 'T000016', 'Neoplasms, Abdominal']],
 'D000012': [['Abetalipoproteinemia', 'T000024', 'Abetalipoproteinemia'],
  ['Abetalipoproteinemia',
   'T365903',
   'Microsomal Triglyceride Transfer Protein Deficiency Disease'],
  ['Abetalipoproteinemia', 'T811395', 'Acanthocytosis'],
  ['Abetalipoproteinemia', 'T811395', 'Acan

In [482]:
[[item.ui, item.name, item.concepts, item.tree_numbers] for item in curr if any(s for s in item.tree_numbers if any(xs in s for xs in treetops))]

[]

In [483]:
#elem.get('ConceptPreferredTermYN', None)
if any(s for s in str(item.tree_numbers or 'EMPTY') if any(xs in s for xs in treetops))
[[item.ui, item.name, item.concepts, str(item.tree_numbers or 'EMPTY')] for item in curr ]

[['D000004',
  'Abbreviations as Topic',
  [Concept(ui='M0000004', name='Abbreviations as Topic', is_preferred=True, umls_ui=None, casn1_name=None, registry_num=None, scope_note='Works about shortened forms of written words or phrases used for brevity.\n    ', sem_types=None, terms=[Term(term_ui='T698652', string='Abbreviations as Topic', is_concept_preferred=True, is_record_preferred=True, is_permuted=False, lexical_tag='NON', date_created=datetime.date(2007, 5, 31), thesaurus_list=['NLM (2008)'])]),
   Concept(ui='M0511063', name='Acronyms as Topic', is_preferred=False, umls_ui=None, casn1_name=None, registry_num=None, scope_note=None, sem_types=None, terms=[Term(term_ui='T701041', string='Acronyms as Topic', is_concept_preferred=True, is_record_preferred=False, is_permuted=False, lexical_tag='NON', date_created=datetime.date(2007, 6, 29), thesaurus_list=['NLM (2008)'])])],
  "['L01.559.598.400.556.131']"]]

In [419]:
import csv
import os

data_dir = 'C:\\Users\\Richard.Geoghegan\\ML\\textmining\\dim_indication'
data_file='mesh_terms'
data_file = os.path.join(data_dir, data_file+'.csv')
data_file

with open(data_file, 'w', newline='') as f:
    wr = csv.writer(f, delimiter ='|')
    wr.writerows(res)

In [381]:
treetops = ['F','D']
matching = [s for s in l if any(xs in s for xs in matchers)]
any(s for s in l if any(xs in s for xs in matchers))

True

In [406]:
l=['D02.033.100.624.698.025', 'D05.033.755.624.698.025', 'D05.092.063.624.698.025']
[s.split('.')[0] for s in l]

['D02', 'D05', 'D05']

In [136]:
D002292[0]

DescriptorRecord(ui='D002292', name='Carcinoma, Renal Cell', date_created=datetime.date(1984, 5, 29), date_revised=datetime.date(2021, 6, 30), pharm_actions=None, tree_numbers=['C04.557.470.200.025.390', 'C04.588.945.947.535.160', 'C12.050.351.937.820.535.160', 'C12.050.351.968.419.473.160', 'C12.200.758.820.750.160', 'C12.200.777.419.473.160', 'C12.900.820.535.160', 'C12.950.419.473.160', 'C12.950.983.535.160'], concepts=[Concept(ui='M0003442', name='Carcinoma, Renal Cell', is_preferred=True, umls_ui=None, casn1_name=None, registry_num=None, scope_note='A heterogeneous group of sporadic or hereditary carcinoma derived from cells of the KIDNEYS. There are several subtypes including the clear cells, the papillary, the chromophobe, the collecting duct, the spindle cells (sarcomatoid), or mixed cell-type carcinoma.\n    ', sem_types=None, terms=[Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=False, lexical_tag='NON', date_created=datetim

In [144]:
dir(D002292[0].concepts)
D002292[0].tree_numbers

['C04.557.470.200.025.390',
 'C04.588.945.947.535.160',
 'C12.050.351.937.820.535.160',
 'C12.050.351.968.419.473.160',
 'C12.200.758.820.750.160',
 'C12.200.777.419.473.160',
 'C12.900.820.535.160',
 'C12.950.419.473.160',
 'C12.950.983.535.160']

In [139]:
D002292[0].concepts


[Concept(ui='M0003442', name='Carcinoma, Renal Cell', is_preferred=True, umls_ui=None, casn1_name=None, registry_num=None, scope_note='A heterogeneous group of sporadic or hereditary carcinoma derived from cells of the KIDNEYS. There are several subtypes including the clear cells, the papillary, the chromophobe, the collecting duct, the spindle cells (sarcomatoid), or mixed cell-type carcinoma.\n    ', sem_types=None, terms=[Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=False, lexical_tag='NON', date_created=datetime.date(1999, 1, 1), thesaurus_list=['NLM (1985)']), Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=True, lexical_tag='NON', date_created=None, thesaurus_list=None), Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=True, lexical_tag='NON', date_created=None, thesaurus_list=None), Term(term_ui=None, string=None, is_concept_p

In [137]:
[item for item in [item.terms for item in D002292[0].concepts]]

[[Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=False, lexical_tag='NON', date_created=datetime.date(1999, 1, 1), thesaurus_list=['NLM (1985)']),
  Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=True, lexical_tag='NON', date_created=None, thesaurus_list=None),
  Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=True, lexical_tag='NON', date_created=None, thesaurus_list=None),
  Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=False, lexical_tag='NON', date_created=datetime.date(1984, 5, 29), thesaurus_list=['UNK (19XX)']),
  Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted=True, lexical_tag='NON', date_created=None, thesaurus_list=None),
  Term(term_ui=None, string=None, is_concept_preferred=False, is_record_preferred=False, is_permuted

In [20]:
l=['aa','bb','cc']
a,b,c=l
print(a)

aa


In [23]:
l = [[item.ui, item.name, item.concepts, item.tree_numbers or 'EMPTY'] for item in curr ]
print('describe l: {}, {}'.format(type(l), len(l)))

describe l: <class 'list'>, 1


In [25]:
l[0]

['D000001',
 'Calcimycin',
 [Concept(ui='M0000001', name='Calcimycin', is_preferred=True, umls_ui=None, casn1_name='4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6S-(6alpha(2S*,3S*),8beta(R*),9beta,11alpha))-', registry_num='37H9VM9WZL', scope_note='An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports CALCIUM and other divalent cations across membranes and uncouples oxidative phosphorylation while inhibiting ATPase of rat liver mitochondria. The substance is used mostly as a biochemical tool to study the role of divalent cations in various biological systems.\n    ', sem_types=None, terms=[Term(term_ui='T000002', string='Calcimycin', is_concept_preferred=True, is_record_preferred=True, is_permuted=False, lexical_tag='NON', date_created=datetime.date(1999, 1, 1), thesaurus_list=['FDA SRS (2014)', 'NLM (1975)'])]),
  Concept(ui='M0353609', nam

In [52]:
tree=['C04.557.465.625.600.380.080.335',
   'C04.557.470.670.380.080.335',
   'C04.557.580.625.600.380.080.335']

list(set(item.split('.')[0] for item in tree))

['C04']

In [45]:
tree[0]

'C04.557.465.625.600.380.080.335'