Purpose:
* Getting basic info about pubmed XML structure.
* Testing scripts for parsing XML files from pubmed.

In [5]:
# Set working directory.
# The default is the original author's local path. Set the PLANT_SCI_WD
# environment variable to point the notebook at a different location
# without editing this cell.
import os
from pathlib import Path
wd = Path(os.environ.get(
    "PLANT_SCI_WD",
    '/mnt/d/project/plant_sci_history/1_obtaining_corpus/'))

## Target tags

For each article:
* `ISOAbbreviation` for journal name
* `ArticleTitle`
* `AbstractText`
* `PubMedPubDate PubStatus="pubmed"`: to flag the start of date info
  * `Year`
  * `Month`
  * `Day`
* `ArticleId IdType="pubmed"`: this holds PMID

In [6]:
# Tag names, written without the leading "<", used to recognise the
# lines of interest while scanning the XML file.
AR = "PubmedArticle"      # marks the start of a new article
TI = "ArticleTitle"       # article title
AB = "AbstractText"       # abstract text
JO = "ISOAbbreviation"    # journal name

# Opening tag that flags the start of the date info to keep.
DA = 'PubMedPubDate PubStatus="pubmed"'
# Closing tag of a date block. Blocks with any other PubStatus end with
# this very same tag.
DAe = "/PubMedPubDate"

# Date components found inside a date block.
YR, MO, DY = "Year", "Month", "Day"

# Element holding the PMID.
PM = 'ArticleId IdType="pubmed"'

In [7]:
# Rid of the tags
def get_value(L):
    """Return the text enclosed by the outermost tag pair on one line.

    The value is everything between the first ">" (end of the opening
    tag) and the last "</" (start of the closing tag). Anchoring on the
    last "</" keeps inline child markup such as "<i>...</i>" inside the
    value instead of cutting the value off at the first child tag.

    Args:
        L: a single line such as "<Year>2020</Year>".

    Returns:
        The enclosed text. If the line has no ">" the result is "". If
        there is no closing tag on the line, everything after the
        opening tag is returned.
    """
    # partition never raises: with no ">" present the remainder is "".
    _, _, rest = L.partition(">")
    end = rest.rfind("</")
    return rest if end == -1 else rest[:end]

In [8]:
# Parse the test XML file line by line and collect one entry per article:
#   pubmed_d = {pmid: [TI, AB, JO, YR, MO, DY]}
# The scan relies on each element of interest starting on its own line.
pubmed_d = {}

test_xml  = "test.xml"
c         = 0                     # number of articles seen
fields    = ["","","","","",""]   # [TI, AB, JO, YR, MO, DY]

# Whether the DA begin tag was found before encountering a DA end tag.
flag_DA   = 0
PMID      = ""

# The context manager closes the file even if parsing raises.
with open(wd / "_test" / test_xml, "r") as input_obj:
    for L in input_obj:
        L = L.strip()
        if L.startswith(f"<{AR}>"):
            # New article: start from a fresh list so entries already
            # stored in pubmed_d are not overwritten, and clear the
            # per-article state.
            fields  = ["","","","","",""]
            PMID    = ""
            flag_DA = 0
            c += 1
        elif L.startswith(f"<{TI}"):
            fields[0] = get_value(L)
        elif L.startswith(f"<{AB}"):
            # An article can carry several AbstractText lines; join them
            # instead of keeping only the last one.
            ab_text   = get_value(L)
            fields[1] = f"{fields[1]} {ab_text}" if fields[1] else ab_text
        elif L.startswith(f"<{JO}"):
            fields[2] = get_value(L)
        elif L.startswith(f"<{DA}"):
            flag_DA = 1
        # Year/Month/Day only count inside the PubStatus="pubmed" block.
        elif L.startswith(f"<{YR}") and flag_DA == 1:
            fields[3] = get_value(L)
        elif L.startswith(f"<{MO}") and flag_DA == 1:
            fields[4] = get_value(L)
        elif L.startswith(f"<{DY}") and flag_DA == 1:
            fields[5] = get_value(L)
        # Encounter the date end tag when a corresponding begin tag exists
        elif L.startswith(f"<{DAe}") and flag_DA == 1:
            flag_DA = 0
        # Only the first pubmed ArticleId of an article is taken as its
        # PMID. Later ones within the same record (e.g. IDs of cited
        # references) must not be stored under this article's fields.
        elif L.startswith(f"<{PM}") and PMID == "":
            PMID = get_value(L)
            if PMID not in pubmed_d:
                pubmed_d[PMID] = fields
            else:
                print("ERR: redundant PMID,", PMID)

print("# articles:",c)

# articles: 5


## Test filtering to get plant science articles

Assumptions about a plant science article:
* Journal name contains any of the following:
  * plant, plants, botany, botanical
* Or the title/abstract mentions any plant taxonomic term, down to the genus level.

### Taxonomy name file
names.dmp
* tax_id: id of node
* name_txt: name itself
* unique name: unique variant of this name
* name class: synonym, common name etc.

In [16]:
# The following function:
# 1. Gets the tax_id of the target taxon (Viridiplantae by default).
# 2. Creates a dictionary name_dic with:
#   {tax_id:{name_class:[names]}
def get_name_dict(names_dmp_file, target="Viridiplantae"):
    """Parse a names.dmp-style file found under wd / "taxonomy".

    Args:
        names_dmp_file: file name, resolved relative to wd / "taxonomy".
        target: name whose tax_id should be reported. Defaults to
            "Viridiplantae", the value that used to be hard-coded.

    Returns:
        (target_id, names_dic). target_id is the tax_id of the last row
        whose name equals `target`, or "" if no row matches. names_dic
        maps tax_id -> {name_class: [names]}.

    Raises:
        IndexError: if a non-blank row has fewer than 7 tab-separated
            tokens.
    """
    target_id = ""
    names_dic = {}
    # The context manager closes the file even if a row is malformed.
    with open(wd / "taxonomy" / names_dmp_file) as names_dmp:
        for line in names_dmp:
            row = line.strip()
            if not row:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            cols = row.split("\t")
            # Fields sit at even indices after splitting on tabs; the odd
            # ones are presumably the "|" separators of the dump format.
            tax_id = cols[0]
            name   = cols[2]
            name_c = cols[6]
            if name == target:
                print(f"{target} tax_id:", tax_id)
                target_id = tax_id

            names_dic.setdefault(tax_id, {}).setdefault(name_c, []).append(name)
    return target_id, names_dic

In [18]:
# Build the name dictionary from test_name.dmp and keep the tax_id that
# get_name_dict reports for Viridiplantae.
target_id, names_dic = get_name_dict("test_name.dmp")

Viridiplantae tax_id: 33090


In [19]:
# Dump each tax_id followed by its {name_class: [names]} mapping.
for taxon, names_by_class in names_dic.items():
    print("\n", taxon)
    print(names_by_class)


 33083
{'blast name': ['cellular slime molds'], 'scientific name': ['Dictyostelia'], 'synonym': ['Dictyostelida', 'Dictyostelids', 'Dictyosteliida'], 'genbank common name': ['dictyostelid cellular slime molds']}

 33084
{'authority': ['Entamoebidae Chatton 1925'], 'scientific name': ['Entamoebidae'], 'synonym': ['Entamoebida']}

 33085
{'authority': ['Entamoeba invadens Rodhaim, 1934'], 'scientific name': ['Entamoeba invadens']}

 33090
{'authority': ['Chlorobionta Jeffrey, 1982', 'Chloroplastida Adl et al. 2005', 'Viridiplantae Cavalier-Smith, 1981'], 'synonym': ['Chlorobionta', 'Chloroplastida'], 'equivalent name': ['Chlorophyta/Embryophyta group', 'chlorophyte/embryophyte group'], 'blast name': ['green plants'], 'common name': ['green plants'], 'scientific name': ['Viridiplantae']}

 33091
{'authority': ['Raphidonema Lagerheim, 1892'], 'scientific name': ['Raphidonema']}

 33092
{'scientific name': ['Koliella longiseta'], 'authority': ['Koliella longiseta (Vischer) Hindak, 1963', '

### Taxonomy node file

nodes.dmp file consists of taxonomy nodes. The description for each node includes the following columns:
* tax_id -- node id in GenBank taxonomy database
* parent tax_id -- parent node id in GenBank taxonomy database
* rank -- rank of this node (superkingdom, kingdom, ...)
* embl code -- locus-name prefix; not unique
* division id -- see division.dmp file
* inherited div flag (1 or 0) -- 1 if node inherits division from parent
* genetic code id -- see gencode.dmp file
* inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
* mitochondrial genetic code id -- see gencode.dmp file
* inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
* GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
* hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
* comments -- free-text comments and citations

In [45]:
# Parse nodes.dmp into three lookups:
#   rank_d      : rank -> number of nodes carrying that rank
#   paren_child : parent tax_id -> [child tax_ids]
#   child_paren : child tax_id -> parent tax_id
nodes_dmp_file = "nodes.dmp"

rank_d      = {}
paren_child = {}
child_paren = {}

# The context manager closes the file even if a row is malformed.
with open(wd / "taxonomy" / nodes_dmp_file) as nodes_dmp:
    for L in nodes_dmp:
        if not L.strip():
            # Tolerate blank lines (e.g. a trailing newline).
            continue
        L = L.strip().split("\t")
        tax_id = L[0]
        par_id = L[2]
        rank   = L[4]
        # Every node is counted, including those filtered out below.
        rank_d[rank] = rank_d.get(rank, 0) + 1

        # Don't want any species or taxon with no rank
        # NOTE(review): a skipped node is never listed as a child, yet
        # its ranked children are still keyed under it in paren_child,
        # so a traversal from an ancestor cannot reach them. Confirm
        # this is intended.
        if rank not in ["no rank", "species"]:
            paren_child.setdefault(par_id, []).append(tax_id)
            if tax_id not in child_paren:
                child_paren[tax_id] = par_id
            else:
                print(f"ERR: {tax_id} with >1 parents",
                      child_paren[tax_id], par_id)

In [46]:
# Preview the first 10 parent -> children entries.
c = 0
for c, p in enumerate(paren_child, start=1):
    print(p, paren_child[p])
    if c == 10:
        break

131567 ['2', '2157', '2759']
335928 ['6', '99', '279', '152053', '204476', '556257']
1706371 ['10', '2425', '62100', '316625', '359337', '447467', '745153', '940550', '1050833', '1122283', '1122284', '1458243', '1792291', '1930860', '2036021', '2038244', '2752545']
203488 ['13']
32011 ['16', '404', '81682', '359407', '1679002', '2137425', '2676063', '2872156']
213421 ['18', '890', '37817', '57039']
76892 ['20', '75', '41275', '76890', '267929', '2052989', '2768684']
267890 ['22', '1573826', '2547964']
28221 ['29', '69541', '213113', '213115', '213118', '213462', '453227', '1779134', '2597219', '2814222', '2819262', '2844874', '2844876']
80811 ['31', '39', '1524213', '1524215']


In [64]:
def get_children(p, paren_child, child_list):
    """Recursively collect all descendants of `p` into `child_list`.

    Args:
        p: tax_id to start from.
        paren_child: dict mapping parent tax_id -> list of child
            tax_ids. It is only read, never modified.
        child_list: accumulator list. It is extended in place and also
            returned.

    Returns:
        `child_list`. For every visited node that has children, its
        children are added followed by an empty-string marker (kept for
        debugging purposes).
    """
    print(p)
    if p in paren_child:
        children = paren_child[p]
        print("", children)

        # Add the debugging marker to a copy. Appending to the list held
        # in paren_child would grow the shared dictionary on every call.
        c = children + [""]

        child_list.extend(c)
        for a_c in c:
            get_children(a_c, paren_child, child_list)
    else:
        print(" NO CHILD")
    return child_list

In [65]:
# Spot-check: show the direct children recorded for one parent taxon.
test_id1 = "980083" # Camelineae
paren_child[test_id1]

['3701',
 '3718',
 '71323',
 '98022',
 '98032',
 '106772',
 '165360',
 '264404',
 '359815']

In [66]:
# Collect the descendants of test_id1, starting from an empty accumulator.
child_list = get_children(test_id1, paren_child, [])

980083
 ['3701', '3718', '71323', '98022', '98032', '106772', '165360', '264404', '359815']
3701
 NO CHILD
3718
 NO CHILD
71323
 NO CHILD
98022
 NO CHILD
98032
 NO CHILD
106772
 NO CHILD
165360
 NO CHILD
264404
 NO CHILD
359815
 NO CHILD

 NO CHILD


In [67]:
# Inspect the list returned by get_children for test_id1.
child_list

['3701',
 '3718',
 '71323',
 '98022',
 '98032',
 '106772',
 '165360',
 '264404',
 '359815',
 '']

In [68]:
# Second spot-check, on a parent whose children have children of their own.
test_id2 = "3700" # Brassicaceae
paren_child[test_id2]

['947472',
 '947473',
 '947474',
 '947475',
 '947476',
 '947477',
 '947488',
 '947489',
 '947490',
 '947491',
 '947492',
 '980082',
 '980083',
 '980085',
 '980086',
 '980115',
 '980193',
 '981066',
 '981067',
 '981068',
 '981069',
 '981070',
 '981071',
 '981096',
 '981097',
 '981098',
 '981099',
 '981100',
 '981105',
 '981107',
 '981119',
 '981120',
 '981121',
 '1248466',
 '1394035',
 '1394062',
 '1394172',
 '1394177',
 '1394190',
 '1394316',
 '1394318',
 '1394505',
 '1394509',
 '1394514',
 '1394515',
 '1394713',
 '1394714',
 '1394715',
 '1394716',
 '1394717',
 '1394721',
 '1520941',
 '1920184']

In [69]:
# Collect the descendants of test_id2, starting from an empty accumulator.
child_list2 = get_children(test_id2, paren_child, [])

3700
 ['947472', '947473', '947474', '947475', '947476', '947477', '947488', '947489', '947490', '947491', '947492', '980082', '980083', '980085', '980086', '980115', '980193', '981066', '981067', '981068', '981069', '981070', '981071', '981096', '981097', '981098', '981099', '981100', '981105', '981107', '981119', '981120', '981121', '1248466', '1394035', '1394062', '1394172', '1394177', '1394190', '1394316', '1394318', '1394505', '1394509', '1394514', '1394515', '1394713', '1394714', '1394715', '1394716', '1394717', '1394721', '1520941', '1920184']
947472
 ['264421', '282586', '282598', '2742424']
264421
 NO CHILD
282586
 NO CHILD
282598
 NO CHILD
2742424
 NO CHILD

 NO CHILD
947473
 NO CHILD
947474
 ['358660']
358660
 NO CHILD

 NO CHILD
947475
 ['3723', '359895', '368980', '369005', '369021', '369049', '369053', '682715', '743981', '1394143']
3723
 NO CHILD
359895
 NO CHILD
368980
 NO CHILD
369005
 NO CHILD
369021
 NO CHILD
369049
 NO CHILD
369053
 NO CHILD
682715
 NO CHILD
743981


In [70]:
# Inspect the list returned by get_children for test_id2.
child_list2

['947472',
 '947473',
 '947474',
 '947475',
 '947476',
 '947477',
 '947488',
 '947489',
 '947490',
 '947491',
 '947492',
 '980082',
 '980083',
 '980085',
 '980086',
 '980115',
 '980193',
 '981066',
 '981067',
 '981068',
 '981069',
 '981070',
 '981071',
 '981096',
 '981097',
 '981098',
 '981099',
 '981100',
 '981105',
 '981107',
 '981119',
 '981120',
 '981121',
 '1248466',
 '1394035',
 '1394062',
 '1394172',
 '1394177',
 '1394190',
 '1394316',
 '1394318',
 '1394505',
 '1394509',
 '1394514',
 '1394515',
 '1394713',
 '1394714',
 '1394715',
 '1394716',
 '1394717',
 '1394721',
 '1520941',
 '1920184',
 '',
 '264421',
 '282586',
 '282598',
 '2742424',
 '',
 '358660',
 '',
 '3723',
 '359895',
 '368980',
 '369005',
 '369021',
 '369049',
 '369053',
 '682715',
 '743981',
 '1394143',
 '',
 '238894',
 '281479',
 '359839',
 '537171',
 '537177',
 '743986',
 '',
 '97987',
 '97997',
 '98018',
 '253185',
 '264430',
 '282601',
 '345536',
 '358675',
 '359830',
 '359837',
 '359882',
 '359887',
 '359891',
 

## Continue development

* For the taxonomy keywords: script_get_plant_taxa.py
* For getting only plant articles: script_parse_pubmed.py