# __Step 7: Get country info__

Goal
- Get country info out of each doc
- Get # of docs per country
- Get # of docs per continent
- Get # of docs per country over time
- Get # of docs per country per topic
- Get # of docs per country per topic over time

Issues:
- 2/20/23: 
  - The corpus dataset from 2_5_predict_pubmed does not have author or affiliation info. This needs to be done from the very beginning when I process the pubmed records.
  - In 
[MEDLINE/PubMed Data Element (Field) Descriptions](https://www.nlm.nih.gov/bsd/mms/medlineelements.html), there are several important info:
    - The affiliation of the authors, corporate authors and investigators appear in this repeating field.
      - 1988- The address of the first author's affiliation is included. The institution, city, and state including zip code for U.S. addresses, and country for countries outside of the United States, are included if provided in the journal; sometimes the street address is also included if provided in the journal.
      - 1995-2013 The designation USA is added at the end of the address when the first author's affiliation is in the fifty United States or the District of Columbia.
        - Q: Does this mean that this is not done for records before 1995?
      - 1996- The primary author's electronic mail (e-mail) address is included at the end of the Affiliation field, if present in the journal.
      - 2003- The complete first author address is entered as it appears in the article with no words omitted.
      - October 2013- Quality control of this field ceased in order to accommodate the affiliations for all authors and contributors.
      - December 2014- Multiple affiliations for each author or contributor are included.
        - __Because of this, only 1st author info is considered.__
  - For dealing with countries, there is the issue of historical country names, see [ISSO_3166-3](https://en.wikipedia.org/wiki/ISO_3166-3)

## ___Set up___

### Module import

In [147]:
import pickle, nltk, re, multiprocessing, pycountry, zipcodes, us
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from Bio import Entrez, Medline
from time import sleep
from uszipcode import SearchEngine

'''
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix, dok_matrix
from time import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import OrderedDict, Counter
from bisect import bisect
from mlxtend.preprocessing import minmax_scaling
from copy import deepcopy
'''

'\nfrom scipy.sparse import csr_matrix, lil_matrix, coo_matrix, dok_matrix\nfrom time import time\nfrom datetime import datetime\nfrom dateutil.relativedelta import relativedelta\nfrom collections import OrderedDict, Counter\nfrom bisect import bisect\nfrom mlxtend.preprocessing import minmax_scaling\nfrom copy import deepcopy\n'

### Key variables

In [2]:
# Reproducibility
seed = 20220609

# Setting working directory
proj_dir   = Path.home() / "projects/plant_sci_hist"
work_dir   = proj_dir / "7_countries"
work_dir.mkdir(parents=True, exist_ok=True)

# plant science corpus with date and other info
dir2        = proj_dir / "2_text_classify//2_5_predict_pubmed"
corpus_file = dir2 / "corpus_plant_421658.tsv.gz"

# timestamp bins
dir44            = proj_dir / "4_topic_model/4_4_over_time"
ts_for_bins_file = dir44 / "table4_4_bin_timestamp_date.tsv"

# So PDF is saved in a format properly
mpl.rcParams['pdf.fonttype'] = 42
plt.rcParams["font.family"] = "sans-serif"

## ___Get PubMed records___

### Read plant science corpus

In [None]:
corpus = pd.read_csv(corpus_file, compression='gzip', sep='\t')

In [None]:
corpus.head(2)

Unnamed: 0.1,Unnamed: 0,PMID,Date,Journal,Title,Abstract,QualifiedName,txt,reg_article,y_prob,y_pred
0,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,1
1,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,1


In [6]:
# Get all PMIDs
pmids = corpus.PMID.values
pmids.shape

(421658,)

### Get Pubmed docs using PMIDs


In [30]:
#https://stackoverflow.com/questions/59267992/biopython-how-to-download-all-of-the-peptide-sequences-or-all-records-associat

Entrez.email = 'shius@msu.edu'

id_list  = [str(pmid) for pmid in pmids]
post_xml = Entrez.epost(db='pubmed', id=','.join(id_list))
results  = Entrez.read(post_xml)
webenv   = results['WebEnv']
qkey     = results['QueryKey']

In [36]:
#http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec166

step    = 10000
for begin in tqdm(range(0, len(pmids), step)):
  # first check if this file is present
  medline_file = work_dir / f"corpus_plant_421658_medline_{begin}.pickle"
  if medline_file.is_file():
    continue

  # file does not exist
  subset   = pmids[begin:begin+step]

  # Get Medline records for subset
  handle  = Entrez.efetch(db='pubmed', id=subset, rettype='medline', 
                          retmode='text', webenv=webenv, query_key=qkey)
  records  = Medline.parse(handle)
  rec_list = list(records)

  with open(medline_file, "wb") as f:
    pickle.dump(rec_list, f)


100%|██████████| 43/43 [10:20<00:00, 14.44s/it] 


### Process PubMed Medline docs

In [54]:
# Read individuial pickle files and compile the full list
all_rec = []
for begin in tqdm(range(0, len(pmids), step)):
  medline_file = work_dir / f"corpus_plant_421658_medline_{begin}.pickle"
  with open(medline_file, "rb") as f:
    rec_list = pickle.load(f)
  all_rec.extend(rec_list)

# The number of docs don't add up. Some records are not downloaded
len(all_rec)

100%|██████████| 43/43 [00:36<00:00,  1.17it/s]


421585

### Check what's missing

In [55]:
# Go thorugh all downloaded docs and get PMIDs
def check_missing(pmids, all_rec):
  '''
  Args:
    pmids (list): list of integer PMIDs
    all_rec (list): list of dictionary of medline records
  Return:
    id_list_missed (list): list of items in pmids but not all_rec
  '''

  # Downloaded
  pmids_dn = []
  for rec in tqdm(all_rec):
    pmids_dn.append(int(rec['PMID']))
  
  # Compare lists
  #https://stackoverflow.com/questions/15455737/python-use-set-to-find-the-different-items-in-list
  print("differnce:",len(pmids)-len(pmids_dn))

  pmids_ori_set = set(pmids)
  pmids_dn_set  = set(pmids_dn)
  missing = pmids_ori_set - pmids_dn_set
  print("# missing:", len(missing))

  id_list_missed = [str(pmid) for pmid in missing]

  return id_list_missed

In [57]:
# Get the missing records and add to all_rec
id_list_missed = check_missing(pmids, all_rec)

# Get Medline records for subset straight without epost
handle  = Entrez.efetch(db='pubmed', id=id_list_missed, rettype='medline', 
                        retmode='text')
records  = Medline.parse(handle)
rec_list = list(records)

# Can only get 41, so some still missing
print("Retrieved:", len(rec_list))

100%|██████████| 421585/421585 [00:00<00:00, 1032346.45it/s]


differnce: 73
# missing: 72
Retrieved: 41


In [61]:
# Save the missing records as pickle
medline_file = work_dir / "corpus_plant_421658_medline_missed.pickle"

with open(medline_file, "wb") as f:
  pickle.dump(rec_list, f)

In [58]:
# Add to all_rec, then check again
all_rec.extend(rec_list)

100%|██████████| 421626/421626 [00:00<00:00, 1136620.95it/s]


differnce: 32
# missing: 31


['33292416',
 '33098629',
 '33082138',
 '31186333',
 '33292726',
 '32545091',
 '25764421',
 '25764422',
 '25764423',
 '25764424',
 '25764425',
 '25764426',
 '25764427',
 '25764428',
 '25764429',
 '32333389',
 '25764431',
 '25764432',
 '25764436',
 '25764437',
 '25764439',
 '33067356',
 '32764131',
 '33380717',
 '33380719',
 '25830899',
 '33380725',
 '33380726',
 '33380728',
 '17342585',
 '33064827']

In [60]:
still_missing = check_missing(pmids, all_rec)
len(still_missing)

100%|██████████| 421626/421626 [00:00<00:00, 1164415.95it/s]


differnce: 32
# missing: 31


31

## ___Search for country codes___

### Set up country dictionaries

In [133]:
# Build {country_name or official_name: alpha_3 code}
countries   = list(pycountry.countries)
cname_to_a3 = {}

for country in countries:
  name_a2    = country.alpha_2
  name_a3    = country.alpha_3
  name_short = country.name

  cname_to_a3[name_a2] = name_a3 # store this for situation like US
  cname_to_a3[name_a3] = name_a3 # store this for sitiation like USA
  cname_to_a3[name_short] = name_a3
  
  # put official name in
  try:
    name_offic = country.official_name
    cname_to_a3[name_offic] = name_a3
  except AttributeError:
    #print("No official name:", name_short)
    name_offic = "NA"

In [138]:
# Also build a dictionary for historical countries
countries_hist = list(pycountry.historic_countries)
cname_hist_to_a3 = {}

for country in countries_hist:

  # the name in historical countries are the official names
  name_offic = country.name
  cname_hist_to_a3[name_offic] = name_a3
  
  name_short = name_offic.split(",")[0]
  cname_hist_to_a3[name_short] = name_a3


In [159]:
# For weird stuff
suppl_dict = {"Academia Sinica":"China", "UK":"GBR", "The Netherlands":"NLD"}

### Search for country info

In [164]:
# Without country a3
# Before checking for US state: 24867

country_info = {} # {pmid:[first_AU, first_AD, alpha_3]}

count = 0
for rec in tqdm(all_rec):
  pmid = rec["PMID"]
  a3   = "NA" # set default value

  # Deal with AU info
  try:
    AU = rec["AU"]
  except KeyError:
    AU = "NA"  

  # Deal with AD info
  try:
    AD      = rec["AD"]
    tokens  = AD[0][:-1].split(", ")

    # The last token contain email address, not country
    if len(tokens) < 2:
      #print(tokens)
      pass
    elif tokens[-1].find("@") != -1:
      country = tokens[-2]
    else:
      country = tokens[-1]

    # in current contru name
    if country in cname_to_a3:
      a3 = cname_to_a3[country]

    # historical country name
    elif country in cname_hist_to_a3:
      a3 = cname_hist_to_a3[country]
    # manually defined
    elif country in suppl_dict:
      a3 = cname_to_a3[suppl_dict[country]]
    elif len(country.split(" ")) == 2:
      # Deal with those with "city zip_code" by checking if it is a US zip code
      # e.g., Columbus 43210
      #https://uszipcode.readthedocs.io/
      [state, zip_code] = country.split(" ")
      if zip_code.find("-") != -1:
        zip_code = zip_code.split("-")[0]

      # check state
      if us.states.lookup(state) is not None: 
        a3 = "USA"
      # check zip
      elif zip_code.isdigit():
        # With this, somehow 02167 is not found, actually, go to USPS, this is
        # not found either
        sr = SearchEngine()
        z = sr.by_zipcode(zip_code)
        if z is not None:
          a3 = "USA"
      else:
        # UNKNOWN THAT NEED TO BE DEALT WITH
        #print(country)
        #break
        pass
    else:
      # UNKNOWN THAT NEED TO BE DEALT WITH
      pass

  except KeyError:
    AD = "NA"

  #if a3 == "NA" and AD != "NA":
  #  print(AD)
  #  break
  if a3 == "NA": count += 1
  country_info[pmid] = [AU, AD, a3]

print("No country code:", count)

100%|██████████| 421626/421626 [00:53<00:00, 7907.39it/s] 

No country code: 137886





## ___Put country code info into corpusm___

### Get continent info

In [None]:
#https://stackoverflow.com/questions/55910004/get-continent-name-from-country-using-pycountry



## ___Test___

In [73]:
list(pycountry.countries)[0]

Country(alpha_2='AW', alpha_3='ABW', flag='🇦🇼', name='Aruba', numeric='533')

In [85]:
rec = all_rec[4004]
au = rec["AU"]
ad = rec["AD"]
au, ad

(['Fouly HM', 'Domier LL', "D'Arcy CJ"],
 ['Department of Plant Pathology, University of Illinois, Urbana 61801.'])

In [87]:
pycountry.subdivisions.lookup("Urbana")

LookupError: Could not find a record for 'urbana'

In [131]:
country = pycountry.countries.get(name="Yugoslavia")
print(country)

None


In [136]:
country = pycountry.historic_countries.get(name="Yugoslavia")
print(country)

None


In [137]:
list(pycountry.historic_countries)

[Country(alpha_2='AI', alpha_3='AFI', alpha_4='AIDJ', name='French Afars and Issas', numeric='262', withdrawal_date='1977'),
 Country(alpha_2='AN', alpha_3='ANT', alpha_4='ANHH', name='Netherlands Antilles', numeric='530', withdrawal_date='1993-07-12'),
 Country(alpha_2='BQ', alpha_3='ATB', alpha_4='BQAQ', name='British Antarctic Territory', withdrawal_date='1979'),
 Country(alpha_2='BU', alpha_3='BUR', alpha_4='BUMM', name='Burma, Socialist Republic of the Union of', numeric='104', withdrawal_date='1989-12-05'),
 Country(alpha_2='BY', alpha_3='BYS', alpha_4='BYAA', name='Byelorussian SSR Soviet Socialist Republic', numeric='112', withdrawal_date='1992-06-15'),
 Country(alpha_2='CS', alpha_3='CSK', alpha_4='CSHH', name='Czechoslovakia, Czechoslovak Socialist Republic', numeric='200', withdrawal_date='1993-06-15'),
 Country(alpha_2='CS', alpha_3='SCG', alpha_4='CSXX', name='Serbia and Montenegro', numeric='891', withdrawal_date='2006-06-05'),
 Country(alpha_2='CT', alpha_3='CTE', alpha_

In [141]:
sr = SearchEngine()
z = sr.by_zipcode("02167")
print(z)

None


In [146]:
exact_zip = zipcodes.matching('02167')
print(exact_zip)

[]


In [150]:
print(us.states.lookup('bleh'))

None
