# __Step 7.1c Assess address without country code__

## ___Setup___

In [1]:
import pickle
from pathlib import Path

In [3]:
# Seed kept for reproducibility (it is not consumed by the cells shown here)
seed = 20220609

# Folder layout: <home>/projects/plant_sci_hist/7_countries/7_1c_assess_na
proj_dir = Path.home().joinpath("projects", "plant_sci_hist")
work_dir = proj_dir.joinpath("7_countries")
step_dir = work_dir.joinpath("7_1c_assess_na")

# Create the folder for this step, including any missing parent folders
step_dir.mkdir(parents=True, exist_ok=True)

## ___Functions & variables from step 7.1___

In [131]:
def get_location_str(AD, token_idx=-1, debug=0):
  '''Get the potential location string from AD

  Only the first element of AD (1st author) is parsed. A trailing author list
  in parentheses, email addresses, and ISNI identifiers (with anything after
  them) are cut off. The remaining address is split on "," (or on " " if it
  has no comma), and the token at token_idx is returned after a few manual
  clean-ups (Iran, ROC, "- ", "/").

  Args:
    AD (list of str): The content in the AD field, one address per element
    token_idx (int): index of the address token to return, e.g., -1 for the
      last token, -2 for the second to last. Note that 0 returns the first
      token, not the whole address.
    debug (int): if non-zero, print the intermediate strings
  Return
    location (str): the string that likely contain location info. "NA" if AD
      is empty, if nothing is left after parsing, or if token_idx is out of
      range.
    errflag (int or str): 1 if the AD info is empty (or nothing is left after
      parsing) and thus erroneous, "email" if an "@" was still in the address
      after the "E-mail" markers were handled, 0 otherwise
  '''

  # No address at all, so there is nothing to parse
  if not AD:
    return "NA", 1

  # The first element in the AD list is used (1st author)
  add_str = AD[0]

  if debug: print("add_str:", add_str)

  if add_str == "":
    return "NA", 1

  errflag = 0

  # Multiple authors:
  # ['From xxx, xxx, xxx, xxx, xxx (Ranade, Ganea, Razzak, and Garcia Gil)']
  # Another case:
  # [['xxx, xx, xxx, Maryland 20742 (M.H., G.F.D.).']
  if add_str[-1] == ")" or add_str[-2:] == ").":
    leftMargin = add_str.rfind(" (")
    # Without a " (" there is no parenthetical to drop. Slicing with -1 would
    # silently chop the last character, so leave the string alone.
    if leftMargin != -1:
      add_str = add_str[:leftMargin]

  # Email field: Was contemplating using email for country, but some 1st
  # author emails are not the same as for the institution.

  # 18930883, loc:China E-mail: suzhi1026@163.com
  # Some have ". Email:..."
  marker = add_str.find("E-mail:")
  if marker != -1:
    # Use space as delimiter, then the empty space is taken care of later
    leftMargin = add_str.rfind(" ", 0, marker)
    # No space before the marker: cut right at the marker instead
    if leftMargin == -1:
      leftMargin = marker
    add_str = add_str[:leftMargin]
    if debug: print("found: 'E-mail:", add_str)

  # 22016614, loc:Poland; E-Mails: agnieszka.pszczolkowska@uwm.edu.pl (A.P.); macieklojko@wp.p
  # 22072902, loc:China; E-Mail: yiruizao@163.com
  marker = add_str.find("; E-Mail")
  if marker != -1:
    add_str = add_str[:marker]
    if debug: print("found: '; E-Mail", add_str)

  # Below are more example for using @ field for parsing.
  #   case 1: 17296497 ['xxx, Denmark. blah@aki.ku.dk <blah@aki.ku.dk>']
  #   case 2: ?? ['Institut ..., France. achmustilli@libero.it']
  # The next one is weird, look like there is something not parsed properly
  #   case 3: 17632571 ['xxx, xxx, UK. ib 103@mole.bio.cam.ac.uk']
  # Also the next one, so cannot use "." as delimiter first.
  #   case 4: 18613594 ['xxx, ACT2601 Australia. rod.mahon@csiro.au']
  # Some are missing the space between country and email.
  #   case 5: 9931476 ['PO Box 12, Rehovot 76100, Israel.cohenk@agri.huji.ac.il']
  marker = add_str.find("@")
  if marker != -1:
    errflag = "email"

    # Text before the 1st email address
    tmp_str = add_str[:marker]

    # "." is regularly used in email addresses, so use the space delimiter
    # first, then "."
    space_idx = tmp_str.rfind(" ")
    if space_idx != -1:
      tmp_str = tmp_str[:space_idx]          # this takes care of case 4
      # If no "." or "," is found, keep the text before the email token
      fallback = space_idx
    else:
      # The text before "@" is a single token, i.e., the email itself
      fallback = 0
    leftMargin = tmp_str.rfind(".")          # this takes care of case 3
    if leftMargin == -1:                     # this takes care of case 5
      leftMargin = tmp_str.rfind(",")
    if leftMargin == -1:
      leftMargin = fallback
    add_str = add_str[:leftMargin]
    if debug: print("found: '@", add_str)

  # ISNI code in ~7k records
  #   30263677 ['27601 Republic of Korea. ISNI: 0000 0004 1775 9398. GRID: grid.444122.5'
  marker = add_str.find("ISNI:")
  if marker != -1:
    leftMargin = add_str.rfind(".", 0, marker)
    # No "." before the marker: cut right at the marker instead
    if leftMargin == -1:
      leftMargin = marker
    add_str = add_str[:leftMargin]

  # Strip empty space before and after
  add_str = add_str.strip()

  if debug: print("final add_str:",[add_str])

  # Get rid of a trailing "." before checking for emptiness, so that an
  # address that is just "." is flagged as erroneous too.
  if add_str.endswith("."):
    add_str = add_str[:-1]

  # Some just have email address so after parsing, add_str is "".
  #   25548975: '. kehrig@pharmazie.uni-kiel.de.'
  if add_str == "":
    loc = "NA"
    errflag = 1
  else:
    # Originally split with ", " then "." but there are edge cases like this:
    #   17444520, loc:Hsinchu,Taiwan
    # So split with ",", if it does not exist, split with " "
    if "," in add_str:
      # rid of space if present
      tokens = [token.strip() for token in add_str.split(",")]
    else:
      # Only one large token, split with space instead
      tokens = add_str.split(" ")

    # token_idx may point beyond the available tokens (e.g., -2 with 1 token)
    try:
      loc = tokens[token_idx]
    except IndexError:
      loc = "NA"

    # 19140172, loc:IR Iran, there are other variations. OpenStreetMap cannot
    # find these so deal with them manually.
    if loc.endswith(" Iran"):
      loc = loc[loc.find("Iran"):]

    # 19651701, loc:Taiwan ROC
    if loc.endswith(" ROC"):
      loc = loc[:loc.rfind(" ")]

    # 21299880, loc:DF- 70770-917 - Brasil
    if loc.find("- ") != -1:
      loc = loc.split("- ")[-1]

    # 1915409, loc:Stuttgart/Bundesrepublik Deutschland
    if loc.find("/") != -1:
      loc = loc.split("/")[-1]

  if debug: print(loc)

  return loc, errflag

In [54]:
# Manual overrides for location strings that caused issues: each maps a
# location name to a three-letter country code.
suppl_dict = {
  "UK": "GBR",
  "The Netherlands": "NLD",
  "Taiwan": "TWN",
  "Republic of China": "TWN",
  "the Netherlands": "NLD",
}

## ___After Nominatim North American run___

### Load not found dictionary pickle

In [36]:
# Pickled dictionary of the "not found" records, stored in work_dir
nominatim_na_nf_file = work_dir.joinpath("country_info-nominatim_na_NF.pickle")

# NOTE: unpickling can execute arbitrary code, so only load pickle files
# generated by this project.
with nominatim_na_nf_file.open("rb") as f:
  nominatim_na_nf = pickle.load(f)

In [132]:
c = 0
debug = 0

# Go through each record and show the location string parsed from the last
# token of the 1st author address.
for pmid, record in nominatim_na_nf.items():
  # The address list is the 2nd element of each record
  AD = record[1]

  loc, errflag = get_location_str(AD, token_idx=-1, debug=debug)
  print(f"pmid:{pmid}, loc:{loc}")



pmid:17296497, loc:Denmark
pmid:17315072, loc:Japan
pmid:17318595, loc:Germany
pmid:17327258, loc:305-8602 Japan
pmid:17328932, loc:Japan
pmid:17334790, loc:France
pmid:17336055, loc:Sweden
pmid:17337024, loc:Norway
pmid:17340141, loc:Japan
pmid:17340237, loc:D-8400 Regensburg
pmid:17340291, loc:D-1000 Berlin 12
pmid:17340323, loc:D-6900 Heidelberg 1
pmid:17340334, loc:D-5000 Koln 41
pmid:17340345, loc:D-8000 Munchen 2
pmid:17340523, loc:CSSR
pmid:17342586, loc:D-1000 Berlin 33
pmid:17342590, loc:A-1090 Wien
pmid:17343885, loc:Italy
pmid:17345262, loc:Pakistan
pmid:17345272, loc:NL-2300 RA Leiden
pmid:17345277, loc:Pakistan
pmid:17357447, loc:Ya'an Sichuan 625014
pmid:17364070, loc:Moscow
pmid:17364917, loc:Madrid 28040 Spain
pmid:17365182, loc:110016. China
pmid:17367744, loc:Poland
pmid:17371761, loc:UK
pmid:17379695, loc:739-8528 Japan
pmid:17385505, loc:Iran 31587-77871
pmid:17385506, loc:Iran 31587-77871
pmid:17385546, loc:Bulgarian Academy of Sciences
pmid:1738601, loc:Epalinges


### Edge cases

In [45]:
# pmid:17365182, loc:110016. China
# A "." instead of a "," separates the postal code from the country, so the
# last comma-delimited token still contains the postal code.
nominatim_na_nf["17365182"][1]

['Research Department of Natural Medicine, Shenyang Pharmaceutical University. Shenyang, 110016. China.']

In [46]:
# OpenStreetMap actually returns a false positive (Austria) for this address,
# but it should be Czech.
nominatim_na_nf["17401829"][1]

['Chemisches Institut der Medizinischen Fakultat, Palacky Universitat, Olomouc, Tschechoslowakei.']

In [49]:
# CSSR is the Czechoslovak Socialist Republic. Searching for it results in a
# false positive in France.
nominatim_na_nf["17401857"][1]

['Institute of Biophysics, Czechoslovak Academy of Sciences, Brno, CSSR.']

In [50]:
# Searching with the last token results in NA, the second to last token
# works, and the whole string results in NA.
nominatim_na_nf["17488843"][1]

['Institut Molekulare Botanik, Universitat Ulm, Albert-Einstein-Allee 11, 89069 Ulm.']

In [51]:
# The address has no city or country, only an institution name followed by an
# email address.
nominatim_na_nf["17552431"][1]

["Dieteticien Nutritionniste, I'Institut Paul Lambin (Haute Ecole Leonard de Vinci). n.guggenbuhl@brutele.be"]

In [52]:
# The email address contains a stray space ("ib 103@...").
nominatim_na_nf["17632571"][1]

['Division of Virology, Department of Pathology, University of Cambridge, Tennis Court Road, Cambridge CB2 1QP, UK. ib 103@mole.bio.cam.ac.uk']

['. kehrig@pharmazie.uni-kiel.de.', '. salban@pharmazie.uni-kiel.de.']

In [69]:
# No comma between the postal code and the country ("ACT2601 Australia"),
# and the address is followed by an email.
nominatim_na_nf["18613594"][1]

['CSIRO Entomology, GPO Box 1700 Canberra, ACT2601 Australia. rod.mahon@csiro.au']

In [70]:
# Addresses are followed by ISNI and GRID identifiers.
nominatim_na_nf["30263677"][1]

['1Department of Food and Fermentation, Far East University, 76-32 Daehak-gil, Gamgok, Eumseong, Chungbuk 27601 Republic of Korea. ISNI: 0000 0004 1775 9398. GRID: grid.444122.5',
 '2Department of Food and Nutrition, Hanyang University, 222 Wangsimni-ro, Seongdong-gu, Seoul, 04763 Republic of Korea. ISNI: 0000 0001 1364 9317. GRID: grid.49606.3d',
 '2Department of Food and Nutrition, Hanyang University, 222 Wangsimni-ro, Seongdong-gu, Seoul, 04763 Republic of Korea. ISNI: 0000 0001 1364 9317. GRID: grid.49606.3d',
 '2Department of Food and Nutrition, Hanyang University, 222 Wangsimni-ro, Seongdong-gu, Seoul, 04763 Republic of Korea. ISNI: 0000 0001 1364 9317. GRID: grid.49606.3d']

In [88]:
# The 1st author address is cut short ("... and."); only the last address in
# the list contains a location.
nominatim_na_nf["18627491"][1]

['Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Department of Forest Ecology and Management, Swedish University of Agricultural Sciences, SE-901 83 Umea, Sweden.']

In [100]:
# This is probably the worst possible case: several affiliations are joined
# with ";" into one single address string.
nominatim_na_nf["19704486"][1]

["The University of Manchester; Faculty of Life Sciences; Jackson's Mill; Manchester, United Kingdom; Aarhus University; Department of Biological Sciences; Aarhus, Denmark; The University of Alberta; Alberta, Edmonton Canada; AgroParisTech; ECOFOG; UMR CIRAD-CNRS-ENGREF-INRA-UAG; French Guiana, France; AgroParisTech; LERFOB; UMR ENGREF INRA 1092; Ecole Nationale du Genie Rural; France."]

In [101]:
# "; E-Mails:" followed by several email addresses, each with author initials
# in parentheses.
nominatim_na_nf["22016614"][1]

['Department of Diagnostics and Plant Pathophysiology, University of Warmia and Mazury, Plac Lodzki 5, 10-957, Olsztyn, Poland; E-Mails: agnieszka.pszczolkowska@uwm.edu.pl (A.P.); macieklojko@wp.pl (M. L.).']

In [106]:
# No space between the country and the email address ("Israel.cohenk@...").
nominatim_na_nf["9931476"][1]

['Otto-Warburg-Center for Agricultural Biotechnology, The Hebrew University of Jerusalem, Faculty of Agriculture, PO Box 12, Rehovot 76100, Israel.cohenk@agri.huji.ac.il']

In [111]:
# Another "; E-Mails:" record, this one with three email addresses.
nominatim_na_nf["21339975"][1]

['Biotechnology Research Centre, Malaysian Agricultural Research and Development Institute, P.O Box 12301, General Post Office, 50774 Kuala Lumpur, Malaysia; E-Mails: hamidunb@yahoo.com (H.B.); rohaiza@mardi.gov.my (R.A.R.); indu@mardi.gov.my (I.B.S.J.).']

In [134]:
# Parenthesized abbreviation "(T.N.)" right before the country, followed by
# an email address.
nominatim_na_nf["20184044"][1]

['School of Biosciences and Technology, Vellore Institute of Technology (VIT) University, Vellore-632 014, (T.N.) India. deepakganjawala73@yahoo.com']

In [135]:
# 22303203, loc:Facsimile: (301) 838 0208
# A fax number and an email address follow the address, and there is no
# country name.
nominatim_na_nf["22303203"][1]

['The Institute for Genomic Research, 9712 Medical Center Drive, Rockville MD 20850, , Facsimile: (301) 838 0208, rbuell@tigr.org.']