# __Step 7.1c Assess address without country code__

## ___Setup___

In [86]:
import pickle
from pathlib import Path

In [87]:
# Reproducibility
seed = 20220609

proj_dir   = Path.home() / "projects/plant_sci_hist"
work_dir   = proj_dir / "7_countries/7_1_parse_countries"

step_dir = work_dir / "7_1c_assess_na"
step_dir.mkdir(parents=True, exist_ok=True)

## ___Functions & variables from step 7.1___

### Get location string

In [88]:
def get_location_str(add_str, token_idx=-1, debug=0):
  '''Get the potential location string from AD
  Args:
    add_str (str): The content in the 1st AD element (1st author address)
    token_idx (int): -1, -2, or 0 (whole thing)
  Return
    location (str): the string that likely contain location info
    errflag (int): the AD info is empty and thus erroneous (1) or not (0)
  '''

  if debug: print("add_str:", add_str)

  # But there are 12 records where the AD field looks like:
  # ['.', '.', '.', '.', '.', '.']
  # So tokens will be "", dealt with in the if-else statement below.
  if add_str == "":
    loc = "NA"
    errflag = 1

  else:
    errflag = 0
    # Multipe authors:
    # ['From xxx, xxx, xxx, xxx, xxx (Ranade, Ganea, Razzak, and Garcia Gil)']
    # Another case:
    # [['xxx, xx, xxx, Maryland 20742 (M.H., G.F.D.).']
    if add_str[-1] == ")" or add_str[-2:] == ").":
      leftMargin = add_str.rfind(" (")
      add_str = add_str[:leftMargin]

    # Email field: Was contemplating using email for country, but some 1st
    # author emails are not the same as for the institution (see 2nd example).

    # 18930883, loc:China E-mail: suzhi1026@163.com
    # Some have ". Email:..."
    if add_str.find("E-mail:") != -1:
      tmp_str = add_str[:add_str.find("E-mail:")]
      # Use space as delimiter, then the empty space is taken care of later
      leftMargin = tmp_str.rfind(" ")
      add_str = add_str[:leftMargin]
      if debug: print("found: 'E-mail:", add_str)

    # 22016614, loc:Poland; E-Mails: agnieszka.pszczolkowska@uwm.edu.pl (A.P.); macieklojko@wp.p
    # 22072902, loc:China; E-Mail: yiruizao@163.com
    elif add_str.find("; E-Mail") != -1:
      add_str = add_str[:add_str.find("; E-Mail")]
      if debug: print("found: '; E-Mail", add_str)

    # 24866837:Fax: (+31) 50-3636440  <-- AG Groningen (The Netherlands), Fax:
    elif add_str.find("Fax:") != -1:
      add_str = add_str[:add_str.find("Fax:")]
      if debug: print("found: 'Fax:", add_str)

    # Below are more example for using @ field for parsing.
    #   case 1: 17296497 ['xxx, Denmark. blah@aki.ku.dk <blah@aki.ku.dk>']
    #   case 2: ?? ['Institut ..., France. achmustilli@libero.it']
    # The next one is weird, look like there is something not parsed properly
    #   case 3: 17632571 ['xxx, xxx, UK. ib 103@mole.bio.cam.ac.uk']
    # Also the next one, so cannot use "." as delimiter first.
    #   case 4: 18613594 ['xxx, ACT2601 Australia. rod.mahon@csiro.au']
    # Ok, some is missing space between country and email... Man...
    #   case 5: 9931476 ['PO Box 12, Rehovot 76100, Israel.cohenk@agri.huji.ac.il']
    if add_str.find("@") != -1:

      # Find where the 1st email address is and generate a temp_str
      tmp_str = add_str[:add_str.find("@")]
      
      # Originally using space, but the 3rd example shows that it is not good.
      # But then "." is regularly used in email address. So do space delimter
      # first, then use "."
      tmp_str = tmp_str[:tmp_str.rfind(" ")] # this takes care of case 4
      leftMargin = tmp_str.rfind(".")        # this takes care of case 3
      if leftMargin == -1:                   # this takes care of case 5
        leftMargin = tmp_str.rfind(",")
      add_str = add_str[:leftMargin]
      if debug: print("found: '@", add_str)

    # ISNI code in ~7k records
    #   30263677 ['27601 Republic of Korea. ISNI: 0000 0004 1775 9398. GRID: grid.444122.5'
    if add_str.find("ISNI:") != -1:
      tmp_str = add_str[:add_str.find("ISNI:")]
      leftMargin = tmp_str.rfind(".")
      add_str = add_str[:leftMargin]

    # Strip empty space before
    add_str = add_str.strip()

    if debug: print("final add_str:",[add_str])

    # Some just have email address so after parsing, add_str is "".
    #   25548975: '. kehrig@pharmazie.uni-kiel.de.'
    if add_str == "":
      loc = "NA"
      errflag = 1
    # if tokens ends with ".", rid of it
    else:
      if add_str[-1] == ".": 
        add_str = add_str[:-1]

      # Originally splot with ", " then "." but there are edge cases like this:
      #   17444520, loc:Hsinchu,Taiwan
      # So split with ",", if it does not exist, split with " "
      if "," not in add_str:
        # Only one large token, split with space instead
        tokens = add_str.split(" ")
        try:
          loc = tokens[token_idx]
        except IndexError:
          loc = "NA"
          errflag = 1
      else:
        tokens = add_str.split(",")
        try:
          # rid of space if present
          loc = tokens[token_idx].strip()
        except IndexError:
          loc = "NA"
          errflag = 1

      # More edge cases with "(" and some with ")", examples:
      # 18636686:47023 Cesena (FC) Italy
      # 19704524:Ibaraki Japan; xxx (B & PMP); xxx; xxx; Montpellier France
      # 21665592:B-1860 Meise (Belgium);
      # 24828308:Japan (K.S	
      

      # 19140172, loc:IR Iran, there are other variations. OpenStreeMap cannot
      # find these so deal with them manually.
      if loc.endswith(" Iran"):
        loc = loc[loc.find("Iran"):]

      # 19651701, loc:Taiwan ROC
      if loc.endswith(" ROC"):
        loc = loc[:loc.rfind(" ")]

      # 21299880, loc:DF- 70770-917 - Brasil
      if loc.find("- ") != -1:
        loc = loc.split("- ")[-1]

      # 1915409, loc:Stuttgart/Bundesrepublik Deutschland
      if loc.find("/") != -1:
        loc = loc.split("/")[-1]

    if debug: print(loc)

  return loc, errflag

### Call monimatim

In [89]:
def call_geolocator(geolocator, AD, token_idx, suppl_dict, debug=0):
  '''Subrontine for calling geolocator
  Args
    AD (list): A list of addresses for authors
    token_idx (int): define which token the location string should be obtained
      default to -1 which is typically where the broadest info (e.g., city,
      zip code) is located. If this does not work, will try -2 token, or 0
      which means the entire AD string will be used for geolocator search.
  Return:
    a3 (string): the a3 country code, if not found, return empty string.
    geo (geolocator): the object returned from the search
    err_str (string): If an exception is thrown, this is the error string
  '''
  # Get location string
  loc, errflag   = get_location_str(AD, token_idx)

  if debug: print(loc, errflag)

  # This happens when there is only one token delimited by "," but the token_idx
  # is set to -2
  if errflag == 1:
    return "", None, "Only_1_token"

  # Call Nominatim to get a response:
  err_str = "NO_ERR"
  try:
    geo  = geolocator.geocode(loc, language='en')
  except Exception as ex:
    err_str = str(ex)
    geo = None

  if geo is not None:
    country = geo.raw['display_name'].split(", ")[-1]
    a3 = get_a3(country, cname_to_a3, cname_hist_to_a3, suppl_dict)
  else:
    a3 = "" 

  return a3, geo, err_str

In [90]:
def call_nominatim(nominatim_nf, not_found, suppl_dict, dir_pmid_log, dir_out,
                   sleep_time=0.5):
  '''Search entries against Nominatim server
  Args:
    nominatim_nf (dict): {pmid:[AU,AD]}, record still not found
    not_found (dict): {pmid:[AU, AD]}, for records with no a3 code after
      pycountry search
    suppl_dict (dict): regions that do not have proper pycountry info
    dir_pmid_log (Path): path to pmid log file of completed searches
    dir_out (Path): path to geolocator outputs
    sleep_time (int): time in second between queries.
  Return:
    nominatim_nf (dict): {pmid:[AU, AD]} for pmids with NA as search results
  '''

  # Access local Nominatim server
  geolocator = Nominatim(domain=f'localhost:8080', scheme='http')

  # Found info, because timeout keep happening, decide to save results as
  # things go.
  #nominatim_out = {} # {pmid:[AU, AD, a3, geo.raw]}

  # Info not found
  nominatim_nf  = {} # {pmid:[AU, AD]}

  # Because I keep getting time out, try to track what's working so I can
  # continue what what's not.
  # Create directory for output search result files
  dir_out.mkdir(parents=True, exist_ok=True)

  # Get the last pmid with result
  if not dir_pmid_log.is_file():
    out_names     = ""
    last_out_name = ""
    print("Starting with no output yet")
  else:
    with open(dir_pmid_log, "r") as f:
      out_names = f.readline()
      out_names_list = out_names.split(' ')
      out_names_list.sort()
      last_out_name = out_names_list[-1]
      print("Started, last_out_name:", last_out_name)

  # Save the log file again
  with open(dir_pmid_log, "w") as f:
    # Write the names aleady processed
    f.write(out_names)

    # sort pmids
    pmids = list(not_found.keys())
    pmids.sort()

    # determine where to restart
    if last_out_name == "":
      starting_idx = 0
    else:
      starting_idx = pmids.index(last_out_name)+1

    pmids_remaining = pmids[starting_idx:]

    # Go through records with no a3 info  
    for pmid in tqdm(pmids_remaining):

      err_str1, err_str2, err_str3 = "", "", ""

      # Get AU and AD
      [AU, AD] = not_found[pmid]

      # Get location string based on the first author's AD field
      a3, geo, err_str1 = call_geolocator(geolocator, AD, -1, suppl_dict)

      # Not found using the last field
      if geo is None:
        # Try the last second field
        a3, geo, err_str2 = call_geolocator(geolocator, AD, -2, suppl_dict)

        # Still not found
        if geo is None:
          # Try using the whole thing
          a3, geo, err_str3 = call_geolocator(geolocator, AD, 0, suppl_dict)
          

      if geo is None:
        nominatim_nf[pmid] = [AU, AD]
        geo_file = dir_out / f"{pmid}_na.txt"
        with open(geo_file, "w") as f_geo:
          f_geo.write(f"{AU}\t{AD}\tNA\t{None}\t{err_str1},{err_str2},{err_str3}")
      else:
        # Save result instead of put it in dictionary
        #nominatim_out[pmid] = [AU, AD, a3, geo.raw]
        geo_file = dir_out / f"{pmid}.txt"
        with open(geo_file, "w") as f_geo:
          f_geo.write(f"{AU}\t{AD}\t{a3}\t{geo.raw}")

      # Write the pmid of this record into after search result is returned
      f.write(f" {pmid}")

      # To reduce possibilities of timeout
      sleep(sleep_time)

  return nominatim_nf

In [91]:
# For some issues that arise
suppl_dict = {"UK":"GBR", "The Netherlands":"NLD", "Taiwan":"TWN", 
              "Republic of China":"TWN", "the Netherlands":"NLD", 
              "I.R.Iran":"IRN"}

## ___After Nominatim North American run___

### Load country info based on pycountry

In [99]:
ci_file = work_dir / 'country_info-pycountry.pickle'
with open(ci_file, "rb") as f:
  ci = pickle.load(f)

### Load not found dictionary pickle

In [94]:
nomi_na_nf_file = work_dir / "country_info-nominatim_na_NF.pickle"

with open(nomi_na_nf_file, "rb") as f:
  nomi_na_nf = pickle.load(f)

In [95]:
c = 0
debug = 0
for pmid in nomi_na_nf:
  # Address list
  AD = nomi_na_nf[pmid][1]

  # Get location info
  loc, errflag = get_location_str(AD, -1, debug)
  print(f"pmid:{pmid}, loc:{loc}")



AttributeError: 'list' object has no attribute 'find'

### Find cases with loc = "XXX. XXX"

In [96]:
c = 0
for pmid in nomi_na_nf:
  AD = nomi_na_nf[pmid][1][0]
  loc, errflag = get_location_str(AD, token_idx=-1)
  if loc.find(".") != -1:
    print(f"{pmid}:{loc}\t{AD}")
    c+=1
print("Total:", c)

17365182:110016. China	Research Department of Natural Medicine, Shenyang Pharmaceutical University. Shenyang, 110016. China.
17518112:Lupaszigeti ut 4.-2011	Gyogynoveny Kutato Intezet Zrt., Budakalasz, Lupaszigeti ut 4.-2011.
17851392:M.P. India	School of Studies in Chemistry, Vikram University, Ujjain 456010, M.P. India. ksrao7709@rediffmail.com
18060302:Univ. Federal da Grande Dourados	Faculdade de Ciencias Agrarias, Univ. Federal da Grande Dourados. rbarrosufms@yahoo.com.br
18396817:ES-50080 Zaragoza. Spain	Centro de Investigacion y Tecnologia Agroalimentaria de Aragon, Unidad de Tecnologia en Produccion vegetal. Unidad de Sanidad Vegetal P.O. Box 727, ES-50080 Zaragoza. Spain. hchikhrouhou@aragon.es
1849009:Frankfurt F.R.G	Botanisches Institut, Johann Wolfgang Goethe Universitat, Frankfurt F.R.G.
18507776:SY23 3EB. UK	Institute of Grassland and Environmental Research (IGER) , Plas Gogerddan, Aberystwyth, Ceredigion, SY23 3EB. UK.
18510349:St. Lucia Brisbane	Cooperative Research Cen

In [97]:
AD = nomi_na_nf['27789739'][1][0]
get_location_str(AD, token_idx=-1, debug=1)

add_str: Australian Research Council Centre of Excellence in Plant Energy Biology, University of Western Australia, Crawley, Western Australia 6009, Australia (M.W.M., S.K.-J., A.I.); monika.murcha@uwa.edu.au.
found: '@ Australian Research Council Centre of Excellence in Plant Energy Biology, University of Western Australia, Crawley, Western Australia 6009, Australia (M.W.M., S.K.-J., A.I
final add_str: ['Australian Research Council Centre of Excellence in Plant Energy Biology, University of Western Australia, Crawley, Western Australia 6009, Australia (M.W.M., S.K.-J., A.I']
A.I


('A.I', 0)

### Find cases with loc = "XXX (XXX" or "XXX (XX)" or "XXX (XX) XX"

In [98]:
c = 0
for pmid in nomi_na_nf:
  AD = nomi_na_nf[pmid][1][0]
  loc, errflag = get_location_str(AD, token_idx=-1)
  if loc.find("(") != -1:
    print(f"{pmid}:{loc}\t{AD}")
    c+=1
print("Total:", c)

17552431:I'Institut Paul Lambin (Haute Ecole Leonard de Vinci)	Dieteticien Nutritionniste, I'Institut Paul Lambin (Haute Ecole Leonard de Vinci). n.guggenbuhl@brutele.be
18636686:47023 Cesena (FC) Italy	Dipartimento di Scienze degli Alimenti, Alma Mater Studiorum Universita di Bologna, Piazza Goidanich 60, 47023 Cesena (FC) Italy.
18710252:10095 Grugliasco (TO) Italy	DiVaPRA, Plant Genetics and Breeding, University of Torino, via L. da Vinci 44, 10095 Grugliasco (TO) Italy.
19704524:Ibaraki Japan; UMR de Biochimie et Physiologie Moleculaire des Plantes (B & PMP); Centre National de la Recherche Scientifique; Universite Montpellier; Montpellier France	Biology Department; Colorado State University; Environmental Biology Division; National Institute for Environmental Studies; Graduate School of Life and Environmental Sciences; University of Tsukuba; Tsukuba, Ibaraki Japan; UMR de Biochimie et Physiologie Moleculaire des Plantes (B & PMP); Centre National de la Recherche Scientifique; Univ

### Edge cases

In [77]:
#pmid:17365182, loc:110016. China
AD           = nomi_na_nf["17365182"][1][0]
loc, errflag = get_location_str(AD, token_idx=-1, debug=0)
loc, AD


('China',
 'Research Department of Natural Medicine, Shenyang Pharmaceutical University. Shenyang, 110016. China.')

In [9]:
# Openstreemap actually return a false positive for Austria but should be Czech
nomi_na_nf["17401829"][1]

['Chemisches Institut der Medizinischen Fakultat, Palacky Universitat, Olomouc, Tschechoslowakei.']

In [10]:
# CSSR is Czechoslovak Socialist Republic. Result in a false positive in France.
nomi_na_nf["17401857"][1]

['Institute of Biophysics, Czechoslovak Academy of Sciences, Brno, CSSR.']

In [11]:
# Search with the last token result in NA, last second works, whole string NA
nomi_na_nf["17488843"][1]

['Institut Molekulare Botanik, Universitat Ulm, Albert-Einstein-Allee 11, 89069 Ulm.']

In [12]:
nomi_na_nf["17552431"][1]

["Dieteticien Nutritionniste, I'Institut Paul Lambin (Haute Ecole Leonard de Vinci). n.guggenbuhl@brutele.be"]

In [13]:
nomi_na_nf["17632571"][1]

['Division of Virology, Department of Pathology, University of Cambridge, Tennis Court Road, Cambridge CB2 1QP, UK. ib 103@mole.bio.cam.ac.uk']

In [14]:
nomi_na_nf["18613594"][1]

['CSIRO Entomology, GPO Box 1700 Canberra, ACT2601 Australia. rod.mahon@csiro.au']

In [15]:
nomi_na_nf["30263677"][1]

['1Department of Food and Fermentation, Far East University, 76-32 Daehak-gil, Gamgok, Eumseong, Chungbuk 27601 Republic of Korea. ISNI: 0000 0004 1775 9398. GRID: grid.444122.5',
 '2Department of Food and Nutrition, Hanyang University, 222 Wangsimni-ro, Seongdong-gu, Seoul, 04763 Republic of Korea. ISNI: 0000 0001 1364 9317. GRID: grid.49606.3d',
 '2Department of Food and Nutrition, Hanyang University, 222 Wangsimni-ro, Seongdong-gu, Seoul, 04763 Republic of Korea. ISNI: 0000 0001 1364 9317. GRID: grid.49606.3d',
 '2Department of Food and Nutrition, Hanyang University, 222 Wangsimni-ro, Seongdong-gu, Seoul, 04763 Republic of Korea. ISNI: 0000 0001 1364 9317. GRID: grid.49606.3d']

In [16]:
nomi_na_nf["18627491"][1]

['Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Umea Plant Science Centre, Department or Forest Genetics and Plant Physiology and.',
 'Department of Forest Ecology and Management, Swedish University of Agricultural Sciences, SE-901 83 Umea, Sweden.']

In [17]:
# This is probably the worst possible case...
nomi_na_nf["19704486"][1]

["The University of Manchester; Faculty of Life Sciences; Jackson's Mill; Manchester, United Kingdom; Aarhus University; Department of Biological Sciences; Aarhus, Denmark; The University of Alberta; Alberta, Edmonton Canada; AgroParisTech; ECOFOG; UMR CIRAD-CNRS-ENGREF-INRA-UAG; French Guiana, France; AgroParisTech; LERFOB; UMR ENGREF INRA 1092; Ecole Nationale du Genie Rural; France."]

In [18]:
nomi_na_nf["22016614"][1]

['Department of Diagnostics and Plant Pathophysiology, University of Warmia and Mazury, Plac Lodzki 5, 10-957, Olsztyn, Poland; E-Mails: agnieszka.pszczolkowska@uwm.edu.pl (A.P.); macieklojko@wp.pl (M. L.).']

In [19]:
nomi_na_nf["9931476"][1]

['Otto-Warburg-Center for Agricultural Biotechnology, The Hebrew University of Jerusalem, Faculty of Agriculture, PO Box 12, Rehovot 76100, Israel.cohenk@agri.huji.ac.il']

In [20]:
nomi_na_nf["21339975"][1]

['Biotechnology Research Centre, Malaysian Agricultural Research and Development Institute, P.O Box 12301, General Post Office, 50774 Kuala Lumpur, Malaysia; E-Mails: hamidunb@yahoo.com (H.B.); rohaiza@mardi.gov.my (R.A.R.); indu@mardi.gov.my (I.B.S.J.).']

In [21]:
nomi_na_nf["20184044"][1]

['School of Biosciences and Technology, Vellore Institute of Technology (VIT) University, Vellore-632 014, (T.N.) India. deepakganjawala73@yahoo.com']

In [22]:
#22303203, loc:Facsimile: (301) 838 0208
nomi_na_nf["22303203"][1]

['The Institute for Genomic Research, 9712 Medical Center Drive, Rockville MD 20850, , Facsimile: (301) 838 0208, rbuell@tigr.org.']

### Check issues found in 7_1c

In [103]:
ci['16656734']

KeyError: '16656734'

In [101]:
add_str = nomi_na_nf['16656734'][1][0]
get_location_str(add_str, token_idx=-1, debug=1)

KeyError: '16656734'

## ___Test run nominatim and rectify issue___

### Call nominatim

```
  Args:
    nominatim_nf (dict): {pmid:[AU,AD]}, record still not found
    not_found (dict): {pmid:[AU, AD]}, for records with no a3 code after
      pycountry search
    suppl_dict (dict): regions that do not have proper pycountry info
    dir_pmid_log (Path): path to pmid log file of completed searches
    dir_out (Path): path to geolocator outputs
    sleep_time (int): time in second between queries.
  Return:
    nominatim_nf (dict): {pmid:[AU, AD]} for pmids with NA as search results
```

In [None]:
# Define output dir
dir_test_out = work_dir / "test_out"

# Define pmid log file
dir_test_log = work_dir / "log_test_pmids"

# nominatim search north america, record not found
test_na_nf = {}

test_na_nf = call_nominatim(nomi_na_nf, not_found, suppl_dict, 
                                 dir_pmid_log, dir_nominatim_na_out, 0.1)

## ___Deprecated function___

In [None]:
def get_location_str(AD, token_idx=-1, debug=0):
  '''Get the potential location string from AD
  Args:
    AD (str): The content in the AD field
    token_idx (int): -1, -2, or 0 (whole thing)
  Return
    location (str): the string that likely contain location info
    errflag (int): the AD info is empty and thus erroneous (1) or not (0)
  '''

  # The first element in the AD list is used (1st author)
  add_str  = AD

  if debug: print("add_str:", add_str)

  # But there are 12 records where the AD field looks like:
  # ['.', '.', '.', '.', '.', '.']
  # So tokens will be "", dealt with in the if-else statement below.
  if add_str == "":
    loc = "NA"
    errflag = 1

  else:
    errflag = 0

    # Multipe authors:
    # ['From xxx, xxx, xxx, xxx, xxx (Ranade, Ganea, Razzak, and Garcia Gil)']
    # Another case:
    # [['xxx, xx, xxx, Maryland 20742 (M.H., G.F.D.).']
    if add_str[-1] == ")" or add_str[-2:] == ").":
      leftMargin = add_str.rfind(" (")
      add_str = add_str[:leftMargin]

    # Email field: Was contemplating using email for country, but some 1st
    # author emails are not the same as for the institution (see 2nd example).

    # 18930883, loc:China E-mail: suzhi1026@163.com
    # Some have ". Email:..."
    if add_str.find("E-mail:") != -1:
      tmp_str = add_str[:add_str.find("E-mail:")]
      # Use space as delimiter, then the empty space is taken care of later
      leftMargin = tmp_str.rfind(" ")
      add_str = add_str[:leftMargin]
      if debug: print("found: 'E-mail:", add_str)

    # 22016614, loc:Poland; E-Mails: agnieszka.pszczolkowska@uwm.edu.pl (A.P.); macieklojko@wp.p
    # 22072902, loc:China; E-Mail: yiruizao@163.com
    elif add_str.find("; E-Mail") != -1:
      add_str = add_str[:add_str.find("; E-Mail")]
      if debug: print("found: '; E-Mail", add_str)

    # 24866837:Fax: (+31) 50-3636440  <-- AG Groningen (The Netherlands), Fax:
    elif add_str.find("Fax:") != -1:
      add_str = add_str[:add_str.find("Fax:")]
      if debug: print("found: 'Fax:", add_str)

    # 28250584:GRID: grid.440587.a	XX, XX, XX, Para Brazil. GRID: grid.440587.a
    if add_str.find(" GRID:") != -1:
      add_str = add_str[:add_str.find(" GRID:")]
      if debug: print("found: ' GRID:", add_str)

    # ISNI code in ~7k records
    #   30263677 ['27601 Republic of Korea. ISNI: 0000 0004 1775 9398. GRID: grid.444122.5'
    if add_str.find("ISNI:") != -1:
      tmp_str = add_str[:add_str.find("ISNI:")]
      leftMargin = tmp_str.rfind(".")
      add_str = add_str[:leftMargin]
      
    # Below are more example for using @ field for parsing.
    #   case 1: 17296497 ['xxx, Denmark. blah@aki.ku.dk <blah@aki.ku.dk>']
    #   case 2: ?? ['Institut ..., France. achmustilli@libero.it']
    # The next one is weird, look like there is something not parsed properly
    #   case 3: 17632571 ['xxx, xxx, UK. ib 103@mole.bio.cam.ac.uk']
    # Also the next one, so cannot use "." as delimiter first.
    #   case 4: 18613594 ['xxx, ACT2601 Australia. rod.mahon@csiro.au']
    # Ok, some is missing space between country and email... Man...
    #   case 5: 9931476 ['PO Box 12, Rehovot 76100, Israel.cohenk@agri.huji.ac.il']
    #   case 6: 27172200:Tianjin P.R	XX, Tianjin P.R., 300384 China tongjiping@sina.com goodrice@263.net.
    if add_str.find("@") != -1:
      if debug: print("found: @", add_str)

      # Find where the 1st email address is and generate a temp_str
      tmp_str = add_str[:add_str.find("@")]
      if debug: print(" ", tmp_str)

      # Originally using space, but the 3rd example shows that it is not good.
      # But then "." is regularly used in email address. So do space delimter
      # first, then use "."
      tmp_str = tmp_str[:tmp_str.rfind(" ")] # this takes care of case 4, 6
      if debug: print(" ", tmp_str)

      # Will not deal with case 3 and 5, this break things, like case 6
      #leftMargin = tmp_str.rfind(".")        # this takes care of case 3
      #if leftMargin == -1:                   # this takes care of case 5
      #  leftMargin = tmp_str.rfind(",")
      #add_str = tmp_str[leftMargin+1:]
      add_str = tmp_str
      
    # Strip empty space before
    add_str = add_str.strip()

    if debug: print("final add_str:",[add_str])

    # Some just have email address so after parsing, add_str is "".
    #   25548975: '. kehrig@pharmazie.uni-kiel.de.'
    if add_str == "":
      loc = "NA"
      errflag = 1
    # if tokens ends with ".", rid of it
    else:
      if add_str[-1] == ".": 
        add_str = add_str[:-1]

      # Originally splot with ", " then "." but there are edge cases like this:
      #   17444520, loc:Hsinchu,Taiwan
      # So split with ",", if it does not exist, split with " "
      if "," not in add_str:
        # Only one large token, split with space instead
        tokens = add_str.split(" ")
        try:
          loc = tokens[token_idx]
        except IndexError:
          loc = "NA"
          errflag = 1
      else:
        tokens = add_str.split(",")
        try:
          # rid of space if present
          loc = tokens[token_idx].strip()
        except IndexError:
          loc = "NA"
          errflag = 1

      if debug: print("step1 loc:",[loc])

      # Edge cases with "XXX. XXX"
      # 17365182:110016. China	['Research Department of Natural Medicine, Shenyang Pharmaceutical University. Shenyang, 110016. China.']
      # 17518112:Lupaszigeti ut 4.-2011	['Gyogynoveny Kutato Intezet Zrt., Budakalasz, Lupaszigeti ut 4.-2011.']
      # 17851392:msvcrt.P. India	['School of Studies in Chemistry, Vikram University, Ujjain 456010, M.P. India. ksrao7709@rediffmail.com']
      if loc.find(".") != -1:
        loc = loc.split(". ")[-1]

      if debug: print("step2 loc:",[loc])

      # More edge cases with "(" and some with ")", examples:
      # case 1: 18636686:47023 Cesena (FC) Italy
      # case 2: 19704524:Ibaraki Japan; xxx (B & PMP); xxx; xxx; Montpellier France
      # case 3: 21665592:B-1860 Meise (Belgium);
      # case 4: 24828308:Japan (K.S	
      # case 5: 27789739:A.I.);	XX, Australia (M.W.M., S.K.-J., A.I.); monika.murcha@uwa.edu.au.
      if loc.find("(") != -1:
        if debug: print("step1 add_str:",[add_str])
        if loc.find(")") != -1:
          # case 1
          if loc.find(") ") != -1:
            loc = loc.split(") ")[-1]
          # case 2, 3
          else:
            loc = loc.split("(")[-1].split(")")[0]
        # case 4
        else:
          loc = loc.split(" (")[0]

      if debug: print("step3 loc:",[loc])

      # 19140172, loc:IR Iran, there are other variations. OpenStreeMap cannot
      # find these so deal with them manually.
      if loc.find("Iran") != -1:
        loc = "Iran"

      # 19651701, loc:Taiwan ROC
      if loc.endswith(" ROC"):
        loc = loc[:loc.rfind(" ")]

      # 21299880, loc:DF- 70770-917 - Brasil
      if loc.find("- ") != -1:
        loc = loc.split("- ")[-1]

      # 1915409, loc:Stuttgart/Bundesrepublik Deutschland
      if loc.find("/") != -1:
        loc = loc.split("/")[-1]

      if loc.endswith("."):
        loc = loc[:-1]

    if debug: print(loc)

  return loc, errflag
