# __Step 7.2: Consolidate nominatim outputs__

Goal
- Combine the pycountry-based country info with the results of the nominatim runs.

## ___Set up___

### Module import

In conda env `base`

In [1]:
import ast
import glob
import pickle
from multiprocessing import Pool
from pathlib import Path

from tqdm import tqdm

### Key variables

In [2]:
# Seed kept for reproducibility
seed = 20220609

# Directory layout under the project root
proj_dir = Path.home() / "projects/plant_sci_hist"
base_dir = proj_dir / "7_countries"
dir71    = base_dir / "7_1_parse_countries"

# Working directory for this step, created if it does not exist yet
work_dir = base_dir / "7_2_nomi_consolidate"
work_dir.mkdir(parents=True, exist_ok=True)

# Country info based on pycountry
ci_pyc_file = dir71 / "country_info-pycountry.pickle"

# Nominatim output directories: for every region, the "_out" directory
# followed by its "_out2" counterpart
nomi_out_dirs = [
  dir71 / f"nominatim_{region}_out{suffix}"
  for region in ("na", "as", "eu1", "eu2", "ao")
  for suffix in ("", "2")]

## ___Go through nominatim result files___

### Function for parsing nomi file

In [3]:
def parse_nomi_file(nomi_file):
  '''Parse one Nominatim search output file.

  Args:
    nomi_file (str or Path): path to a Nominatim search output file. The pmid
      is taken from the file name: the part before the first "_" if the name
      has one, otherwise the part before the first ".". Only the first line
      of the file is read; it is tab-delimited with the following structure -
        field 0: author list, written as a Python literal
        field 1: author address list, written as a Python literal
        field 2: a3 country code, or "NA"
        field 3: nominatim output dictionary, written as a Python literal;
                 only read when the a3 code is not "NA"
  Return
    pmid (str): pubmed id
    AU (list): author list
    AD (list): author address list
    a3 (str): a3 country code, may be NA
    ni (float): importance of nominatim search result, 0 if a3 is NA
  '''
  # Get pmid from the file name; str() lets Path objects through as well
  fname = str(nomi_file).split("/")[-1]
  sep   = "_" if "_" in fname else "."
  pmid  = fname.split(sep)[0]

  # Read file and parse AD, AU, a3, and nominatim output dictionary.
  # The line terminator is removed first: when a3 is the last field on the
  # line it would otherwise read "NA\n" and never match "NA" below.
  with open(nomi_file, "r") as f:
    nomi = f.readline().rstrip("\r\n").split('\t')

  # ast.literal_eval only accepts Python literals, so, unlike eval, nothing
  # in the file content can be executed as code.
  AU = ast.literal_eval(nomi[0])     # authors
  AD = ast.literal_eval(nomi[1])     # addresses
  a3 = nomi[2].strip()               # a3 code

  # If no info, nominatim importance (ni) is set to 0
  if a3 != "NA":
    ni = ast.literal_eval(nomi[3])['importance'] # nominatim out importance
  else:
    ni = 0

  return pmid, AU, AD, a3, ni

### Parallelized parse_nomi_file calls

Some tips:
- [Super fast python](https://superfastpython.com/multiprocessing-for-loop/)
- [Multiprocessing for loop and dictionary](https://stackoverflow.com/questions/60054676/using-python-multiprocessing-on-a-for-loop-that-appends-results-to-dictionary)
  - For nominatim_na_out with 89489 files:
    - Linear version: 249s
    - Parallel version 1: 84s
    - Parallel version 2: 75s
  - Faster, but not dramatically so. I was hoping for a ~16x speedup because there are 16 cores.

In [4]:
# Parallel version 2: using Pool
def paralle_parse(nomi_out_dirs, processes=None, chunksize=256):
  '''Parse the nomi output files of several directories with a process pool.

  For each directory, its name and number of files are printed before the
  files are parsed with parse_nomi_file.

  Args:
    nomi_out_dirs (list): a list of directories with nomi output files
    processes (int): number of worker processes passed to Pool; None uses
      the Pool default
    chunksize (int): number of files sent to a worker per task. Each file is
      a very small job, so batching cuts the per-task communication overhead
      compared with the imap_unordered default of 1.
  Return:
    ci_nomi (dict): for storing AU, AD, importance and country info, as
      {pmid:[AU, AD, {importance:[countries]}]}. AU and AD come from the
      first result received for a pmid. A country is added to an existing
      importance list only when the importance is not 0.
  '''
  ci_nomi = {} # {pmid:[AU, AD, {importance:[countries]}]}

  # Nothing to parse: do not start any worker process
  if not nomi_out_dirs:
    return ci_nomi

  # One pool is shared by all directories instead of a new pool for each
  with Pool(processes) as pool:

    # Go through each region run/rerun
    for nomi_out_dir in nomi_out_dirs:
      nomi_files = glob.glob(f"{str(nomi_out_dir)}/*")
      print(str(nomi_out_dir).split("/")[-1], len(nomi_files))

      # Go through each file; results arrive in arbitrary order
      results = pool.imap_unordered(parse_nomi_file, nomi_files, chunksize)
      for pmid, AU, AD, a3, ni in results:

        # Populate ci_nomi dictionary
        if pmid not in ci_nomi:
          ci_nomi[pmid] = [AU, AD, {ni:[a3]}]
          continue

        ni_dict = ci_nomi[pmid][2]
        if ni not in ni_dict:
          ni_dict[ni] = [a3]
        elif ni != 0:
          ni_dict[ni].append(a3) # same importance

  return ci_nomi

### Generate test data and run parallel parse

In [5]:
# Test data: the "na" and "as" output directories under old_results, each
# "_out" directory followed by its "_out2" counterpart
dir_test = dir71 / "old_results"

test_dirs = [
  dir_test / f"nominatim_{region}_out{suffix}"
  for region in ("na", "as")
  for suffix in ("", "2")]

In [6]:
# Testing paralle_parse on the directories listed in test_dirs; the result
# dictionary is kept in test_ci_nomi
test_ci_nomi = paralle_parse(test_dirs)

nominatim_na_out 89489


### Spot check ci_nomi

In [None]:
# check same importance but different countries
# A country list may hold the same code more than once, so the number of
# distinct codes is compared rather than the list length. c counts
# (pmid, importance) pairs, not pmids.
c = 0
for pmid in ci_nomi:
  ni_dict = ci_nomi[pmid][2]  # nominatim importance dictionary
  for ni in ni_dict:
    if len(set(ni_dict[ni])) > 1:
      #if c < 100:
      #  print(f"{pmid}:ni={ni},val={ni_dict[ni]}")
      c += 1

print(f"total={len(ci_nomi)}, same importance={c}")


In [None]:
# Tally pmids by how many different importance values they have
cdict = {} # {num_ni: count}
for pmid in ci_nomi:
  ni_dict  = ci_nomi[pmid][2] # nominatim importance dictionary
  ni_count = len(ni_dict)
  # Start from 0 the first time a count is seen
  cdict[ni_count] = cdict.get(ni_count, 0) + 1

cdict

In [None]:
# Among pmids with more than one importance value, print and count those
# whose highest-importance country list does not contain USA
c = 0
for pmid in ci_nomi:
  ni_dict = ci_nomi[pmid][2]
  if len(ni_dict) <= 1:
    continue

  # Highest importance value
  top_ni = sorted(ni_dict)[-1]
  if "USA" in ni_dict[top_ni]:
    continue

  print(f"{pmid}:{ni_dict}")
  print(f"AD={ci_nomi[pmid][1]}")
  c += 1

print(f"total={len(ci_nomi)}, usa < other={c}")

In [None]:
# Examine a few examples
# Check these again:
'''
16656733
AD  =['Department of Biology, Queen Elizabeth College, (University of London), Campden Hill, London, W.8.']
nomi={0.4000099999999999: ['USA'], 0.20000999999999997: ['IND']}

16656734
AD  =['Department of Plant Physiology, Waite Agricultural Research Institute, Glen Osmond, South Australia.']
nomi={0.30000999999999994: ['USA'], 0.4000099999999999: ['IDN']}

16656755
AD  =['C.S.I.R.O. Division of Horticultural Research, Private Bag No. 1, Glen Osmond, South Australia.']
nomi={0.30000999999999994: ['USA'], 0.4000099999999999: ['IDN']}

16656757
AD  =['Institute of Biology, College of General Education (Kyoyo-gakubu), University of Tokyo, Komaba, Meguro, Tokyo.']
nomi={0.21000999999999995: ['USA'], 0.6600099999999999: ['JPN']}
'''
# Print pmids that have exactly two importance values; the loop stops right
# after printing an entry once c has reached 100
c = 0
for pmid in ci_nomi:
  if len(ci_nomi[pmid][2]) != 2:
    continue

  print(pmid)
  print(f"AD  ={ci_nomi[pmid][1]}")
  print(f"nomi={ci_nomi[pmid][2]}\n")
  if c == 100:
    break
  c += 1

## ___Testing___

In [None]:
# For deprecated codes in Testing
from concurrent.futures import ProcessPoolExecutor, as_completed

### Deprecated functions

In [None]:
# Not parallelized version
# Builds ci_nomi in the layout stated in the comment below, i.e. the same
# layout the parallel versions build and the spot-check cells index with
# ci_nomi[pmid][2].

ci_nomi = {} # {pmid:[AU, AD, {importance:[countries]} ]}

# Go through each region run/rerun (first directory only)
for nomi_out_dir in nomi_out_dirs[:1]:
  nomi_files = glob.glob(f"{str(nomi_out_dir)}/*")
  print(str(nomi_out_dir).split("/")[-1], len(nomi_files))

  # Go through each file
  # AU, AD, a3, nomi_out 
  for nomi_file in tqdm(nomi_files):
    pmid, AU, AD, a3, ni = parse_nomi_file(nomi_file)  

    # Populate ci_nomi dictionary
    if pmid not in ci_nomi:
      ci_nomi[pmid] = [AU, AD, {ni:[a3]}]
    elif ni not in ci_nomi[pmid][2]:
      ci_nomi[pmid][2][ni] = [a3]
    elif ni != 0:
      ci_nomi[pmid][2][ni].append(a3) # same importance

In [None]:
# Parallel version 1: using ProcessPoolExecutor
# One future is submitted per file and results are consumed as they finish.

ci_nomi = {} # {pmid:[AU, AD, {importance:[countries]} ]}

# Go through each region run/rerun (first directory only)
#for nomi_out_dir in test_dirs:
c = 0
for nomi_out_dir in nomi_out_dirs[:1]:
  nomi_files = glob.glob(f"{str(nomi_out_dir)}/*")
  print(str(nomi_out_dir).split("/")[-1], len(nomi_files))

  # Go through each file
  # AU, AD, a3, nomi_out 
  with ProcessPoolExecutor() as executor:
    # {future: file it was submitted for}
    futures = {}
    for nomi_file in nomi_files:
      futures[executor.submit(parse_nomi_file, nomi_file)] = nomi_file

    for future in as_completed(futures):
      pmid, AU, AD, a3, ni = future.result()

      # Populate ci_nomi dictionary
      if pmid not in ci_nomi:
        # First result for this pmid
        ci_nomi[pmid] = [AU, AD, {ni:[a3]}]
        continue

      if ni not in ci_nomi[pmid][2]:
        ci_nomi[pmid][2][ni] = [a3]
      elif ni != 0:
        ci_nomi[pmid][2][ni].append(a3) # same importance