# __Step 7.1d: Consolidate country info__

Goal
- Combine the pycountry-based country info with the results of the Nominatim runs.

## ___Set up___

### Module import

Run in the conda env `base`.

In [45]:
import pickle, glob
from pathlib import Path
from tqdm import tqdm
from multiprocessing import Process

### Key variables

In [21]:
# Reproducibility
seed = 20220609

# Working directory for this step; created on first run if it is missing
proj_dir = Path.home() / "projects/plant_sci_hist"
work_dir = proj_dir / "7_countries"
work_dir.mkdir(parents=True, exist_ok=True)

# Country info based on pycountry
ci_pyc_file = work_dir / "country_info-pycountry.pickle"

# Nominatim output directories: for every region there is an "_out" and an
# "_out2" directory (presumably the run and its rerun -- TODO confirm)
nomi_out_dirs = [
  work_dir / f"nominatim_{region}_out{suffix}"
  for region in ("na", "as", "eu1", "eu2", "ao")
  for suffix in ("", "2")
]

## ___Process pycountry country info___

In [8]:
# Load the pycountry-based country info dictionary.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with ci_pyc_file.open("rb") as pickle_fh:
  ci_pyc = pickle.load(pickle_fh)

In [10]:
# Peek at the first record to see the layout of the values
pmids_ci_pyc = [*ci_pyc]
pmids_ci_pyc[0], ci_pyc[pmids_ci_pyc[0]]

('61', [['Haveman J', 'Lavorel J'], 'NA', 'NA'])

In [13]:
# Build the combined country-info dictionary, skipping records whose second
# field (AD) is "NA"
ci_combo = {}  # {pmid:[AU, AD, a3]}
c_bad    = 0   # number of records skipped because AD is "NA"
for pmid, info in ci_pyc.items():
  if info[1] == "NA":
    c_bad += 1
    continue

  ci_combo[pmid] = info
  # A record with AD but no a3 code is not expected; report it
  if info[2] == "NA":
    print("ERROR:", info[1:])

print(f"With info:{len(ci_combo)}, without: {c_bad}")

With info:328923, without: 19851


## ___Go through nominatim result files___

### Test with North America and Asia runs

In [55]:
def parse_nomi_file(nomi_file):
  '''Parse one Nominatim search output file.

  The pmid is taken from the file name: the part before the first "_" if the
  name contains one, otherwise the part before the first ".". Only the first
  line of the file is read. It must hold tab-separated fields in the order
  authors, addresses, a3 code and, when the a3 code is not "NA", the
  Nominatim result dictionary.

  Args:
    nomi_file (str or Path): path to a Nominatim search output file
  Return:
    pmid (str): pubmed id
    AU (list): author list
    AD (list): author address list
    a3 (str): a3 country code, may be NA
    ni (float): importance of nominatim search result, 0 if a3 is NA
  Raises:
    ValueError: if the first line has fewer fields than required
  '''
  # Get pmid from the file name; Path() accepts both str and Path inputs
  fname = Path(nomi_file).name
  sep   = "_" if "_" in fname else "."
  pmid  = fname.split(sep)[0]

  # Read file and parse AD, AU, a3, and nominatim output dictionary.
  # The line terminator is removed first: otherwise a line that ends right
  # after the a3 code yields "NA\n", which does not compare equal to "NA".
  with open(nomi_file, "r") as f:
    nomi = f.readline().rstrip("\r\n").split('\t')

  if len(nomi) < 3:
    raise ValueError(
      f"{nomi_file}: expected at least 3 tab-separated fields, got {len(nomi)}")

  # NOTE(review): eval() runs whatever expression is stored in the file. Only
  # use this on output files you generated yourself; consider
  # ast.literal_eval if the fields are always plain Python literals.
  AU = eval(nomi[0])                 # authors
  AD = eval(nomi[1])                 # addresses
  a3 = nomi[2]                       # a3 code

  # If no info, nominatim importance (ni) is set to 0
  if a3 == "NA":
    ni = 0
  else:
    if len(nomi) < 4:
      raise ValueError(
        f"{nomi_file}: a3 code is {a3} but the Nominatim output field is missing")
    ni = eval(nomi[3])['importance'] # nominatim out importance

  return pmid, AU, AD, a3, ni

In [56]:
# Speed up:
#https://superfastpython.com/multiprocessing-for-loop/
#https://stackoverflow.com/questions/60054676/using-python-multiprocessing-on-a-for-loop-that-appends-results-to-dictionary

from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import defaultdict

# Parse the Nominatim output files in worker processes and collect the
# results, grouped by pmid and then by Nominatim importance.
ci_nomi = {} # {pmid: {importance: [AU, AD, [a3 codes]]}}

# Go through each region run/rerun (the slice limits this to the first one)
for nomi_out_dir in nomi_out_dirs[:1]:
  nomi_files = glob.glob(f"{str(nomi_out_dir)}/*")
  print(str(nomi_out_dir).split("/")[-1], len(nomi_files))

  # Go through each file
  # AU, AD, a3, nomi_out 
  with ProcessPoolExecutor() as executor:
    # Map every future back to its file so failures can be reported by name
    futures = {executor.submit(parse_nomi_file, nomi_file): 
                                      nomi_file for nomi_file in nomi_files}
    
    for future in as_completed(futures):
      try:
        # Future.result() returns the parsed tuple or re-raises the exception
        # raised in the worker. The previous `future.results()` does not
        # exist, so every file ended up in the except branch and ci_nomi
        # stayed empty.
        pmid, AU, AD, a3, ni = future.result()
      except Exception as e:
        print(f"{futures[future]} throws {e}")
      else:
        # Populate ci_nomi dictionary
        if pmid not in ci_nomi:
          ci_nomi[pmid] = {ni:[AU, AD, [a3]]}
        elif ni not in ci_nomi[pmid]:
          ci_nomi[pmid][ni] = [AU, AD, [a3]]
        elif ni != 0:
          ci_nomi[pmid][ni][2].append(a3) # same importance

nominatim_na_out 89489


KeyboardInterrupt: 

In [54]:
# Number of pmids collected in ci_nomi
len(ci_nomi)

0

In [44]:
# List every pmid that has more than one importance value, together with the
# a3 codes stored under each value
for pmid, by_importance in ci_nomi.items():
  if len(by_importance) <= 1:
    continue
  print(pmid)
  for imp, record in by_importance.items():
    print(f" {imp}:{record[2]}")

16656712
 0.445:['USA']
 0.3100099999999999:['IND']
16656713
 0.535:['USA']
 0.20000999999999997:['ARE']
16656714
 0.445:['USA']
 0.10000999999999996:['PHL']
16656715
 0.445:['USA']
 0.3100099999999999:['IND']
16656716
 0.445:['USA']
 0.21000999999999995:['PHL']
16656718
 0.445:['USA']
 0.10000999999999996:['JPN']
16656719
 0.445:['USA']
 0:['NA']
16656720
 0.445:['USA']
 0:['NA']
16656725
 0.445:['USA']
 0.3100099999999999:['IND']
16656726
 0.445:['USA']
 0.21000999999999995:['PHL']
16656733
 0.4000099999999999:['USA']
 0.20000999999999997:['IND']
16656734
 0.30000999999999994:['USA']
 0.4000099999999999:['IDN']
16656735
 0.6600099999999999:['USA']
 0.36000999999999994:['ARE']
16656737
 0.6600099999999999:['USA']
 0.38500999999999996:['PHL']
16656738
 0.535:['USA']
 0.30000999999999994:['PHL']
16656740
 0.445:['USA']
 0.21000999999999995:['PHL']
16656742
 0.535:['USA']
 0.30000999999999994:['PHL']
16656743
 0.6600099999999999:['USA']
 0.36000999999999994:['ARE']
16656747
 0.445:['USA'

## ___Testing___

In [None]:
# Not parallelized version

# Results grouped by pmid and then by Nominatim importance
ci_nomi = {} # {pmid: {importance: [AU, AD, [a3 codes]]}}

# Go through each region run/rerun (the slice limits this to the first one)
for nomi_out_dir in nomi_out_dirs[:1]:
  nomi_files = glob.glob(f"{nomi_out_dir}/*")
  print(nomi_out_dir.name, len(nomi_files))

  # Parse the files one at a time, with a progress bar
  for nomi_file in tqdm(nomi_files):
    pmid, AU, AD, a3, ni = parse_nomi_file(nomi_file)

    # Get (or create) the per-pmid dictionary keyed by importance
    by_importance = ci_nomi.setdefault(pmid, {})
    if ni not in by_importance:
      by_importance[ni] = [AU, AD, [a3]]
    elif ni != 0:
      # Same non-zero importance seen again: keep the extra a3 code
      by_importance[ni][2].append(a3)