# __Step 7.4: Consolidate all outputs__

Goal
- Combine the pycountry-based country info with the results of the Nominatim runs into a single set of records.

## ___Set up___

### Module import

In conda env `base`

In [None]:
import pickle, glob
from pathlib import Path
from tqdm import tqdm
from multiprocessing import Pool

### Key variables

In [None]:
# Seed value kept for reproducibility
seed = 20220609

# Directory layout: project root -> step-7 folder -> this step's work dir.
# The work dir is created on the spot if it does not exist yet.
proj_dir = Path.home() / "projects/plant_sci_hist"
base_dir = proj_dir / "7_countries"
work_dir = base_dir / "7_2_nomi_consolidate"
work_dir.mkdir(parents=True, exist_ok=True)

dir71 = base_dir / "7_1_parse_countries"

# Country info based on pycountry
ci_pyc_file = dir71 / "country_info-pycountry.pickle"

# Nominatim output directories: for every region tag there is an "_out"
# and an "_out2" folder, listed region by region.
nomi_out_dirs = [
  dir71 / f"nominatim_{region}_out{suffix}"
  for region in ("na", "as", "eu1", "eu2", "ao")
  for suffix in ("", "2")
]

# Same naming scheme under "old_results", limited to the na/as regions
dir_test = dir71 / "old_results"
test_dirs = [
  dir_test / f"nominatim_{region}_out{suffix}"
  for region in ("na", "as")
  for suffix in ("", "2")
]

## ___Process pycountry country info___

In [None]:
# Read the pickled pycountry-based country info into memory
with ci_pyc_file.open("rb") as handle:
  ci_pyc = pickle.load(handle)

In [None]:
# Keys of ci_pyc as a list, then a peek at the first key and its record
pmids_ci_pyc = list(ci_pyc)
first_pmid = pmids_ci_pyc[0]
first_pmid, ci_pyc[first_pmid]

('61', [['Haveman J', 'Lavorel J'], 'NA', 'NA'])

In [None]:
# Seed the combined dictionary with the pycountry records that carry AD
# info; records whose AD field is "NA" are only counted, not copied.
ci_combo = {}  # {pmid:[AU, AD, a3]}
c_bad = 0      # number of records without AD info
for pmid, record in ci_pyc.items():
  if record[1] == "NA":
    c_bad += 1
    continue

  ci_combo[pmid] = record
  # A record with AD info but an "NA" country is not expected: report it
  if record[2] == "NA":
    print("ERROR:", record[1:])

print(f"With info:{len(ci_combo)}, without: {c_bad}")

With info:329024, without: 19851


## ___Combine pycountry and nominatim country info___

### Considerations

Obj format
- ci_combo = {pmid:[AU, AD, a3]}
- ci_nomi = {pmid:[AU, AD, {importance:[countries]} ]}

Note:
- Nominatim is supposed to run on records that do not have pycountry matches.
- However, while running the North America batch, I discovered issues in the pycountry matching step, so some records were run through Nominatim but were later found to have pycountry matches.
- So the pycountry match must take priority whenever a record also has a Nominatim match.

In [None]:
# Show the number of records in each dictionary before merging.
# NOTE(review): no cell visible here assigns ci_nomi -- presumably it is
# loaded from nomi_out_dirs; confirm it exists before running this cell.
len(ci_combo), len(ci_nomi)

### Merging

In [None]:
# Go through ci_nomi and put records into ci_combo.
#   ci_nomi  = {pmid:[AU, AD, {importance:[countries]} ]}
#   ci_combo = {pmid:[AU, AD, a3]}
for pmid in ci_nomi:
  AU, AD, ni_dict = ci_nomi[pmid]

  # A record already in ci_combo came from pycountry, which has priority
  # over Nominatim: leave it untouched.
  if pmid in ci_combo:
    continue

  # Without any importance entry there is no country to take over
  if not ni_dict:
    continue

  # Countries found at the highest importance value. max() is used instead
  # of "the last key" because the keys of ni_dict are not stored in
  # ascending order of importance.
  a3 = ni_dict[max(ni_dict)]
  if len(a3) > 1:
    # Several countries share the top importance value: flag for review
    print(f"{pmid}:same importance - {a3}")
  ci_combo[pmid] = [AU, AD, a3]

len(ci_combo)

In [None]:
# Pickle the merged dictionary into the working directory
ci_combo_file = work_dir / "country_info-combo.pickle"
with ci_combo_file.open("wb") as handle:
  pickle.dump(ci_combo, handle)

In [None]:
# Load the ci_combo file back to make sure it is ok
with open(ci_combo_file, "rb") as f:
  ci_combo_reload = pickle.load(f)

# Show the size of the reloaded object rather than the in-memory ci_combo,
# so that the check reflects what was actually written to disk.
len(ci_combo_reload)

## ___Combine pycountry and nominatim country info___

### Considerations

Obj format
- ci_combo = {pmid:[AU, AD, a3]}
- ci_nomi = {pmid:[AU, AD, {importance:[countries]} ]}

Note:
- Nominatim is supposed to run on records that do not have pycountry matches.
- However, while running the North America batch, I discovered issues in the pycountry matching step, so some records were run through Nominatim but were later found to have pycountry matches.
- So the pycountry match must take priority whenever a record also has a Nominatim match.

In [None]:
# Show the number of records in each dictionary before merging.
# NOTE(review): no cell visible here assigns ci_nomi -- presumably it is
# loaded from nomi_out_dirs; confirm it exists before running this cell.
len(ci_combo), len(ci_nomi)

### Merging

In [None]:
# Go through ci_nomi and put records into ci_combo.
#   ci_nomi  = {pmid:[AU, AD, {importance:[countries]} ]}
#   ci_combo = {pmid:[AU, AD, a3]}
for pmid in ci_nomi:
  AU, AD, ni_dict = ci_nomi[pmid]

  # A record already in ci_combo came from pycountry (or from an earlier
  # run of this merge) and must not be overwritten.
  if pmid in ci_combo:
    continue

  # Without any importance entry there is no country to take over
  if not ni_dict:
    continue

  # Countries found at the highest importance value. max() is used instead
  # of "the last key" because the keys of ni_dict are not stored in
  # ascending order of importance.
  a3 = ni_dict[max(ni_dict)]
  if len(a3) > 1:
    # Several countries share the top importance value: flag for review
    print(f"{pmid}:same importance - {a3}")
  ci_combo[pmid] = [AU, AD, a3]

len(ci_combo)

In [None]:
# Pickle the merged dictionary into the working directory
ci_combo_file = work_dir / "country_info-combo.pickle"
with ci_combo_file.open("wb") as handle:
  pickle.dump(ci_combo, handle)

In [None]:
# Load the ci_combo file back to make sure it is ok
with open(ci_combo_file, "rb") as f:
  ci_combo_reload = pickle.load(f)

# Show the size of the reloaded object rather than the in-memory ci_combo,
# so that the check reflects what was actually written to disk.
len(ci_combo_reload)

### Spot check ci_nomi

In [None]:
# Count the importance values (over all pmids) that map to more than one
# country, i.e. different countries tied at the same importance.
# To inspect individual hits, print the pmid, importance value and country
# list for the entries that satisfy the condition below.
c = sum(
  len(countries) > 1
  for record in ci_nomi.values()
  for countries in record[2].values()
)

print(f"total={len(ci_nomi)}, same importance={c}")


In [None]:
# Tally pmids by how many distinct importance values their Nominatim
# importance dictionary holds.
cdict = {}  # {num_ni: count}
for record in ci_nomi.values():
  n_importance = len(record[2])
  cdict[n_importance] = cdict.get(n_importance, 0) + 1

cdict

In [None]:
# Among pmids with more than one importance value, report those where
# "USA" is not among the countries at the highest importance.
c = 0
for pmid, record in ci_nomi.items():
  ni_dict = record[2]
  if len(ni_dict) <= 1:
    continue

  # Highest importance value = last element of the sorted keys
  top_importance = sorted(ni_dict)[-1]
  if "USA" in ni_dict[top_importance]:
    continue

  print(f"{pmid}:{ni_dict}")
  print(f"AD={ci_nomi[pmid][1]}")
  c += 1

print(f"total={len(ci_nomi)}, usa < other={c}")

In [None]:
# Examine a few examples: print the pmid, AD field and Nominatim importance
# dictionary of records that have exactly two importance values.
# Check these again:
'''
16656733
AD  =['Department of Biology, Queen Elizabeth College, (University of London), Campden Hill, London, W.8.']
nomi={0.4000099999999999: ['USA'], 0.20000999999999997: ['IND']}

16656734
AD  =['Department of Plant Physiology, Waite Agricultural Research Institute, Glen Osmond, South Australia.']
nomi={0.30000999999999994: ['USA'], 0.4000099999999999: ['IDN']}

16656755
AD  =['C.S.I.R.O. Division of Horticultural Research, Private Bag No. 1, Glen Osmond, South Australia.']
nomi={0.30000999999999994: ['USA'], 0.4000099999999999: ['IDN']}

16656757
AD  =['Institute of Biology, College of General Education (Kyoyo-gakubu), University of Tokyo, Komaba, Meguro, Tokyo.']
nomi={0.21000999999999995: ['USA'], 0.6600099999999999: ['JPN']}
'''
max_examples = 100  # stop after this many printed records
c = 0               # number of records printed so far
for pmid in ci_nomi:
  if len(ci_nomi[pmid][2]) != 2:
    continue

  print(pmid)
  print(f"AD  ={ci_nomi[pmid][1]}")
  print(f"nomi={ci_nomi[pmid][2]}\n")

  # Count first, then test the limit: testing before counting let one
  # extra record (101 in total) through.
  c += 1
  if c >= max_examples:
    break