# __Step 7.4: Consolidate all outputs__

Goal
- Combine the information from:
  - pycountry run
  - Nominatim outputs
  - Brute force search and email search
- Generate a country info matrix and save as tab delimited file
- Generate a {pmid:country} dictionary based on the considerations below.

Considerations:
- 1st order of priority:
  - pycountry
  - Brute force current country
  - Brute force historical country
- For records without country info
  - Assess false positive rates for:
    - Brute force subregion-based country
    - Email-based country
    - Nominatim-based country
  - Determine 2nd order of priority and assign country

## ___Set up___

### Module import

In conda env `base`

In [None]:
import pickle, glob
from pathlib import Path
from tqdm import tqdm
from multiprocessing import Pool

### Key variables

In [None]:
# Seed value kept for reproducibility
seed = 20220609

# Directory layout: <home>/projects/plant_sci_hist/7_countries/<step>
proj_dir = Path.home().joinpath("projects", "plant_sci_hist")
base_dir = proj_dir.joinpath("7_countries")

# Output directory of this step; created on first run
work_dir = base_dir.joinpath("7_4_consolidate_all")
work_dir.mkdir(parents=True, exist_ok=True)

# Output directories of the earlier steps
dir71 = base_dir.joinpath("7_1_parse_countries")
dir72 = base_dir.joinpath("7_2_nomi_consolidate")
dir73 = base_dir.joinpath("7_3_brute_force_and_email")

# Pickle holding all records
dict_pmid_au_ad_file = dir71.joinpath("dict_pmid_AU_AD.pickle")

# Pickles holding country info from pycountry, Nominatim and brute force/email
ci_pyco_file = dir71.joinpath("country_info-pycountry.pickle")
ci_nomi_file = dir72.joinpath("country_info-nominatim.pickle")
ci_brut_file = dir73.joinpath("dict_pmid_au_ad_BRUTE.pickle")

## ___Read relevant files___

### All records

dict_pmid_au_ad = {pmid:[AU, AD]}

In [None]:
# Unpickle the table of all records
with dict_pmid_au_ad_file.open("rb") as handle:
  dict_pmid_au_ad = pickle.load(handle)

### pycountry

```Python
ci_pyco = {pmid:[first_AU, first_AD, alpha_3]}
```

In [None]:
# Unpickle the pycountry country info
with ci_pyco_file.open("rb") as handle:
  ci_pyco = pickle.load(handle)

### Nominatim output

```Python
ci_nomi = {pmid:[AU, AD, alpha3, {importance:[alpha_3s]}]}
```

In [None]:
# Unpickle the Nominatim country info
with ci_nomi_file.open("rb") as handle:
  ci_nomi = pickle.load(handle)

### Brute force and email search

```Python
ci_brut = {pmid:
            [AU, AD, 
              {"country":XX, "country_hist":XX, "subregion":XX, "email":XX}
            ]
          }
```

In [None]:
# Unpickle the brute force and email search results
with ci_brut_file.open("rb") as handle:
  ci_brut = pickle.load(handle)

## ___Build output table___

### Populate output dictionary with a3 values

In [None]:
# Layout of each ci_combo row:
# {pmid:
#   [AU, AD,                                   index=0, 1
#        final_a3,                                   2
#        pycountry_out,                              3
#        brute_force_current_country,                4
#        brute_force_historical_country,             5
#        brute_force_subregion-based_country,        6
#        email-based_country,                        7
#        nomi_out,                                   8
#   ]
# }
# Only records whose AD is not "NA" get a row. A column stays "" when the
# corresponding source has no value for that pmid; final_a3 is not filled here.

# Key in the brute force dictionary -> column index in the ci_combo row
brut_columns = {"country": 4, "country_hist": 5, "subregion": 6, "email": 7}

ci_combo = {}
for pmid, record in dict_pmid_au_ad.items():
  AU, AD = record[0], record[1]
  if AD == "NA":
    continue

  row = [AU, AD] + [""] * 7

  # pycountry alpha_3
  if pmid in ci_pyco:
    row[3] = ci_pyco[pmid][2]

  # Brute force current/historical/subregion and email-based countries
  if pmid in ci_brut:
    d_brut = ci_brut[pmid][2]
    for key, col in brut_columns.items():
      if key in d_brut:
        row[col] = d_brut[key]

  # Nominatim output (index 2 of the ci_nomi record)
  if pmid in ci_nomi:
    row[8] = ci_nomi[pmid][2]

  ci_combo[pmid] = row

### Determine consistency between pyco and brut_country

Establish a dictionary `ci_benchmark` where a record is included if the pycountry and brute force search countries are the same.

In [None]:
count_a3_pyco   = 0 # number of records with a3 in pycountry search
count_a3_curr   = 0 # in brute force current country search
count_a3_hist   = 0 # in brute force historical country search
count_pyco_curr = 0 # consistent between pyco and curr
count_pyco_hist = 0 # consistent between pyco and hist
count_curr_hist = 0 # has both curr and hist, should not happen

# A record enters ci_benchmark when its pycountry a3 equals the brute force
# current or historical a3.
ci_benchmark  = {} # {pmid:a3}
for pmid in ci_combo:
  a3_pyco = ci_combo[pmid][3]
  a3_curr = ci_combo[pmid][4]
  a3_hist = ci_combo[pmid][5]

  # Flags: an empty string means the source has no a3 for this record
  a3_pyco_flag = a3_pyco != ""
  a3_curr_flag = a3_curr != ""
  a3_hist_flag = a3_hist != ""

  # Count records per source
  if a3_pyco_flag:
    count_a3_pyco += 1

  if a3_curr_flag:
    count_a3_curr += 1

  if a3_hist_flag:
    count_a3_hist += 1

  # Consistent between pyco and brute force current
  if a3_pyco_flag and a3_curr_flag and a3_pyco == a3_curr:
    count_pyco_curr += 1
    ci_benchmark[pmid] = a3_pyco

  # Consistent between pyco and brute force historical
  if a3_pyco_flag and a3_hist_flag and a3_pyco == a3_hist:
    count_pyco_hist += 1
    ci_benchmark[pmid] = a3_pyco

  # Have both brute force current and historical, should not happen
  if a3_curr_flag and a3_hist_flag:
    count_curr_hist += 1

print("Total pyco:", count_a3_pyco)
print(f" curr:{count_a3_curr}, same:{count_pyco_curr}")
print(f" hist:{count_a3_hist}, same:{count_pyco_hist}")
print("With curr and hist:", count_curr_hist)

### Determine FP rates

Assuming that pyco, brut_country, and brut_historical_country are accurate.

## ___Combine pycountry and nominatim country info___

### Considerations

Obj format
- ci_combo = {pmid:[AU, AD, a3]}
- ci_nomi = {pmid:[AU, AD, {importance:[countries]} ]}

Note:
- Nominatim is supposed to run on records that do not have pycountry matches.
- But when I ran North America, I discovered some issues with the pycountry matching part, so there were records that were run through Nominatim but were later found to have pycountry matches.
- So I need to make sure pycountry match is prioritized when there is also a nominatim match.

In [None]:
# Number of entries in ci_combo and ci_nomi before merging
len(ci_combo), len(ci_nomi)

### Merging

In [None]:
# Go through ci_nomi and put records into ci_combo. A pmid that is already in
# ci_combo is left untouched, so an existing entry always takes priority over
# the Nominatim one.
for pmid, nomi_rec in ci_nomi.items():
  if pmid in ci_combo:
    continue

  # The {importance:[alpha_3s]} dictionary is the last element in both
  # ci_nomi record layouts documented in this notebook, so index from the end.
  AU, AD = nomi_rec[0], nomi_rec[1]
  ni_dict = nomi_rec[-1]

  # Nothing to assign when Nominatim returned no importance values
  if not ni_dict:
    continue

  # Countries with the highest importance value; the dictionary is not
  # guaranteed to be ordered by importance, so take the max key explicitly.
  a3 = ni_dict[max(ni_dict)]
  if len(a3) > 1:
    print(f"{pmid}:same importance - {a3}")

  # NOTE(review): rows created when ci_combo is first populated have 9 columns,
  # while this stores [AU, AD, a3] with a3 being a list - confirm which row
  # layout downstream code expects.
  ci_combo[pmid] = [AU, AD, a3]

len(ci_combo)

In [None]:
# Pickle ci_combo into the working directory
ci_combo_file = work_dir.joinpath("country_info-combo.pickle")
with ci_combo_file.open("wb") as handle:
  pickle.dump(ci_combo, handle)

In [None]:
# Load the ci_combo file back to make sure it is ok
with open(ci_combo_file, "rb") as f:
  ci_combo_reload = pickle.load(f)

# Show the size of the reloaded object (not the in-memory ci_combo), so the
# number reflects what was actually written to disk.
len(ci_combo_reload)

## ___Combine pycountry and nominatim country info___

### Considerations

Obj format
- ci_combo = {pmid:[AU, AD, a3]}
- ci_nomi = {pmid:[AU, AD, {importance:[countries]} ]}

Note:
- Nominatim is supposed to run on records that do not have pycountry matches.
- But when I ran North America, I discovered some issues with the pycountry matching part, so there were records that were run through Nominatim but were later found to have pycountry matches.
- So I need to make sure pycountry match is prioritized when there is also a nominatim match.

In [None]:
# Number of entries in ci_combo and ci_nomi before merging
len(ci_combo), len(ci_nomi)

### Merging

In [None]:
# Go through ci_nomi and put records into ci_combo. A pmid that is already in
# ci_combo is left untouched, so an existing entry always takes priority over
# the Nominatim one.
for pmid, nomi_rec in ci_nomi.items():
  if pmid in ci_combo:
    continue

  # The {importance:[alpha_3s]} dictionary is the last element in both
  # ci_nomi record layouts documented in this notebook, so index from the end.
  AU, AD = nomi_rec[0], nomi_rec[1]
  ni_dict = nomi_rec[-1]

  # Nothing to assign when Nominatim returned no importance values
  if not ni_dict:
    continue

  # Countries with the highest importance value; the dictionary is not
  # guaranteed to be ordered by importance, so take the max key explicitly.
  a3 = ni_dict[max(ni_dict)]
  if len(a3) > 1:
    print(f"{pmid}:same importance - {a3}")

  # NOTE(review): rows created when ci_combo is first populated have 9 columns,
  # while this stores [AU, AD, a3] with a3 being a list - confirm which row
  # layout downstream code expects.
  ci_combo[pmid] = [AU, AD, a3]

len(ci_combo)

In [None]:
# Pickle ci_combo into the working directory
ci_combo_file = work_dir.joinpath("country_info-combo.pickle")
with ci_combo_file.open("wb") as handle:
  pickle.dump(ci_combo, handle)

In [None]:
# Load the ci_combo file back to make sure it is ok
with open(ci_combo_file, "rb") as f:
  ci_combo_reload = pickle.load(f)

# Show the size of the reloaded object (not the in-memory ci_combo), so the
# number reflects what was actually written to disk.
len(ci_combo_reload)

### Spot check ci_nomi

In [None]:
# Count importance values that are shared by more than one country.
# Index 2 of each ci_nomi record is read as the {importance:[countries]} dict.
c = 0
for nomi_rec in ci_nomi.values():
  c += sum(1 for countries in nomi_rec[2].values() if len(countries) > 1)

print(f"total={len(ci_nomi)}, same importance={c}")


In [None]:
# Tally pmids by how many distinct importance values they have
cdict = {} # {num_ni: count}
for nomi_rec in ci_nomi.values():
  ni_count = len(nomi_rec[2]) # size of the nominatim importance dictionary
  cdict[ni_count] = cdict.get(ni_count, 0) + 1

cdict

In [None]:
# Among pmids with more than one importance value, report those where USA is
# not among the countries with the highest importance
c = 0
for pmid, nomi_rec in ci_nomi.items():
  ni_dict = nomi_rec[2]
  if len(ni_dict) <= 1:
    continue

  # Highest importance value for this record
  top_importance = max(ni_dict)
  if "USA" in ni_dict[top_importance]:
    continue

  print(f"{pmid}:{ni_dict}")
  print(f"AD={nomi_rec[1]}")
  c += 1

print(f"total={len(ci_nomi)}, usa < other={c}")

In [None]:
# Print a sample of records that have exactly two importance values
# Check these again:
'''
16656733
AD  =['Department of Biology, Queen Elizabeth College, (University of London), Campden Hill, London, W.8.']
nomi={0.4000099999999999: ['USA'], 0.20000999999999997: ['IND']}

16656734
AD  =['Department of Plant Physiology, Waite Agricultural Research Institute, Glen Osmond, South Australia.']
nomi={0.30000999999999994: ['USA'], 0.4000099999999999: ['IDN']}

16656755
AD  =['C.S.I.R.O. Division of Horticultural Research, Private Bag No. 1, Glen Osmond, South Australia.']
nomi={0.30000999999999994: ['USA'], 0.4000099999999999: ['IDN']}

16656757
AD  =['Institute of Biology, College of General Education (Kyoyo-gakubu), University of Tokyo, Komaba, Meguro, Tokyo.']
nomi={0.21000999999999995: ['USA'], 0.6600099999999999: ['JPN']}
'''
c = 0
for pmid, nomi_rec in ci_nomi.items():
  if len(nomi_rec[2]) != 2:
    continue

  print(pmid)
  print(f"AD  ={nomi_rec[1]}")
  print(f"nomi={nomi_rec[2]}\n")

  # The counter is tested before it is incremented, so up to 101 records print
  if c == 100:
    break
  c += 1