# __Step 7.4: Consolidate all outputs__

Goal
- Combine the information from:
  - pycountry run
  - Nominatim outputs
  - Brute force search and email search'
- Generate a country info matrix and save as tab delimited file
- Generate a {pmid:country} dictionary based on the considerations below.

Considerations:
- 1st order of priority:
  - pycountry
  - Brute force current country
  - Brute force historical country
- For records without country info
  - Assess false positive rates for:
    - Brute force subregion-based country
    - Email-based country
    - Nominatime based country
  - Determine 2nd order of priority and assign country

Final stat
- With country info: 330187
- Without: 71239

## ___Set up___

### Module import

In conda env `base`

In [1]:
import pickle, glob
from pathlib import Path
from tqdm import tqdm
from multiprocessing import Pool

### Key variables

In [2]:
# Reproducibility
seed = 20220609

# Setting working directory
proj_dir   = Path.home() / "projects/plant_sci_hist"
base_dir   = proj_dir / "7_countries"
work_dir   = base_dir / "7_4_consolidate_all"
work_dir.mkdir(parents=True, exist_ok=True)

dir71      = base_dir / "7_1_parse_countries"
dir72      = base_dir / "7_2_nomi_consolidate"
dir73      = base_dir / "7_3_brute_force_and_email"

# all records
dict_pmid_au_ad_file = dir71 / "dict_pmid_AU_AD.pickle"

# country info 
ci_pyco_file  = dir71 / "country_info-pycountry.pickle"
ci_nomi_file  = dir72 / "country_info-nominatim_all.pickle"
ci_brut_file  = dir73 / "dict_pmid_au_ad_BRUTE.pickle"

## ___Read relevant files___

### All records

dict_pmid_au_ad = {pmid:[AD, AU]}

In [3]:
with open(dict_pmid_au_ad_file, "rb") as f:
  dict_pmid_au_ad = pickle.load(f)

### pycountry

```Python
ci_pyco = {pmid:[first_AU, first_AD, alpha_3]}
```

In [4]:
with open(ci_pyco_file, "rb") as f:
  ci_pyco = pickle.load(f)

In [5]:
c = 0
for i in ci_pyco:
    print(i, ci_pyco[i])
    c += 1
    if c == 10: break

61 [['Haveman J', 'Lavorel J'], 'NA', 'NA']
67 [['Fluck RA', 'Jaffe MJ'], 'NA', 'NA']
283 [['Siddiqui KA', 'Banerjee AK'], 'NA', 'NA']
380 [['Ogiso T', 'Noda T', 'Sako Y', 'Kato Y', 'Aoyama M'], 'NA', 'NA']
385 [['Leung KH', 'Hinkle PC'], 'NA', 'NA']
466 [['Nakamura H', 'Suganuma A', 'Greenberg J'], 'NA', 'NA']
543 [['Benohr HC', 'Waller HD'], 'NA', 'NA']
580 [['Sanders TH', 'Pattee HE', 'Singleton JA'], 'NA', 'NA']
836 [['Konoplytska OL', 'Sytnyanska NP'], 'NA', 'NA']
990 [['Gregory P', 'Bradbeer JW'], 'NA', 'NA']


In [6]:
for i in ci_pyco:
  a3 = ci_pyco[i]
  if len(a3) != 3:
    print(a3)

### Nomnatim output

```Python
ci_nomi = {pmid:[AU, AD, {importance:[alpha_3s]}]}
```

Parse the above to generate:
```Python
ci_nomi_top = {pmid:[alpha_3s]}
```

Total:72401, multiple a3s:5727

In [7]:
with open(ci_nomi_file, "rb") as f:
  ci_nomi = pickle.load(f)

In [8]:
# Populate another dictionary with top country
ci_nomi_top = {} # {pmid:a3}
c_multi = 0
for pmid in ci_nomi:
  idict = ci_nomi[pmid][2]    # importance dictionary
  ilist = list(idict.keys())  # importance value list
  ilist.sort()                # sort importance
  a3s   = idict[ilist[-1]]    # a3s of the most important
  if len(a3s) > 1:
    c_multi += 1

  ci_nomi_top[pmid] = a3s

print(f"Total:{len(ci_nomi_top)}, multiple a3s:{c_multi}")

Total:72401, multiple a3s:5727


In [9]:
c = 0
for i in ci_nomi_top:
    print(i, ci_nomi_top[i])
    c += 1
    if c == 10: break

16668554 ['USA']
28677478 ['USA']
18424616 ['SGP']
19065767 ['BRA']
25901651 ['ESP']
24346792 ['BRA']
33294018 ['NA']
8376342 ['USA']
26417108 ['DEU']
29301956 ['USA']


### Brute force and email search

```Python
ci_brut = {pmid:
            [AU, AD, 
              {"country":XX, "country_hist":XX, "subregion":XX, "email":XX}
            ]
          }
```

In [10]:
with open(ci_brut_file, "rb") as f:
  ci_brut = pickle.load(f)

In [11]:
c = 0
for i in ci_brut:
  if len(ci_brut[i]) == 3:
    if ci_brut[i][2]['country'] != "":
       c += 1

print(c)

361242


## ___Consolidate info into one dictionary___

### Populate output dictionary with a3 values

In [12]:
# {pmid:
#   [AU, AD,
#        final_a3                              index=2
#        pycountry_out,                              3
#        brute_force_current_country,                4
#        brute_force_historical_country,             5
#        brute_force_subregion-based_country,        6
#        email-based_country,                        7
#        nomi_out,                                   8
#   ]
# }

ci_combo = {} 
for pmid in dict_pmid_au_ad:
  AU = dict_pmid_au_ad[pmid][0]
  AD = dict_pmid_au_ad[pmid][1]
  if AD != "NA":
    ci_combo[pmid] = [AU, AD] + [""]*7

    if pmid in ci_pyco:
      ci_combo[pmid][3] = ci_pyco[pmid][2]
    
    if pmid in ci_brut:
      d_brut = ci_brut[pmid][2]
      if 'country' in d_brut:
        ci_combo[pmid][4] = d_brut['country']
      if 'country_hist' in d_brut:
        ci_combo[pmid][5] = d_brut['country_hist']
      if 'subregion' in d_brut:
        ci_combo[pmid][6] = d_brut['subregion']
      if 'email' in d_brut:
        ci_combo[pmid][7] = d_brut['email']

    if pmid in ci_nomi_top:
      ci_combo[pmid][8] = ci_nomi_top[pmid]

### Determine consistency between pyco and brut_country

(total_both_non_NA:consistent_a3)
- pycountry vs brute force country = (328312, 318902, '97.1%')
- pycountry vs brute force subregi = (229221, 97042, '42.3%')
- pycountry vs email = (63600, 59741, '93.9%')
- brute force country vs subregion = (254596, 101472, '39.8%')
- brute force country vs email = (69993, 64681, '92.4%')
- brute force region vs email = (47119, 22582, '47.9%')

Considerations:
- pycountry and bf country has the highest degree of agreement, they are the two best.
  - pycountry has a slightly better agreement to email and subregion compared to bf country, but not by much. 
- email has a strong agreement with pycountry and bf country
  - But slightly lower agreement compared to that between pycountry and bf country, so considered 3rd best.
- subregion has low agreement with all three
  - info only as secondary evidence for verification purpose
- Nominatim result: see next section.

In [13]:
# count # of non NA entries for each type of country info
def count_non_na(ci_combo, idx):
  count = 0
  for pmid in ci_combo:
    if ci_combo[pmid][idx] != "":
      count += 1
  return count

In [14]:
print(count_non_na(ci_combo, 3)) # pcountry
print(count_non_na(ci_combo, 4)) # brute force country
print(count_non_na(ci_combo, 5)) #             historical
print(count_non_na(ci_combo, 6)) #             subregion
print(count_non_na(ci_combo, 7)) #             email
print(count_non_na(ci_combo, 8)) # nomimatim

329025
361242
16573
279839
72640
72401


In [15]:
def check_2_consistency(ci_combo, idx1, idx2):
  count_both_non_NA = 0
  count_consistent = 0
  for pmid in ci_combo:
    val_idx1 = ci_combo[pmid][idx1]
    val_idx2 = ci_combo[pmid][idx2]

    if val_idx1 != "" and val_idx2 != "":
      count_both_non_NA += 1
      if val_idx1 == val_idx2:
        count_consistent += 1

  p_consistent = str(count_consistent/count_both_non_NA*100)
  idx_decimal  = p_consistent.find(".")
  p_consistent = f'{p_consistent[:idx_decimal+2]}%'

  return count_both_non_NA, count_consistent, p_consistent

In [16]:
print(check_2_consistency(ci_combo, 3, 4)) # pycountry vs brute force country
print(check_2_consistency(ci_combo, 3, 6)) # pycountry vs brute force subregion
print(check_2_consistency(ci_combo, 3, 7)) # pycountry vs email
print(check_2_consistency(ci_combo, 4, 6)) # brute force country vs subregion
print(check_2_consistency(ci_combo, 4, 7)) # brute force country vs email
print(check_2_consistency(ci_combo, 6, 7)) # brute force subregion vs email


(328312, 318902, '97.1%')
(229221, 97042, '42.3%')
(63600, 59741, '93.9%')
(254596, 101472, '39.8%')
(69993, 64681, '92.4%')
(47119, 22582, '47.9%')


### Determine consistency between brute force results and nominatim

Pycountry is not compared because nominatim run is only done for those without pycountry results.

- brute force country vs nomi: (32930, 15349)
- brute force subregi vs nomi: (50618, 21303)
- email vs nomi: (9040, 4178)

So nominatim results seems to have a very high error rate. Only use this info when no other info is available.

In [17]:
def check_consistency_with_nomi(ci_combo, idx1):
  count_both_non_NA = count_consistent = 0
  for pmid in ci_combo:
    val_idx1 = ci_combo[pmid][idx1]
    val_nomi = ci_combo[pmid][8]

    if val_idx1 != "" and val_nomi != "":
      count_both_non_NA += 1

      # Note that there can be multiple, as long as one is consistent, count it
      if val_idx1 in val_nomi:
        count_consistent += 1

  p_consistent = str(count_consistent/count_both_non_NA*100)
  idx_decimal  = p_consistent.find(".")
  p_consistent = f'{p_consistent[:idx_decimal+2]}%'  

  return count_both_non_NA, count_consistent, p_consistent

In [18]:
print(check_consistency_with_nomi(ci_combo, 4)) # brute force country vs nomi
print(check_consistency_with_nomi(ci_combo, 6)) # brute force subregi vs nomi
print(check_consistency_with_nomi(ci_combo, 7)) # email vs nomi

(32930, 15349, '46.6%')
(50618, 21303, '42.0%')
(9040, 4178, '46.2%')


## ___Establish final a3___

### Considerations

Based on the analysis above, the quality of info, from best to worst, is:
- pycountry and brute force country
- email
- subregion
- nominatim

Rules:
```Python
for a pmid where pyc, bfc, hist, or email is not "":
  if pyc == bfc:                                     # 321553
    set pyc, confidence=3
  elif pyc == email or bfc == email:                 # 34738
    set email, confidence=3
  elif pyc == subr or bfc == subr or email == subr:  # 4852
    set subr, confidence=3
  elif pyc in nomi or bfc in nomi or email in nomi:  # 0, 76, 3236
    set nomi, confidence=3
  elif pyc != "":
    set pyc, confidence=2
  elif bfc != "":
    set bfc, confidence=2
  elif email != "":
    set email, confidence=2
  elif subr in nomi:
    set subr, confidence=2
  elif nomi != "":
    set nomi, confidence=1
  elif subr != "":
    set subr, confidence=1
  else:
    Unknown country
```

Total_with_c: 364465 not_found: 36961

In [19]:
# I don't get why this happens.
for pmid in ci_combo:
  [pyc, bfc, hist, subr, email, nomi] = ci_combo[pmid][3:]
  if pyc != "" or bfc != "" or hist != "" or email != "":
    if len(pyc) == 2 or len(bfc) == 2 or len(hist) == 2 or len(email) == 2:
      print(pmid, [pyc, bfc, hist, email])

25548975 ['NA', '', '', 'DEU']


In [20]:
ci_combo['25548975'][3] = ''

In [21]:
ct1 = ct2 = c_nf1 = c_nf2 = c_nf3 = c_nf4 = 0

for pmid in ci_combo:
  a3   = "na"
  confidence = 0
  [pyc, bfc, hist, subr, email, nomi] = ci_combo[pmid][3:]

  # Must have country info from pyc, bfc, hist, or email
  if pyc != "" or bfc != "" or hist != "" or email != "":
    ct1 += 1
    if pyc == bfc:
      a3   = pyc
      confidence = 3
    elif pyc == email or bfc == email:
      a3 = email
      confidence = 3
    elif pyc == subr or bfc == subr or email == subr:
      a3 = subr
      confidence = 3
    elif pyc in nomi or bfc in nomi or email in nomi:
      if pyc in nomi:
        a3 = pyc
      elif bfc in nomi:
        a3 = bfc
      elif email in nomi:
        a3 = email
      else:
        a3 = subr
      confidence = 3
    elif pyc != "" or bfc != "" or email != "":
      if pyc != "":
        a3 = pyc
      elif bfc != "":
        a3 = bfc
      elif email != "":
        a3 = email
      confidence = 2
    elif subr in nomi:
      a3 = subr
      confidence = 1
    elif nomi != "":
      a3 = nomi
      confidence = 1
    elif subr != "":
      a3 = subr
      confidence=1
    else:
      c_nf2 += 1
  else:
    c_nf1 += 1

  if a3 == "":
    a3 = "na"
    c_nf3 += 1
  elif a3 == "na":
    c_nf4 += 1
  else:
    ct2 += 1

  ci_combo[pmid][2] = [a3, confidence]

In [None]:
print("Total with pyc, bfc, hist, email:", ct1)
print(" With c:", ct1-c_nf2)
print(" W/O c :", c_nf1+c_nf2, "=", c_nf4)
print("a3_final is empty:", c_nf3)
print("a3_final is na:", c_nf4)
print("ci_combo total:", len(ci_combo))


In [None]:
len(ci_combo)

### Generaete output

In [None]:
file_ci_combo = work_dir / 'country_info_combo.pickle'
with open(file_ci_combo, "wb") as f:
  pickle.dump(ci_combo, f)

In [None]:
# PMID /t a3
file_ci_final_a3 = work_dir / 'country_info_final_a3.txt'

with open(file_ci_final_a3, "w") as f:
  c_f  = 0
  c_nf = 0
  f.write('PMID\tA3\tConfidence\n')
  for pmid in ci_combo:
    [a3, conf] = ci_combo[pmid][2]
    if a3 != "na":
      f.write(f"{pmid}\t{a3}\t{conf}\n")
      c_f += 1
    else:
      c_nf += 1

print(c_f, c_nf)