# __Step 7.3: Brute force search country names and email__

Goal:
- After using pycountry and nominatim, some countries are still not found for various reasons. For those with country name in the address string, will try to find them through brute force search.
- Also get country code top level domain (ccTLD) info to find countries.

Considerations:
- The search is very fast, also realize that this will be more accurate than the Nominatim search so decide that, when consolidate results, the order of preference will be:
  - pycountry search in 7_1
  - brute force search in 7_1e
  - consolidated nominatim results from 7_1d
- Because of the above, the brute-force search will be applied to ALL records.

## ___Setup___

### Module import

In conda env `base`

In [1]:
import pickle, pycountry, re
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
from multiprocessing import Pool
from tqdm import tqdm

### Key variables

In [2]:
# Reproducibility
seed = 20220609

# Setting working directory
proj_dir   = Path.home() / "projects/plant_sci_hist"
base_dir   = proj_dir / "7_countries"

work_dir   = base_dir / "7_3_brute_force_and_email"
work_dir.mkdir(parents=True, exist_ok=True)

dir71      = base_dir / "7_1_parse_countries"

# all records' pmid, AU, AD info
dict_pmid_au_ad_file = dir71 / "dict_pmid_AU_AD.pickle"

# consolidated nomi out file
# NOT DEFINED

# So PDF is saved in a format properly
mpl.rcParams['pdf.fonttype'] = 42
plt.rcParams["font.family"] = "sans-serif"

## ___Country info___

### Get current country code

Add a space in front of a3 names to reduce FP results, e.g.:
- 11432923 
  - "Centro de Investigacion sobre Fijacion de Nitrogeno, UNAM, Apartado Postal 565-A, Cuernavaca, Mor. Mexico." - Found "NAM"
- 11447000
  - 'UNIOESTE-Centro de Ciencias Agrarias, Rua Pernambuco 1777, 85960-000 Mal.Candido Rondon, PR Brazil.' - Found "EST".

In [3]:
countries = list(pycountry.countries)
cnames    = {}  # {name:name_a3} where name can be short or official name
a2_a3     = {}  # {a2:a3}, this is for subregion search

for country in countries:
  name_a2    = country.alpha_2
  name_a3    = country.alpha_3
  a2_a3[name_a2] = name_a3

  name_short = country.name

  cnames[" " + name_a3]    = name_a3
  cnames[name_short] = name_a3

  try:
    name_offic = country.official_name
    cnames[name_offic] = name_a3
  except AttributeError:
    #print("No official name:", name_short)
    pass

# Add a special case
cnames["Republic of China"] = "TWN"
cnames["Taiwan"] = "TWN"
cnames[" UK"] = "GBR"
cnames["Russia"] = "RUS"
cnames["Hong Kong"] = "HKG"


### Get past country code

See [this table](https://www.statcan.gc.ca/en/subjects/standard/sccai/2011/scountry-desc)

In [4]:
# Conversion for couple renamed countries and Taiwan
cnames_convert = {"BUR": "MMR", "ZAR": "COD"}

In [5]:
countries_hist = list(pycountry.historic_countries)
hnames         = {}  # {historical country name: a3}

for country in countries_hist:
  # the name in historical countries are the official names
  name_a3    = country.alpha_3
  name_offic = country.name
  name_short = name_offic.split(",")[0]

  if name_a3 in cnames_convert:
    name_a3 = cnames_convert[name_a3]

  hnames[" " + name_a3]    = name_a3
  hnames[name_offic] = name_a3
  hnames[name_short] = name_a3

### Get subregion to country info

In [6]:
subregion_list = list(pycountry.subdivisions)
snames         = {} # {subregion_name: a3}

for subregion in subregion_list:
  name_a2    = subregion.country_code
  name_a3    = a2_a3[name_a2]
  name_short = subregion.name 

  # Reqiure a space before
  snames[" " + name_short] = name_a3

### Read all records' pmid, AU, AD info

In [53]:
# {pmid:[AU, AD]}

with open(dict_pmid_au_ad_file, 'rb') as f:
  dict_pmid_au_ad = pickle.load(f)

In [54]:
len(dict_pmid_au_ad)

421276

In [55]:
for idx, pmid in enumerate(dict_pmid_au_ad):
  if idx == 3000:
    break

  if dict_pmid_au_ad[pmid][1] != "NA":
    print(pmid, dict_pmid_au_ad[pmid])

400957 [['Davidson WS', 'Walton DJ', 'Flynn TG'], ["Department of Biochemistry, Queen's University, Kingston, Ontario, Canada."]]
803110 [['Sagone AL Jr', 'Balcerzak SP', 'Metz EN'], ['Division of Hematology and Oncology, Ohio State University College of Medicine, Columbus 43210.']]
1279107 [['Moser O', 'Fuchs M', 'Pinck L', 'Stussi-Garaud C'], ['Institut de Biologie Moleculaire des Plantes du C.N.R.S. et Universite Louis Pasteur, Laboratoire de Virologie, Strasbourg, France.']]


### Add a dictionary to dict_pmid_au_ad

In [56]:
for pmid in dict_pmid_au_ad:
  if len(dict_pmid_au_ad[pmid]) != 2:
    print("ERR:", pmid, dict_pmid_au_ad[pmid])
    break

In [57]:
for pmid in dict_pmid_au_ad:
  dict_pmid_au_ad[pmid].append(
    {"country":"", "country_hist":"", "subregion":"", "email":""})

## ___Brute force search___

### Function

In [58]:
def find_country(add_str, ndict):
  '''
  Args:
    add_str (str): 1st author address
    ndict (dict): name dictionary {a3, short, or official name: a3}, can be
      for current/historical countries or subregions.
  return:
    a3_first (str): a3 country code for the 1st qualified name encountered
  '''
  # Search against current countries
  idxs = {} # {str_idx_where_country_is_found: [country, a3_code]}
  for country in ndict:
    if country in add_str:
      idxs[add_str.find(country)] = [country, ndict[country]]

  # For the 1st match
  a3_first = ""
  if idxs != {}:
      # first one found
    sorted_idxs = list(idxs.keys())
    sorted_idxs.sort()
    [_, a3_first] = idxs[sorted_idxs[0]]

  return a3_first

### Against current country

In [60]:
# Modify dict_pmid_au_ad to include a dictionary as the third element
#   {pmid:[AU, AD, 
#          {"country":XX, "country_hist":XX, "subregion":XX, "email":XX}]}
found_curr = 0
for pmid in tqdm(dict_pmid_au_ad):

  AD = dict_pmid_au_ad[pmid][1]
  if AD != "NA":
    add_str = AD[0]
    a3 = find_country(add_str, cnames)
    if a3 != "":
      found_curr += 1
      #print(a3)

    # populate dict
    dict_pmid_au_ad[pmid][2]["country"] = a3
  
print(found_curr)

100%|██████████| 421276/421276 [00:25<00:00, 16253.66it/s]

361242





In [61]:
dict_pmid_au_ad[pmid][2]

{'country': 'USA', 'country_hist': '', 'subregion': '', 'email': ''}

### Against historical country

In [62]:
found_hist = 0
for pmid in tqdm(dict_pmid_au_ad):

  AD = dict_pmid_au_ad[pmid][1]
  if AD != "NA":
    add_str = AD[0]
    a3      = find_country(add_str, hnames)
    if a3 != "":
      found_hist += 1
    # populate dict
    dict_pmid_au_ad[pmid][2]["country_hist"] = a3

print(found_hist)

100%|██████████| 421276/421276 [00:03<00:00, 130493.42it/s]

16573





In [67]:
for pmid in dict_pmid_au_ad:
  if dict_pmid_au_ad[pmid][2]['country_hist'] != '':
    print(dict_pmid_au_ad[pmid][2])
    break

{'country': 'FRA', 'country_hist': 'FXX', 'subregion': '', 'email': ''}


### Against subregions

In [64]:
found_subr = 0
for pmid in tqdm(dict_pmid_au_ad):

  AD = dict_pmid_au_ad[pmid][1]
  if AD != "NA":
    add_str = AD[0]
    a3      = find_country(add_str, snames)
    if country != "":
      found_subr += 1
    # populate dict
    dict_pmid_au_ad[pmid][2]["subregion"] = a3

print(found_subr)

100%|██████████| 421276/421276 [02:50<00:00, 2474.59it/s]

401426





In [69]:
for pmid in dict_pmid_au_ad:
  if dict_pmid_au_ad[pmid][2]['subregion'] != '':
    print(dict_pmid_au_ad[pmid][2])
    print(dict_pmid_au_ad[pmid][1])
    break

{'country': 'CAN', 'country_hist': '', 'subregion': 'JAM', 'email': ''}
["Department of Biochemistry, Queen's University, Kingston, Ontario, Canada."]


## ___Get country info based on emails___

- [Extract email from text](https://www.tutorialspoint.com/python_text_processing/python_extract_emails_from_text.htm)
- [Python email2country](https://pypi.org/project/email2country/): this relies on web services that can break when it cannot find things (e.g., use xyz@google.com)
- [Get top level domain to country mapping](https://en.wikipedia.org/wiki/Country_code_top-level_domain)
  - [Wikipedia ccTLD](https://en.wikipedia.org/wiki/Country_code_top-level_domain): This is problematic because the name is not in English so many are not found in ISO 3166.
  - [ICANN wiki](https://icannwiki.org/Country_code_top-level_domain): This is also problematic because a lot of country names are not consistent with ISO 3166 names (e.g., Republic of Congo in this doc is Republic of the Congo in pycountry)
  - [Wikipedia ISO 3166](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes): This page has country code AND ccTLD.


### Get country-code top-level domains

In [70]:
# Based on
#https://medium.com/aiguys/how-to-scrape-wikipedia-tables-straight-to-pandas-dataframe-in-3-lines-of-code-752e98a9e815
html   = "https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes"
tables = pd.read_html(html)

In [71]:
table_ccTLD = tables[0]
table_ccTLD.columns = ["Country", "Official", "Sovereignty", "Alpha-2", 
                       "Alpha-3", "Numeric", "Subdivision", "ccTLD"]
table_ccTLD.head(3)

Unnamed: 0,Country,Official,Sovereignty,Alpha-2,Alpha-3,Numeric,Subdivision,ccTLD
0,Afghanistan,The Islamic Republic of Afghanistan,UN member state,.mw-parser-output .monospaced{font-family:mono...,AFG,4,ISO 3166-2:AF,.af
1,Åland Islands,Åland,Finland,AX,ALA,248,ISO 3166-2:AX,.ax
2,Albania,The Republic of Albania,UN member state,AL,ALB,8,ISO 3166-2:AL,.al


In [72]:
dict_tld = {} # {tld: country}
tlds     = table_ccTLD["ccTLD"].tolist()
a3s      = table_ccTLD['Alpha-3'].tolist()

for idx, tld in enumerate(tlds):

  if tld.startswith("."):
    # Two cases '.bq .nl...' and '.gb .uk...' Deal with the 2nd case only
    if len(tld) > 3:
      print(tld) 
      if tld.startswith('.gb'):
        tld = '.uk'
      
    a3 = a3s[idx]
    dict_tld[tld] = a3

# Add domains
dict_tld[".edu"] = "USA"
dict_tld[".gov"] = "USA"
dict_tld[".mil"] = "USA"

.bq .nl [d]
.gb .uk [ad]


In [73]:
dict_tld['.kp'], dict_tld['.tw']

('PRK', 'TWN')

### Functions

In [74]:
# Based on:
#https://www.tutorialspoint.com/python_text_processing/python_extract_emails_from_text.htm
def email_to_country(add_str):
  '''
  Args:
    add_str (str): 1st author address
  return:
    a3 (str): alpha-3 code of country
    ccTLD (str): country code top level domain of the entry found in dict_tld
      or if not in dict_tld, the ccTLD of the last email in the list
    emails (str): all recovered email
  '''
  emails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", add_str)
  a3     = ""
  ccTLD  = ""
  if emails != []:
    # Get a3 for 1st email with country info
    for email in emails:
      ccTLD = email[email.rfind("."):]
      if ccTLD in dict_tld:
        a3 = dict_tld[ccTLD]
        break

  return a3, ccTLD, emails


### Get country from email

In [75]:
found_email = 0
found_qua   = 0  # not only has email but has country info
dict_emails = {} # {pmid: emails}
tld_no_a3   = {} # {tld: count}
for pmid in tqdm(dict_pmid_au_ad):

  AD = dict_pmid_au_ad[pmid][1]  
  if AD != "NA":
    add_str    = AD[0]
    a3, tld, emails = email_to_country(add_str)
    if emails != []:
      found_email += 1
      dict_emails[pmid] = [tld, emails]
      
      if a3 != "":
        found_qua += 1
      else:
        if tld not in tld_no_a3:
          tld_no_a3[tld] = [1,{pmid:emails}]
        else:
          tld_no_a3[tld][0] += 1
          tld_no_a3[tld][1] = {pmid:emails}

    dict_pmid_au_ad[pmid][2]["email"] = a3

print(found_email, found_qua)

100%|██████████| 421276/421276 [00:02<00:00, 180083.71it/s]

90799 72640





In [76]:
for pmid in dict_pmid_au_ad:
  if dict_pmid_au_ad[pmid][2]['email'] != '':
    print(dict_pmid_au_ad[pmid][2])
    break

{'country': 'JPN', 'country_hist': '', 'subregion': '', 'email': 'JPN'}


In [77]:
for tld in tld_no_a3:
  counts = tld_no_a3[tld][0]
  for pmid in tld_no_a3[tld][1]:
    emails = tld_no_a3[tld][1][pmid]
    print("---")
    print(tld, counts, emails[:2])
    print(pmid, dict_pmid_au_ad[pmid][1][0])
    break

---
.aut 1 ['anna.koltunow@adl.hort.csiro.aut']
8742336 Division of Horticulture, Commonwealth Scientific and Industrial Research Organization, Adelaide, South Australia anna.koltunow@adl.hort.csiro.aut.
---
.com 16275 ['nirmalbabu30@hotmail.com']
27108333 All India Coordinated Research Project on Spices, Indian Institute of Spices Research, Kozhikode, 673 012, Kerala, India. nirmalbabu30@hotmail.com.
---
.ip 2 ['oikawa@biology.tohoku.ac.ip']
16945093 Graduate School of Life Sciences, Tohoku University, 6-3 Aoba, Sendai 980-8578, Japan. oikawa@biology.tohoku.ac.ip
---
.j 2 ['shirasaw@kazusa.or.j']
22294450 Department of Plant Genome Research, Kazusa DNA Research Institute, 2-6-7 Kazusa-Kamatari, Kisarazu, Chiba 292-0818, Japan. shirasaw@kazusa.or.j
---
.org 1107 ['t.carruthers@kew.org']
33352127 Royal Botanic Gardens Kew, Richmond, London TW9 3AE, UK. Electronic address: t.carruthers@kew.org.
---
.biologie 2 ['arhah@botanik.biologie']
15342780 Department of Plant Biology, Carnegie Inst

## ___Export results___

In [78]:
# {pmid: [ccTLD, emails]}

dict_emails_file = work_dir / "dict_pmid_ccTLD_emails.pickle"
with open(dict_emails_file, "wb") as f:
  pickle.dump(dict_emails, f)

In [79]:
# dict_pmid_au_ad populated after brute force and email search

dict_pmid_au_ad_BRUTE_file = work_dir / "dict_pmid_au_ad_BRUTE.pickle"
with open(dict_pmid_au_ad_BRUTE_file, "wb") as f:
  pickle.dump(dict_pmid_au_ad, f)

## ___Testing___

### Test brute force search

Look at the test results, most have:
- No country info but with city and zip codes
- Mispelling
- Fused words
- Only insitutional info

Some of the above have email address. So should explore using email address to parse info also.

In [None]:
# Read test file
test_file = dir71 / "country_info-nominatim_as_NF.pickle"
with open(test_file, "rb") as f:
  test_nf = pickle.load(f)

In [None]:
# Get 1000 records
test_nf_1k = {}
for idx, pmid in enumerate(test_nf):
  if idx == 1e3:
    break
  test_nf_1k[pmid] = test_nf[pmid]

In [None]:

for pmid in tqdm(test_nf_1k):
  add_str = test_nf_1k[pmid][1][0]
  country = find_country(add_str, cnames)
  if country == "":
    print(pmid, [add_str])

100%|██████████| 1000/1000 [00:00<00:00, 12238.21it/s]

10077500 ['Department of Plant Biology, Southern Illinois University, Carbondale, Illinois 62901-6509.']
10318702 ['Laboratoire de Biogenese Membranaire, Unite Mixte de Recherche-5544-Centre National de la Recherche Scientifique (CNRS) (B.S.-B., P.V., L.M.-P., C.C., P.M.).']
10447885 ['Host-Parasite Interactions Section, Laboratory of Intracellular Parasites.']
10465386 ['Centre for Reproductive Biology, Swedish University of Agricultural Sciences, Uppsala. Asheber.Swealem@bbsrc.ac.uk']
10469156 ['Institut de Biologie Moleculaire des Plantes, CNRS, Strasbourg, Frnace.']
10474289 ['Katedra i Zaklad Bromatologii, Akademia Medyczna we Wroclawiu.']
10476083 ['Department of Forest Genetics and Plant Physiology, Swedish University of Agricultural Sciences, Umea.']
10480390 ['Department of Forest Genetics, Uppsala Genetic Centre, Swedish University of Agricultural Sciences.']
10480393 ['Department of Plant Biology, Swedish University of Agricultural Sciences, Uppsala.']
10482013 ['Department 




### Test email2country

In [64]:
from email2country import email2country

email2country('shius@msu.edu')

'United States'

In [65]:
email2country('xyz@blah.hk')

'Hong Kong'

In [67]:
email2country('xyz@google.com')

SSLError: HTTPSConnectionPool(host='ipvigilante.com', port=443): Max retries exceeded with url: /142.251.32.14/country_iso_code (Caused by SSLError(CertificateError("hostname 'ipvigilante.com' doesn't match either of 'cloudflare-dns.com', '*.cloudflare-dns.com', 'one.one.one.one', '1.0.0.1', '1.1.1.1', '162.159.36.1', '162.159.46.1', '2606:4700:4700:0:0:0:0:1001', '2606:4700:4700:0:0:0:0:1111', '2606:4700:4700:0:0:0:0:64', '2606:4700:4700:0:0:0:0:6400'")))

### Country name containing Congo

In [None]:
for country in cnames:
    if country.find("Congo") != -1:
        print(country)

Congo, The Democratic Republic of the
Congo
Republic of the Congo


In [None]:
# No longer needed
# Originally parsing from:
#https://icannwiki.org/Country_code_top-level_domain

# Treat row 1 as column names
#https://stackoverflow.com/questions/52516199/convert-first-row-of-pandas-dataframe-to-column-name
#https://sparkbyexamples.com/pandas/pandas-drop-rows-from-dataframe/

table_ccTLD.columns = table_ccTLD.iloc[0]
table_ccTLD = table_ccTLD.drop([0])
table_ccTLD.head()

Unnamed: 0,ccTLD,Entity,Registry Operator,Notes
1,.ac,Ascension Island,Nic.ac,
2,.ad,Andorra,Andorra Telecom,
3,.ae,United Arab Emirates,UAEnic,
4,.af,Afghanistan,AfgNIC,
5,.ag,Antigua and Barbuda,Nic AG,
