# Use the address and university to find the country

### Import the required libraries

In [29]:
from geopy.geocoders import Nominatim
from time import time, sleep
import pandas as pd

### Declaring a list of countries for quick matching

In [30]:
# Lookup table: text fragment that may appear in an address -> canonical country name.
# Insertion order matters: get_country_from_list returns the first key that matches,
# so the aliases are listed before the plain country names.
base_countries = {
    # Aliases, abbreviations and regions mapped onto a canonical country name.
    'Republic of Korea': 'South Korea',
    'Taiwan': 'China',
    'Hong Kong': 'China',
    'UAE': 'United Arab Emirates',
    'New York': 'United States',
    'USA': 'United States',
    'UK': 'United Kingdom',

    # Country names that map to themselves.
    **{name: name for name in (
        'United States', 'Germany', 'Italy',
        'United Kingdom', 'France', 'Canada',
        'Brazil', 'Australia', 'Spain', 'Russia',
        'India', 'China', 'Japan', 'South Korea',
        'Mexico', 'Netherlands', 'Switzerland',
        'Sweden', 'Belgium', 'Austria', 'Norway',
        'Denmark', 'Finland', 'Greece', 'Portugal',
        'New Zealand', 'Ireland', 'Poland',
        'Czech Republic', 'Turkey', 'South Africa',
        'Argentina', 'Hungary', 'Chile', 'Singapore',
        'Israel', 'Thailand', 'Malaysia', 'Indonesia',
        'Ukraine', 'Vietnam', 'Philippines', 'Egypt',
        'Saudi Arabia', 'United Arab Emirates',
        'Pakistan', 'Iran', 'Iraq', 'Kuwait', 'Qatar',
        'Nigeria', 'Kenya', 'Morocco', 'Algeria',
        'Tunisia', 'Ghana', 'Ethiopia', 'Cameroon',
        'Uganda', 'Senegal', 'Zimbabwe', 'Zambia',
        'Tanzania', 'Botswana', 'Namibia', 'Mauritius',
        'Madagascar', 'Angola', 'Mozambique', 'Rwanda',
        'Ivory Coast', 'Burkina Faso', 'Uzbekistan',
        'Kazakhstan', 'Turkmenistan', 'Azerbaijan',
        'Georgia', 'Armenia', 'Belarus', 'Moldova',
        'Kyrgyzstan', 'Tajikistan', 'Latvia',
        'Lithuania', 'Estonia', 'Slovakia', 'Slovenia',
        'Croatia', 'Bulgaria', 'Romania', 'Serbia',
        'Bosnia and Herzegovina', 'Montenegro',
        'North Macedonia', 'Albania', 'Kosovo',
        'Cyprus', 'Malta', 'Iceland', 'Luxembourg',
        'Liechtenstein', 'Monaco', 'Andorra', 'San Marino',
        'Vatican',
    )},
}

### Define function to find country

In [31]:
from geopy.exc import GeocoderUnavailable

def get_country_from_list(address, countries=None):
    """Return the canonical country whose name or alias occurs in ``address``.

    Keys of the lookup table are matched case-insensitively as whole words,
    so short aliases no longer fire inside longer words ('UK' in "Ukraine",
    'USA' in "Lausanne", 'India' in "Indiana").

    Parameters
    ----------
    address : str
        Free-form text such as an affiliation or a publisher location.
        Anything that is not a non-empty string yields ``None``.
    countries : dict, optional
        Mapping ``text fragment -> canonical country``. Defaults to the
        notebook-level ``base_countries`` table.

    Returns
    -------
    str or None
        The value of the first key (in the mapping's iteration order) found
        in ``address``, or ``None`` when no key matches.
    """
    import re  # local import keeps this cell runnable on its own

    if not isinstance(address, str) or not address:
        return None
    if countries is None:
        countries = base_countries

    for key_country, country in countries.items():
        # Lookarounds instead of \b so keys are only matched when they are not
        # glued to other word characters on either side.
        pattern = rf"(?<!\w){re.escape(key_country)}(?!\w)"
        if re.search(pattern, address, flags=re.IGNORECASE):
            return country
    return None

def get_country(address, retries=3, delay=1):
    """Resolve ``address`` to a country name.

    The static lookup table is consulted first via ``get_country_from_list``;
    only when it has no match is the Nominatim geocoder queried.

    Parameters
    ----------
    address : str
        Free-form address or affiliation text. Blank or non-string input
        yields ``None`` without contacting the geocoder.
    retries : int, optional
        Maximum number of geocoder attempts (default 3).
    delay : float, optional
        Seconds to wait after a failed attempt (default 1).

    Returns
    -------
    str or None
        The country from the lookup table, otherwise the last comma-separated
        component of the geocoder's formatted address, or ``None`` when nothing
        was found or every attempt failed.
    """
    # Local import: timeouts are a transient failure too and should be retried
    # instead of aborting a long-running batch.
    from geopy.exc import GeocoderTimedOut

    if not isinstance(address, str) or not address.strip():
        return None

    country = get_country_from_list(address)
    if country:
        return country

    # If not, use the geocoder to get the country
    geolocator = Nominatim(user_agent="address_to_country_converter")
    for _ in range(retries):
        try:
            location = geolocator.geocode(address, language="en")
        except (GeocoderUnavailable, GeocoderTimedOut):
            # Service unreachable or too slow: wait, then retry
            print("Sleep")
            sleep(delay)
            continue
        if location is None:
            return None
        # The formatted address ends with the country name.
        return location.address.split(",")[-1].strip()
    print("Service unavailable")
    return None

## Read dataset

In [32]:
import unicodedata

# Load the articles table.
df = pd.read_csv('slam_articles.csv')

# For each text column, keep the distinct truthy values as strings
# (missing values are truthy and become the string 'nan', removed below).
publishers, publisher_locations, universities = (
    {str(value) for value in df[column] if value}
    for column in ("publisher", "publisher_location", "publisher_affiliation")
)

# Drop placeholder strings that carry no usable information.
for cur_set in (publishers, publisher_locations, universities):
    cur_set.difference_update({'nan', ',', 'Unknown', 'unknown'})

print(f"publishers count: {len(publishers)}")
print(f"publisher locations count: {len(publisher_locations)}")
print(f"universities count: {len(universities)}")

publishers count: 1561
publisher locations count: 104
universities count: 7680


### Find countries for publishers (using only the base list)

In [34]:
# Publisher names are matched against the static table only; no geocoder calls here.
publishers_dict = {}

N = len(publishers)
start = time()
for idx, publisher in enumerate(publishers):
    # Refresh the single-line progress indicator every 20 items.
    if idx % 20 == 0:
        percent = 100 * idx / N
        elapsed = time() - start
        print(f"\r{idx}/{N} ({percent:.1f}%): {elapsed:.3f} sec ", end='')
    publishers_dict[publisher] = get_country_from_list(publisher)

elapsed = time() - start
print(f"\r{N}/{N} (100.0%): {elapsed:.3f} sec ")

1561/1561 (100.0%): 0.099 sec 


In [50]:
# Spot-check a handful of publisher -> country pairs ('-' marks an unresolved entry).
sample = list(publishers_dict.items())[10:15]
for pub, loc in sample:
    shown = loc or '-'
    print(f"{pub}:\n\t{shown}")

Informa UK Limited:
	United Kingdom
Universidade da Beira Interior:
	-
Politeknik Negeri Bandung:
	-
Korean Geomorphological Association:
	-
Ghana Institution of Engineering:
	Ghana


### Find countries for publisher_locations

In [47]:
# Publisher locations go through the full resolver (static table first, then geocoder).
publisher_locations_dict = {}

N = len(publisher_locations)
start = time()
for idx, place in enumerate(publisher_locations):
    # Refresh the single-line progress indicator every 20 items.
    if idx % 20 == 0:
        percent = 100 * idx / N
        elapsed = time() - start
        print(f"\r{idx}/{N} ({percent:.1f}%): {elapsed:.3f} sec ", end='')
    publisher_locations_dict[place] = get_country(place)

elapsed = time() - start
print(f"\r{N}/{N} (100.0%): {elapsed:.3f} sec ")

104/104 (100.0%): 36.070 sec 


In [49]:
# Spot-check the first few location -> country pairs ('-' marks an unresolved entry).
sample = list(publisher_locations_dict.items())[:5]
for pub, loc in sample:
    shown = loc or '-'
    print(f"{pub}:\n\t{shown}")

Botucatu, São Paulo, Brasil:
	Brazil
Reston, Virginia:
	United States
Southampton UK:
	United Kingdom
Fort Collins, CO:
	United States
Boston:
	United States


### Find countries for universities

In [51]:
# Affiliations go through the full resolver (static table first, then geocoder).
universities_dict = {}

N = len(universities)
start = time()
for idx, affiliation in enumerate(universities):
    # Refresh the single-line progress indicator every 20 items.
    if idx % 20 == 0:
        percent = 100 * idx / N
        elapsed = time() - start
        print(f"\r{idx}/{N} ({percent:.1f}%): {elapsed:.3f} sec ", end='')
    universities_dict[affiliation] = get_country(affiliation)

elapsed = time() - start
print(f"\r{N}/{N} (100.0%): {elapsed:.3f} sec ")

220/7680 (2.9%): 26.938 sec Sleep
Sleep
Sleep
Service unavailable
460/7680 (6.0%): 71.320 sec Sleep
Sleep
Sleep
Service unavailable
2260/7680 (29.4%): 310.764 sec Sleep
Sleep
Sleep
Service unavailable
3800/7680 (49.5%): 524.334 sec Sleep
Sleep
Sleep
Service unavailable
Sleep
Sleep
Sleep
Service unavailable
3900/7680 (50.8%): 561.292 sec Sleep
Sleep
Sleep
Service unavailable
3980/7680 (51.8%): 581.561 sec Sleep
Sleep
Sleep
Service unavailable
4460/7680 (58.1%): 649.857 sec Sleep
Sleep
Sleep
Service unavailable
4560/7680 (59.4%): 676.990 sec Sleep
Sleep
Sleep
Service unavailable
6340/7680 (82.6%): 937.562 sec Sleep
Sleep
Sleep
Service unavailable
7320/7680 (95.3%): 1069.974 sec Sleep
7680/7680 (100.0%): 1117.468 sec 


In [58]:
# Spot-check a handful of affiliation -> country pairs ('-' marks an unresolved entry).
sample = list(universities_dict.items())[45:50]
for pub, loc in sample:
    shown = loc or '-'
    print(f"{pub}:\n\t{shown}")

Tropics and Subtropics Group, Institute of Agricultural Engineering, University of Hohenheim, 70599 Stuttgart, Germany:
	Germany
Ecole Nationale de l'Aviation Civile:
	France
Jilin University, College of Computer Science and Technology, Changchun, China:
	China
University of Castilla-La Mancha, Spain:
	Spain
School of Electrical Engineering and KI-AI, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea:
	South Korea


### Count the entries with a resolved (non-None) country

In [60]:
# Share of affiliations for which a country was resolved.
total = len(universities_dict)
not_none = sum(1 for location in universities_dict.values() if location)

print(not_none, '/', total, f"({(100*not_none/total):.1f}%)")

6497 / 7680 (84.6%)


In [62]:
# Distinct countries found per source (None marks unresolved entries).
for mapping in (publishers_dict, publisher_locations_dict, universities_dict):
    print(set(mapping.values()))

{'Germany', 'Cyprus', 'Slovenia', 'Azerbaijan', 'Bulgaria', 'Georgia', 'India', 'Croatia', 'Australia', None, 'Vietnam', 'South Africa', 'Turkey', 'Japan', 'Malaysia', 'Austria', 'Estonia', 'Romania', 'Belarus', 'Finland', 'Argentina', 'Belgium', 'Serbia', 'China', 'Netherlands', 'Switzerland', 'Ghana', 'Iran', 'Norway', 'Canada', 'Latvia', 'Poland', 'Mexico', 'Indonesia', 'Singapore', 'Egypt', 'United States', 'Chile', 'New Zealand', 'Denmark', 'Russia', 'Lithuania', 'Brazil', 'United Kingdom'}
{'Germany', 'India', 'Italy', 'Australia', None, 'Turkey', 'Japan', 'Hungary', 'Austria', 'Portugal', 'Romania', 'Finland', 'Spain', 'Belgium', 'Netherlands', 'Switzerland', 'France', 'Canada', 'Sweden', 'Singapore', 'United States', 'Russia', 'Lithuania', 'Brazil', 'United Kingdom'}
{'Azerbaijan', 'Tunisia', 'Georgia', 'Cameroon', 'India', None, 'Kazakhstan', 'Japan', 'Estonia', 'Belarus', 'Argentina', 'Ghana', 'Algeria', 'Poland', 'Indonesia', 'Iraq', 'Slovakia', 'Panama', 'Sierra Leone', 'Ni

### Save to file

In [63]:
import json

# Persist each mapping to its own JSON file.
outputs = (
    ('publishers_dict.json', publishers_dict),
    ('publisher_locations_dict.json', publisher_locations_dict),
    ('universities_dict.json', universities_dict),
)
for path, mapping in outputs:
    with open(path, "w") as file:
        json.dump(mapping, file)