In [12]:
from geopy.geocoders import Nominatim
from time import time, sleep
import pandas as pd

In [13]:
# List of base countries
base_countries = {
    'Hong Kong': 'China', 'UAE': 'United Arab Emirates',
    'New York': 'United States', 'USA': 'United States', 'UK': 'United Kingdom', 
    'United States': 'United States', 'Germany': 'Germany', 'Italy': 'Italy', 
    'United Kingdom': 'United Kingdom', 'France': 'France', 'Canada': 'Canada', 
    'Brazil': 'Brazil', 'Australia': 'Australia', 'Spain': 'Spain', 'Russia': 'Russia', 
    'India': 'India', 'China': 'China', 'Japan': 'Japan', 'South Korea': 'South Korea', 
    'Mexico': 'Mexico', 'Netherlands': 'Netherlands', 'Switzerland': 'Switzerland', 
    'Sweden': 'Sweden', 'Belgium': 'Belgium', 'Austria': 'Austria', 'Norway': 'Norway', 
    'Denmark': 'Denmark', 'Finland': 'Finland', 'Greece': 'Greece', 'Portugal': 'Portugal', 
    'New Zealand': 'New Zealand', 'Ireland': 'Ireland', 'Poland': 'Poland', 
    'Czech Republic': 'Czech Republic', 'Turkey': 'Turkey', 'South Africa': 'South Africa', 
    'Argentina': 'Argentina', 'Hungary': 'Hungary', 'Chile': 'Chile', 'Singapore': 'Singapore', 
    'Israel': 'Israel', 'Thailand': 'Thailand', 'Malaysia': 'Malaysia', 'Indonesia': 'Indonesia', 
    'Ukraine': 'Ukraine', 'Vietnam': 'Vietnam', 'Philippines': 'Philippines', 'Egypt': 'Egypt', 
    'Saudi Arabia': 'Saudi Arabia', 'United Arab Emirates': 'United Arab Emirates', 
    'Pakistan': 'Pakistan', 'Iran': 'Iran', 'Iraq': 'Iraq', 'Kuwait': 'Kuwait', 'Qatar': 'Qatar', 
    'Nigeria': 'Nigeria', 'Kenya': 'Kenya', 'Morocco': 'Morocco', 'Algeria': 'Algeria', 
    'Tunisia': 'Tunisia', 'Ghana': 'Ghana', 'Ethiopia': 'Ethiopia', 'Cameroon': 'Cameroon', 
    'Uganda': 'Uganda', 'Senegal': 'Senegal', 'Zimbabwe': 'Zimbabwe', 'Zambia': 'Zambia', 
    'Tanzania': 'Tanzania', 'Botswana': 'Botswana', 'Namibia': 'Namibia', 'Mauritius': 'Mauritius', 
    'Madagascar': 'Madagascar', 'Angola': 'Angola', 'Mozambique': 'Mozambique', 'Rwanda': 'Rwanda', 
    'Ivory Coast': 'Ivory Coast', 'Burkina Faso': 'Burkina Faso', 'Uzbekistan': 'Uzbekistan', 
    'Kazakhstan': 'Kazakhstan', 'Turkmenistan': 'Turkmenistan', 'Azerbaijan': 'Azerbaijan', 
    'Georgia': 'Georgia', 'Armenia': 'Armenia', 'Belarus': 'Belarus', 'Moldova': 'Moldova', 
    'Kyrgyzstan': 'Kyrgyzstan', 'Tajikistan': 'Tajikistan', 'Latvia': 'Latvia', 
    'Lithuania': 'Lithuania', 'Estonia': 'Estonia', 'Slovakia': 'Slovakia', 'Slovenia': 'Slovenia', 
    'Croatia': 'Croatia', 'Bulgaria': 'Bulgaria', 'Romania': 'Romania', 'Serbia': 'Serbia', 
    'Bosnia and Herzegovina': 'Bosnia and Herzegovina', 'Montenegro': 'Montenegro', 
    'North Macedonia': 'North Macedonia', 'Albania': 'Albania', 'Kosovo': 'Kosovo', 
    'Cyprus': 'Cyprus', 'Malta': 'Malta', 'Iceland': 'Iceland', 'Luxembourg': 'Luxembourg', 
    'Liechtenstein': 'Liechtenstein', 'Monaco': 'Monaco', 'Andorra': 'Andorra', 'San Marino': 'San Marino', 
    'Vatican': 'Vatican'
}

In [14]:
from geopy.exc import GeocoderUnavailable

def get_country(address):
    geolocator = Nominatim(user_agent="address_to_country_converter")

    # Check if address contains any base country
    for key_country, country in base_countries.items():
        if key_country.lower() in address.lower():
            return country #+ "*"
        
    # If not, use geocoder to get the cosuntry
    for _ in range(3): # Number of retries
        try:
            location = geolocator.geocode(address, language="en")
            if location:
                return location.address.split(",")[-1].strip()
            else:
                return None
        except GeocoderUnavailable:
            # If geocoder is unavailable, wait and then retry
            print("Sleep")
            sleep(1) # Delay in seconds between retries
    print("Service unavailable")
    return None  # Return None if unable to retrieve location after retries

# Take Initial dataset

In [34]:
df = pd.read_csv('slam_articles.csv')
locations = set(df["publisher_location"])
universities = set(df["publisher_affiliation"])

# fix string text:
locations = {str(location).replace("\n", " ").replace("\xa0", " ") for location in locations}
universities = {str(univ).replace("\n", " ").replace("\xa0", " ") for univ in universities}

# TODO remove nan from set
# TODO remove Unknown from set

print(f"locations count: {len(locations)}")
print(f"universities count: {len(universities)}")

    

locations count: 106
universities count: 7710


In [36]:
# Convert cities to countries
start = time()
location_countries = []
for i, locat in enumerate(locations):
    if i % 20 == 0:
        print(f"\r{i}/{len(locations)} ({(100*i/len(locations)):.1f}%): {(time() - start):.3f} sec ",end='')
    location_countries.append(get_country(locat))
print(f"\r{len(locations)}/{len(locations)} (100.0%): {(time() - start):.3f} sec ")

106/106 (100.0%): 35.694 sec 


In [38]:
for i, locat in enumerate(locations):
    print(f"{locat}:\n\t{location_countries[i]}")

1 Oliver's Yard, 55 City Road, London EC1Y 1SP United Kingdom :
	United Kingdom
Edinburgh, United Kingdom:
	United Kingdom
Dordrecht:
	Netherlands
Recife, Brasil:
	Brazil
New York, NY, USA:
	United States
Trieste, Italy:
	Italy
Urbana, Illinois:
	United States
Freiburg, Germany:
	Germany
Heidelberg:
	Germany
Göttingen:
	Germany
Istanbul:
	Turkey
nan:
	Italy
New York, NY: Routledge, 2020.:
	United States
Marabá, Pará:
	Brazil
Linköping:
	Sweden
Stroudsburg, PA, USA:
	United States
New York, NY:
	United States
Washington, D.C.:
	United States
Cham:
	Germany
London United Kingdom :
	United Kingdom
The Netherlands:
	Netherlands
Les Ulis, France:
	France
Routledge, 2 Park Square, Milton Park, Abingdon, Oxon OX14 4RN and by Routledge, 711 Third Avenue, New York, NY 10017:
	United States
St. Joseph, MI:
	United States
Stafa:
	United Kingdom
Basel, Switzerland:
	Switzerland
Berlin:
	Germany
Washington DC:
	United States
New York, New York, USA:
	United States
Köln:
	Germany
Washington, DC:
	Un

In [39]:
# Convert universiy to countries
start = time()
university_countries = []
for i, locat in enumerate(universities):
    if i % 20 == 0:
        print(f"\r{i}/{len(universities)} ({(100*i/len(universities)):.1f}%): {(time() - start):.3f} sec ",end='')
    university_countries.append(get_country(locat))
print(f"\r{len(universities)}/{len(universities)} (100.0%): {(time() - start):.3f} sec ")

160/7710 (2.1%): 25.917 sec Sleep
Sleep
220/7710 (2.9%): 45.363 sec Sleep
Sleep
Sleep
Service unavailable
Sleep
Sleep
Sleep
Service unavailable
260/7710 (3.4%): 81.272 sec Sleep
Sleep
Sleep
Service unavailable
660/7710 (8.6%): 159.325 sec Sleep
Sleep
740/7710 (9.6%): 182.171 sec Sleep
Sleep
Sleep
Service unavailable
920/7710 (11.9%): 219.402 sec Sleep
Sleep
Sleep
Service unavailable
960/7710 (12.5%): 244.190 sec Sleep
1460/7710 (18.9%): 331.939 sec Sleep
2680/7710 (34.8%): 516.816 sec Sleep
Sleep
Sleep
Service unavailable
2840/7710 (36.8%): 554.424 sec Sleep
Sleep
4060/7710 (52.7%): 752.948 sec Sleep
Sleep
Sleep
Service unavailable
6160/7710 (79.9%): 1070.082 sec Sleep
Sleep
Sleep
Service unavailable
6200/7710 (80.4%): 1087.697 sec Sleep
Sleep
Sleep
Service unavailable
7710/7710 (100.0%): 1313.772 sec 


In [41]:
not_none = 0
for i, univers in enumerate(universities):
    if university_countries[i]: not_none+=1
    print(f"{univers}:\n\t{university_countries[i]}")

print(not_none,'/',len(universities))

Architectural Engineering Department, Weihai Vocational College, Weihai, Shandong, China:
	China
Los Alamos National Lab.  MS J577 Los Alamos NM 87545:
	None
School of Computer and Science, Nanjing University of Information Science and Technology, Nanjing, China:
	China
Dept. Electrical Engineering and Computer Science, University of Missouri, Columbia, MO, USA:
	United States
Universit&#x00E9; Paris-Saclay, CEA, List,Palaiseau,France,F-91120:
	France
Department of Marine, Earth and Atmospheric Sciences, North Carolina State University, Raleigh, NC 27695, USA:
	United States
Department of Physics, Micro and Nanotechnology Department, Programmable Photonics Group, Middle East Technical University, Ankara, Turkey:
	Turkey
Multimedia IP Research Center, KETI, Seongnam, Korea:
	None
Faculty of Built Environment &amp; Surveying,Geomatics Engineering,Johor Bahru,Malaysia:
	Malaysia
University of Padova,Department of Information Engineering,Italy:
	Italy
Department of Physics, University of M

In [44]:
print(set(location_countries))
print(set(university_countries))

{'Switzerland', 'India', 'Canada', 'Romania', 'Germany', None, 'Singapore', 'Hungary', 'Japan', 'Australia', 'Spain', 'United Kingdom', 'Russia', 'Belgium', 'Finland', 'Portugal', 'Austria', 'Netherlands', 'Italy', 'Sweden', 'Brazil', 'France', 'Lithuania', 'Turkey', 'United States'}
{'Serbia', 'Czech Republic', None, 'Colombia', 'Japan', 'Algeria', 'Chile', 'United Kingdom', 'Kazakhstan', 'Cameroon', 'Slovakia', 'Iran', 'United Arab Emirates', 'Ukraine', 'India', 'Indonesia', 'Canada', 'Germany', 'Taiwan', 'Norway', 'Azerbaijan', 'Greece', 'Spain', 'Denmark', 'China', 'Saudi Arabia', 'Thailand', 'Israel', 'Portugal', 'Austria', 'Liechtenstein', 'Egypt', 'Sweden', 'France', 'Slovenia', 'South Korea', 'Morocco', 'Ireland', 'Turkey', 'Bulgaria', 'Qatar', 'North Macedonia', 'Latvia', 'New Zealand', 'Switzerland', 'Bosnia and Herzegovina', 'Estonia', 'South Africa', 'Hungary', 'Sierra Leone', 'Russia', 'Belgium', 'Finland', 'Nigeria', 'Croatia', 'Luxembourg', 'Netherlands', 'Iraq', 'Brazil

In [42]:
import json

# save to file:
location_dict = {locat: location_countries[i] for i, locat in enumerate(locations)}
with open('location_dict.json', "w") as file:
    json.dump(location_dict, file)

university_dict = {univ: university_countries[i] for i, univ in enumerate(universities)}
with open('university_dict.json', "w") as file:
    json.dump(university_dict, file)