In [1]:
# Debug switch: some cells below run extra dumps and prints only when this flag is True
debug = False

## Imports
from unidecode import unidecode # To remove accents and other non-ASCII characters from texts
import re                       # Regular expression library
import geonamescache            # For database of places on earth
from hashlib import md5         # Used to generate unique dictionary keys for the lines
import numpy as np              # For array manipulation
import pandas                   # Final result needs to be in this format
import json                     # For loading the input and saving the results

In [2]:
# Read the list of headlines (each with its detected place names) from disk.
export_json = '../data/found_cities.json'
with open(export_json, 'r') as headlines_file:
    jsonData = json.load(headlines_file)
# Show a short sample so the record layout is visible in the notebook output.
print("Example list of headlines with data:\n", jsonData[:5])

Example list of headlines with data:
 [{'line': 'Zika Outbreak Hits Miami', 'place': 'Miami', 'country': nan, 'places': ['Miami']}, {'line': 'Could Zika Reach New York City?', 'place': 'New York City', 'country': nan, 'places': ['York', 'New York City', 'New York']}, {'line': 'First Case of Zika in Miami Beach', 'place': 'Miami Beach', 'country': nan, 'places': ['Miami', 'Miami Beach']}, {'line': 'Mystery Virus Spreads in Recife, Brazil', 'place': 'Recife', 'country': 'Brazil', 'places': ['Recife']}, {'line': 'Dallas man comes down with case of Zika', 'place': 'Dallas', 'country': nan, 'places': ['Dallas']}]


### Normalization of city names

Many of the city names in the geonamescache database contain accents and other non-ASCII characters.

To make searching easier, I "normalize" the names in the list: accents are stripped and the result is lowercased.

In [3]:
gc = geonamescache.GeonamesCache()
cities = gc.get_cities()

# 'Normalize' names of cities in order to make search more accurate.
# The loop variable is deliberately not called `id`: at notebook level that name
# would shadow the builtin `id()` in every cell executed afterwards.
# Only the values are updated (no keys added or removed), so iterating the live
# dict view is safe and no list copy is needed.
for geoname_id, city in cities.items():
    city['normalized'] = unidecode(city['name']).lower()

if debug:
    # Dump the reference tables to disk so they can be inspected by hand.
    debug_dumps = (
        ('../data/all_cities.json', gc.get_cities()),
        ('../data/all_countries.json', gc.get_countries()),
        ('../data/all_continents.json', gc.get_continents()),
    )
    for export_json, table in debug_dumps:
        with open(export_json, 'w') as openJSON:
            json.dump(table, openJSON, indent=4, sort_keys=True)
    print(dir(gc))
    # Show a few original names next to their normalized form.
    for geoname_id, city in list(cities.items())[:10]:
        print(city['name'], city['normalized'])

### Helper functions

I used the following 3 helper functions in this notebook:

`get_cities_by_name` searches the DB of city names using the normalized value. Because the data from the previous notebook had already been cleaned of accents and non-ASCII characters, this was needed to get better accuracy when searching.

`extract_city_data` adds the required data (place name, latitude, longitude and country code) to a headline.

`get_city_by_key` finds a city in the geonamescache DB by geonameid instead of by name. This function is used when there is more than one city with the same name.

In [4]:
def get_cities_by_name(name, dataset, debug = False):
    """Find every city in `dataset` whose name matches `name`, ignoring case and accents.

    Parameters
    ----------
    name : str
        City name to look up. Anything that is not a non-empty string
        (None, '', a NaN float, ...) yields no match instead of raising.
    dataset : dict
        Mapping of id -> city record (a dict). Every record needs a 'name' key.
        A precomputed 'normalized' key is used when present; otherwise the
        normalized form is computed on the fly from 'name'.
    debug : bool, optional
        When True, print a trace of the search.

    Returns
    -------
    list
        [] when nothing matches, otherwise a one-element list whose only item
        is a dict of id -> city record for all matching cities.
    """
    if not isinstance(name, str) or not name.strip():
        return []
    if debug:
        print('Starting searching', name)
    nameLow = unidecode(name).lower()
    found = {}
    for city_id, city in dataset.items():
        normalized = city.get('normalized')
        if normalized is None:
            # Record was not pre-normalized: derive the same form here.
            normalized = unidecode(city['name']).lower()
        if debug:
            print('Testing id', city_id, city['name'], ',', normalized, ', >>')
        # Both sides are lowercased; comparing against the raw, capitalised
        # name could never match a lowercased query.
        if nameLow == normalized or nameLow == city['name'].lower():
            found[city_id] = city
    return [found] if found else []
if debug:
    # Smoke test of the lookup: print every city record named 'springfield'.
    dbgCities = get_cities_by_name('springfield', cities)
    springfield_matches = dbgCities[0]
    for geoname_id in springfield_matches:
        print(json.dumps((geoname_id, springfield_matches[geoname_id])))

def extract_city_data(cities, headline):
    """Copy location fields from the city record(s) in `cities` onto `headline`.

    `cities` is a dict of id -> city record. For each record, 'name',
    'latitude', 'longitude' and 'countrycode' are written to the headline as
    'place', 'lat', 'lng' and 'countrycode'. When several records are given,
    later ones overwrite earlier ones; an empty dict leaves the headline
    untouched. The headline is modified in place and also returned.
    """
    # (headline key, city record key), in the order the fields are written.
    field_map = (
        ('place', 'name'),
        ('lat', 'latitude'),
        ('lng', 'longitude'),
        ('countrycode', 'countrycode'),
    )
    for record in cities.values():
        for target_key, source_key in field_map:
            headline[target_key] = record[source_key]
    return headline

def get_city_by_key(geoid, name):
    """Pick one city out of all cities called `name`, by its id.

    Looks `name` up in the module-level `cities` mapping via
    `get_cities_by_name` and selects the match whose key equals `geoid`.
    `geoid` is tried as given and then as a string, so an integer id also
    finds a string key.

    Returns
    -------
    dict
        {key: city record} when the id is among the matches.
    tuple
        ('More than one found', matches) when it is not, where `matches` is
        the dict of all cities with that name (empty when the name itself
        matched nothing). The message text is kept unchanged for backward
        compatibility.
    """
    # A local called `list` would shadow the builtin, so a descriptive name is used.
    lookup = get_cities_by_name(name, cities)
    # The lookup returns [] for "no match"; indexing [0] blindly would raise IndexError.
    matches = lookup[0] if lookup else {}
    for key in (geoid, str(geoid)):
        if key in matches:
            return {key: matches[key]}
    return ('More than one found', matches)

In [5]:
allFoundCities = 0
allFoundDuplicatedCities = 0

# Look up the place of every headline in the geonames cache and...
for headline in jsonData:
    place = headline.get('place')
    # Skip headlines without a usable place name: missing, empty, or not a
    # string (the loaded data contains NaN floats, e.g. in 'country', and NaN
    # is truthy, so a plain `not place` check would let it through).
    if not place or not isinstance(place, str):
        continue
    foundCities = get_cities_by_name(place, cities)
    # The city is not found.
    if not foundCities:
        continue
    matches = foundCities[0]
    # There's exactly one city. Extract the relevant data (latitude, longitude, countrycode).
    if len(matches) == 1:
        allFoundCities += 1
        extract_city_data(matches, headline)  # updates the headline in place
        continue
    # There's more than one city found. Add the needed fields to the headline,
    # empty, as placeholders, and keep the candidates for later disambiguation.
    headline['lat'] = 0
    headline['lng'] = 0
    # `np.nan` is the spelling available in every NumPy version; the `np.NaN`
    # alias was removed in NumPy 2.0.
    headline['countrycode'] = np.nan
    headline['list_of_cities'] = list(matches.values())
    allFoundDuplicatedCities += 1

print("Amount of cities found once:", allFoundCities)
print("amount of cities found more than once:", allFoundDuplicatedCities)


Amount of cities found once: 432
amount of cities found more than once: 170


In [6]:
for headline in jsonData:
    # The existence of this property indicates that we found more than one city with this name.
    candidates = headline.get('list_of_cities')
    if not candidates:
        continue
    # Use the city with the biggest population for the data. max() returns the
    # first of equally populated candidates and always returns one of them,
    # even when every population is 0. A running `> maxPop` comparison seeded
    # with 0 would select nothing in that case.
    best = max(candidates, key=lambda candidate: candidate.get('population', 0))
    geoid = str(best['geonameid'])
    country = best['countrycode']
    nonUS = any(candidate['countrycode'] != 'US' for candidate in candidates)
    # Diagnostic note about the choice; only shown when debugging.
    if nonUS:
        isUS = "There is a city from another country than US here"
        isUS += " but we chose an US city." if country == "US" else '.'
    else:
        isUS = ''
    if debug and isUS:
        print(headline['line'], '->', isUS)
    # The candidates are the city records themselves, so the chosen one can be
    # applied directly, with no second search through the whole database.
    extract_city_data({geoid: best}, headline)

In [7]:
# Translate each located headline's country code into the country's name.
countries = gc.get_countries()
for headline in jsonData:
    if 'countrycode' in headline:
        code = headline['countrycode']
        if code in countries:
            headline['country'] = countries[code]['name']
        else:
            # Unknown code (e.g. a placeholder that was never resolved): report it and move on.
            print("Country code", code, "not found in list of countries:", headline['place'])

In [8]:
# Convert headlines to pandas dataframe.
# Only headlines that received coordinates (they carry a 'lat' key) become rows.
headlinesArray = [
    [row['line'], row['place'], row['lat'], row['lng'], row['countrycode']]
    for row in jsonData
    if 'lat' in row
]
column_names = ['headline', 'city', 'latitude', 'longitude', 'countrycode']
df = pandas.DataFrame(headlinesArray, columns=column_names)
# Replace the numeric index with empty labels so the displayed table has no row numbers.
blankIndex = [''] * len(df)
df.index = blankIndex
df[:10]

Unnamed: 0,headline,city,latitude,longitude,countrycode
,Zika Outbreak Hits Miami,Miami,25.77427,-80.19366,US
,Could Zika Reach New York City?,New York City,40.71427,-74.00597,US
,First Case of Zika in Miami Beach,Miami Beach,25.79065,-80.13005,US
,"Mystery Virus Spreads in Recife, Brazil",Recife,-8.05389,-34.88111,BR
,Dallas man comes down with case of Zika,Dallas,32.78306,-96.80667,US
,Trinidad confirms first Zika case,Trinidad,-14.83333,-64.9,BO
,Zika Concerns are Spreading in Houston,Houston,29.76328,-95.36327,US
,Geneve Scientists Battle to Find Cure,Genève,46.20222,6.14569,CH
,The CDC in Atlanta is Growing Worried,Atlanta,33.749,-84.38798,US
,Zika Infested Monkeys in Sao Paulo,São Paulo,-23.5475,-46.63611,BR


In [9]:
# Save the enriched headlines as a JSON file.
export_json = '../data/found_cities_locations.json'
with open(export_json, 'w') as openJSON:
    serialized = json.dumps(list(jsonData))
    openJSON.write(serialized)

In [10]:
# Read the exported file back and show a few records as a sanity check.
with open(export_json, 'r') as exported_file:
    json_test = json.load(exported_file)
print(json_test[:5])

[{'line': 'Zika Outbreak Hits Miami', 'place': 'Miami', 'country': 'United States', 'places': ['Miami'], 'lat': 25.77427, 'lng': -80.19366, 'countrycode': 'US'}, {'line': 'Could Zika Reach New York City?', 'place': 'New York City', 'country': 'United States', 'places': ['York', 'New York City', 'New York'], 'lat': 40.71427, 'lng': -74.00597, 'countrycode': 'US'}, {'line': 'First Case of Zika in Miami Beach', 'place': 'Miami Beach', 'country': 'United States', 'places': ['Miami', 'Miami Beach'], 'lat': 25.79065, 'lng': -80.13005, 'countrycode': 'US'}, {'line': 'Mystery Virus Spreads in Recife, Brazil', 'place': 'Recife', 'country': 'Brazil', 'places': ['Recife'], 'lat': -8.05389, 'lng': -34.88111, 'countrycode': 'BR'}, {'line': 'Dallas man comes down with case of Zika', 'place': 'Dallas', 'country': 'United States', 'places': ['Dallas'], 'lat': 32.78306, 'lng': -96.80667, 'countrycode': 'US', 'list_of_cities': [{'geonameid': 4684888, 'name': 'Dallas', 'latitude': 32.78306, 'longitude': 