In [1]:
import re
import geonamescache
import pandas as pd
import numpy as np
import unidecode


In [2]:
# Load the GeoNames lookup tables once. Both results are mappings whose
# entries carry a 'name' field; the city and country name lists used for
# headline matching are derived from them.
gc = geonamescache.GeonamesCache()
cities = gc.get_cities()
countries = gc.get_countries()



# Build the list of city names used for regex matching against headlines.
#
# - Each name is transliterated to ASCII with unidecode and stripped of
#   surrounding whitespace.
# - Very short names (such as "Of") are dropped because they show up in
#   ordinary headline text. The length is measured on the transliterated
#   name, since that is the text that ends up in the regex.
# - A set comprehension removes duplicates: many cities share a name, and
#   repeating an alternative in the regex adds cost without changing what
#   can match.
# - Longest names come first so that "San Jose" is tried before "San";
#   ties are broken alphabetically to keep the order deterministic.
city_names = sorted(
    {
        ascii_name
        for ascii_name in (
            unidecode.unidecode(city_info['name']).strip()
            for city_info in cities.values()
        )
        if len(ascii_name) > 2
    },
    key=lambda name: (-len(name), name),
)



# Build the list of country names used for regex matching against headlines.
#
# - Each name is transliterated to ASCII with unidecode and stripped of
#   surrounding whitespace.
# - Names of one or two characters are dropped to avoid matching ordinary
#   short words. The length is measured on the transliterated name, since
#   that is the text that ends up in the regex.
# - A set comprehension removes any duplicate names.
# - Longest names come first so a longer name is tried before a shorter
#   name it starts with; ties are broken alphabetically to keep the order
#   deterministic.
country_names = sorted(
    {
        ascii_name
        for ascii_name in (
            unidecode.unidecode(country_info['name']).strip()
            for country_info in countries.values()
        )
        if len(ascii_name) > 2
    },
    key=lambda name: (-len(name), name),
)



# Load the raw headlines, one per line, with surrounding whitespace removed.
# The context manager closes the file even if reading fails part-way.
# NOTE(review): UTF-8 is assumed for data/headlines.txt so the result does
# not depend on the platform's default encoding -- confirm against the file.
with open("data/headlines.txt", 'r', encoding="utf-8") as headlines_file:
    raw_headlines = [line.strip() for line in headlines_file]

# Initial DataFrame with a single 'headline' column, one row per line read.
headlines = pd.DataFrame(raw_headlines, columns=['headline'])


In [4]:
def _build_name_regex(names):
    """Return a regex source string matching any of `names` as a whole word.

    Parameters
    ----------
    names : iterable of str
        Candidate place names, in the order they should be tried. Callers
        pass longest-first lists so the longest name wins at a position.

    Returns
    -------
    str
        A pattern with one capturing group around the alternation. The
        group text always equals the full match, because the surrounding
        lookarounds are zero-width.
    """
    # Escape every name so characters such as '.', '(' or ')' in a place
    # name are matched literally instead of being read as regex syntax.
    # Empty names are skipped: an empty alternative would match anywhere.
    # With no usable names at all, fall back to '(?!)', which never matches.
    alternatives = "|".join(re.escape(name) for name in names if name) or r"(?!)"

    # Wrap the WHOLE alternation in "not next to a word character" checks.
    # Joining the names with r"\b|\b" leaves the first name without a
    # leading boundary and the last name without a trailing one, so the
    # shortest name could match as the prefix of a longer word.
    return r"(?<!\w)(" + alternatives + r")(?!\w)"


# Regex matching any city name as a whole word
city_name_regex = _build_name_regex(city_names)

# Regex matching any country name as a whole word
country_name_regex = _build_name_regex(country_names)

# Fill in the first matching city and country for every headline.
#
# Both columns are assigned as whole Series, so 'cities' and 'countries'
# always exist (with NaN where nothing matched), even if no headline in the
# file matches, and the column order does not depend on which row matches
# first. Patterns are compiled once instead of being passed to re.search
# for every row.
#
# NOTE(review): the name lists are ASCII-transliterated with unidecode, but
# headlines are searched as-is, so an accented spelling in a headline will
# not match -- confirm whether headlines should be transliterated as well.
city_pattern = re.compile(city_name_regex, flags=re.IGNORECASE)
country_pattern = re.compile(country_name_regex, flags=re.IGNORECASE)


def _first_match(pattern, text):
    """Return the text of the first match of `pattern` in `text`, or NaN.

    The returned string is the matched slice of `text` itself, so it keeps
    the headline's own capitalisation.
    """
    found = pattern.search(text)
    return found.group(0) if found is not None else np.nan


headlines['cities'] = headlines['headline'].map(
    lambda text: _first_match(city_pattern, text)
)
headlines['countries'] = headlines['headline'].map(
    lambda text: _first_match(country_pattern, text)
)

# Preview the first rows of the populated DataFrame
print(headlines.head())

                                  headline         cities countries
0                 Zika Outbreak Hits Miami          Miami       NaN
1          Could Zika Reach New York City?  New York City       NaN
2        First Case of Zika in Miami Beach    Miami Beach       NaN
3  Mystery Virus Spreads in Recife, Brazil         Recife    Brazil
4  Dallas man comes down with case of Zika         Dallas       NaN
