# Parsing News Headlines

In [1]:
from dataclasses import asdict, dataclass
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Optional

import geonamescache
import pandas as pd
from unidecode import unidecode

## Prepare regular expressions for cities and countries

NOTES:
- Some city names, once passed through `unidecode`, become common English words that appear in headlines (e.g. "Comé" becomes "Come" and "Hīt" becomes "Hit"). To avoid false positives, these cities are excluded from the city pattern.
- City and country names are sorted in reverse so that a longer name (e.g. "Miami Beach") is tried before a shorter name it starts with (e.g. "Miami").

In [2]:
def _longest_first(names):
    """Return the unique, non-empty ``names`` ordered longest first.

    Regex alternation takes the first alternative that matches, so a longer
    name must come before any shorter name it starts with. Ties on length are
    broken alphabetically (reversed) to keep the order deterministic.
    """
    unique_names = {name for name in names if name}
    return sorted(unique_names, key=lambda name: (len(name), name), reverse=True)


def _whole_name_re(names, group: str) -> "re.Pattern[str]":
    """Compile a case-insensitive pattern matching any of ``names`` literally.

    Args:
        names: Names to match, already in the order they should be tried.
        group: Name of the capturing group that holds the matched name.

    Returns:
        A compiled pattern whose ``group`` captures the matched name.
    """
    # Escape every name so characters such as "." or "(" are matched
    # literally instead of being interpreted as regex syntax.
    alternation = "|".join(re.escape(name) for name in names)
    # Lookarounds are used instead of \b: they give the same result for names
    # that start and end with a word character, and still work for names that
    # start or end with punctuation, where \b could never match.
    return re.compile(
        rf"(?<!\w)(?P<{group}>{alternation})(?!\w)", flags=re.IGNORECASE
    )


gc = geonamescache.GeonamesCache()

countries = (unidecode(v["name"]) for v in gc.get_countries().values())
reverse_sorted_countries = _longest_first(countries)
COUNTRIES_RE = _whole_name_re(reverse_sorted_countries, "country")

# Cities whose unidecoded names collide with common English words; show their
# records for reference, then leave them out of the city pattern.
poorly_unidecoded_cities = ["Çan", "Comé", "Hīt", "Of"]
for c in poorly_unidecoded_cities:
    pprint(gc.get_cities_by_name(c))

cities = (
    unidecode(v["name"])
    for v in gc.get_cities().values()
    if v["name"] not in poorly_unidecoded_cities
)
reverse_sorted_cities = _longest_first(cities)
CITIES_RE = _whole_name_re(reverse_sorted_cities, "city")

[{'749795': {'admin1code': '17',
             'countrycode': 'TR',
             'geonameid': 749795,
             'latitude': 40.03328,
             'longitude': 27.05236,
             'name': 'Çan',
             'population': 26201,
             'timezone': 'Europe/Istanbul'}}]
[{'2394824': {'admin1code': '11',
              'countrycode': 'BJ',
              'geonameid': 2394824,
              'latitude': 6.40764,
              'longitude': 1.88198,
              'name': 'Comé',
              'population': 29208,
              'timezone': 'Africa/Porto-Novo'}}]
[{'95788': {'admin1code': '01',
            'countrycode': 'IQ',
            'geonameid': 95788,
            'latitude': 33.63664,
            'longitude': 42.82768,
            'name': 'Hīt',
            'population': 31901,
            'timezone': 'Asia/Baghdad'}}]
[{'741240': {'admin1code': '61',
             'countrycode': 'TR',
             'geonameid': 741240,
             'latitude': 40.94055,
             'longitude': 

In [3]:
@dataclass
class ParsedHeadline:
    """A headline paired with the city and country names found in it, if any."""
    headline: str  # the headline text that was searched
    city: Optional[str]  # matched city name, or None when no city was found
    country: Optional[str]  # matched country name, or None when no country was found


def find_city_country(headline: str) -> ParsedHeadline:
    """Search ``headline`` for a city name and a country name.

    Args:
        headline: The headline text to search.

    Returns:
        A ``ParsedHeadline`` holding the headline plus the text matched by
        ``CITIES_RE`` and ``COUNTRIES_RE``; a field is ``None`` when its
        pattern finds nothing.
    """
    found = {}
    # Each capturing group is named after the field it fills in.
    for field, pattern in (("city", CITIES_RE), ("country", COUNTRIES_RE)):
        match = pattern.search(headline)
        if match:
            found[field] = match.group(field)
        else:
            found[field] = None
    return ParsedHeadline(headline=headline, **found)


# Read the headlines into a list (not a one-shot generator) so that later
# cells can iterate over them again. The encoding is explicit so the result
# does not depend on the platform default.
with open(Path("data/headlines.txt"), "r", encoding="utf-8") as f:
    headlines = [h.rstrip() for h in f]

# One dict per headline; kept as a list so it survives building the DataFrame.
data = [asdict(find_city_country(h)) for h in headlines]

# Widen the display limits so long headlines are not truncated.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 200)
pd.DataFrame(data)

Unnamed: 0,headline,city,country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami Beach,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,
5,Trinidad confirms first Zika case,Trinidad,
6,Zika Concerns are Spreading in Houston,Houston,
7,Geneve Scientists Battle to Find Cure,Geneve,
8,The CDC in Atlanta is Growing Worried,Atlanta,
9,Zika Infested Monkeys in Sao Paulo,Sao Paulo,
