# Parsing News Headlines

In [1]:
from dataclasses import asdict, dataclass
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Optional

import geonamescache
import pandas as pd
from unidecode import unidecode

## Prepare regular expressions for cities and countries

NOTES:
- Some city names, after being `unidecode`d, look like common English words that are found in headlines. To avoid finding false positives, these cities are removed.
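- City and country lists are sorted in reverse alphabetical order so that a longer name is tried before any shorter name that is a prefix of it; the regex therefore prefers the longest match.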

In [2]:
gc = geonamescache.GeonamesCache()


def _compile_names_re(group, names):
    """Compile a case-insensitive regex matching any of `names` as a whole word.

    Args:
        group: Name of the capturing group that holds the matched text.
        names: Iterable of literal names, already ordered by match priority
            (earlier names are tried first at a given position).

    Returns:
        A compiled pattern with a single named group `group`.
    """
    # Names are literals, not patterns: escape them so characters such as
    # ".", "(" or ")" are matched verbatim instead of acting as regex syntax.
    alternatives = "|".join(re.escape(name) for name in names)
    # (?<!\w) / (?!\w) are equivalent to \b for names that start and end with a
    # word character, but unlike \b they also work when a name starts or ends
    # with punctuation.
    return re.compile(
        rf"(?<!\w)(?P<{group}>{alternatives})(?!\w)", flags=re.IGNORECASE
    )


# Distinct ASCII transliterations of the country names. An empty result is
# dropped because an empty alternative would match at any position.
countries = {unidecode(v["name"]) for v in gc.get_countries().values()} - {""}
# Reverse alphabetical order places a longer name before any name that is its
# prefix, so the alternation prefers the longer one.
reverse_sorted_countries = sorted(countries, reverse=True)
COUNTRIES_RE = _compile_names_re("country", reverse_sorted_countries)

# Cities whose transliterated names read like ordinary English words; they are
# printed for inspection and then excluded to avoid false positives.
poorly_unidecoded_cities = ["Çan", "Comé", "Hīt", "Of"]
for c in poorly_unidecoded_cities:
    pprint(gc.get_cities_by_name(c))

# Many cities share a name; the set keeps one copy of each so the alternation
# does not carry duplicates.
cities = {
    unidecode(v["name"])
    for v in gc.get_cities().values()
    if v["name"] not in poorly_unidecoded_cities
} - {""}
reverse_sorted_cities = sorted(cities, reverse=True)
CITIES_RE = _compile_names_re("city", reverse_sorted_cities)

[{'749795': {'admin1code': '17',
             'countrycode': 'TR',
             'geonameid': 749795,
             'latitude': 40.03328,
             'longitude': 27.05236,
             'name': 'Çan',
             'population': 26201,
             'timezone': 'Europe/Istanbul'}}]
[{'2394824': {'admin1code': '11',
              'countrycode': 'BJ',
              'geonameid': 2394824,
              'latitude': 6.40764,
              'longitude': 1.88198,
              'name': 'Comé',
              'population': 29208,
              'timezone': 'Africa/Porto-Novo'}}]
[{'95788': {'admin1code': '01',
            'countrycode': 'IQ',
            'geonameid': 95788,
            'latitude': 33.63664,
            'longitude': 42.82768,
            'name': 'Hīt',
            'population': 31901,
            'timezone': 'Asia/Baghdad'}}]
[{'741240': {'admin1code': '61',
             'countrycode': 'TR',
             'geonameid': 741240,
             'latitude': 40.94055,
             'longitude': 

## Parsing and cleaning the data

- Ensure the raw headlines are unique.
- Drop a headline only if neither a city nor a country name can be found in it.
- Check that the DataFrame statistics make sense.

In [3]:
@dataclass
class ParsedHeadline:
    """A headline together with the place names found in it.

    Attributes:
        headline: The headline text that was searched.
        city: The city name matched in the headline, or None if none matched.
        country: The country name matched in the headline, or None if none
            matched.
    """

    headline: str
    city: Optional[str]
    country: Optional[str]


def find_city_country(headline: str) -> ParsedHeadline:
    """Search a headline for a city name and a country name.

    Args:
        headline: The headline text to search.

    Returns:
        A ParsedHeadline holding the original headline plus the first match of
        CITIES_RE and of COUNTRIES_RE. Each place field contains the text as it
        appears in the headline, or None when the corresponding pattern does
        not match.
    """
    city = None
    country = None

    city_match = CITIES_RE.search(headline)
    if city_match is not None:
        city = city_match.group("city")

    country_match = COUNTRIES_RE.search(headline)
    if country_match is not None:
        country = country_match.group("country")

    return ParsedHeadline(headline=headline, city=city, country=country)


# Read with an explicit encoding so the file is decoded the same way on every
# platform instead of depending on the locale default.
with open(Path("data/raw/headlines.txt"), "r", encoding="utf-8") as f:
    # dict.fromkeys removes duplicates while keeping file order, so the rows of
    # the DataFrame come out in the same order on every run (iterating a set of
    # strings does not guarantee that).
    unique_headlines = list(dict.fromkeys(h.rstrip() for h in f))
# Kept as a set for any later cell that relies on `headlines` being one.
headlines = set(unique_headlines)

data = (asdict(find_city_country(h)) for h in unique_headlines)
# Pinning the columns keeps the frame well-formed even when there are no
# headlines at all; otherwise the filter below would fail on a missing column.
all_parsed_headlines = pd.DataFrame(data, columns=["headline", "city", "country"])
# Drop a row only when BOTH city and country are missing.
parsed_headlines = all_parsed_headlines.dropna(
    subset=["city", "country"], how="all"
)
parsed_headlines.describe().loc[["count", "unique"], :]

Unnamed: 0,headline,city,country
count,608,608,15
unique,608,576,10


In [4]:
# Raise pandas' row-count and column-width display limits, then show the frame.
display_options = {"display.max_rows": 1000, "display.max_colwidth": 200}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
parsed_headlines

Unnamed: 0,headline,city,country
1,Case of Hepatitis A Reported in Calgary,Calgary,
2,More people in Nadi are infected with HIV every year,Nadi,
3,Lower Hospitalization in Janesville after Varicella Vaccine becomes Mandatory,Janesville,
4,Contaminated Meat Brings Trouble for Bonn Farmers,Bonn,
5,Molo Cholera Spread Causing Concern,Molo,
6,Bronchitis re-emerges in Tehran,Tehran,
7,Schools in Yaounde Closed Due to Mumps Outbreak,Yaounde,
8,Zika arrives in San Benito,San Benito,
9,Lower Hospitalization in Monroe after Hepatitis D Vaccine becomes Mandatory,Monroe,
10,Spanish Flu Outbreak in Lisbon,Lisbon,


## Store results

Serialize the resulting DataFrame in Parquet format.

In [5]:
output_path = Path("data/processed/parsed-headlines.parquet")
# Create the output directory if it is missing so the notebook also runs on a
# fresh checkout; writing the file does not create parent directories itself.
output_path.parent.mkdir(parents=True, exist_ok=True)
parsed_headlines.to_parquet(output_path, index=False)