# Parsing dates experiment

In [None]:
!pip install dateparser

## sample dates

In [13]:
# Representative date strings used to compare the parsers below.
sample_dates = [
    "1792",  # bare four-digit year
    "1791-09-22",  # ISO-style full date
    "c. 1958",  # year with a "c." prefix
    "around 3/2/1983",  # slash-separated date embedded in free text
    "1501-1600"  # two years joined by a hyphen
]

## `dateparser`

In [16]:
import dateparser
from dateparser.search import search_dates

In [14]:
# Parse each sample as one whole date string.
list(map(dateparser.parse, sample_dates))

[datetime.datetime(1792, 7, 8, 0, 0),
 datetime.datetime(1791, 9, 22, 0, 0),
 datetime.datetime(1958, 7, 8, 0, 0),
 None,
 None]

In [15]:
# Search each sample for date mentions embedded in the text.
list(map(search_dates, sample_dates))

[[('1792', datetime.datetime(1792, 7, 8, 0, 0))],
 [('1791-09-22', datetime.datetime(1791, 9, 22, 0, 0))],
 [('1958', datetime.datetime(1958, 7, 8, 0, 0))],
 [('3/2/1983', datetime.datetime(1983, 3, 2, 0, 0))],
 None]

## `heritageconnector` (year only)
The function below exists in `heritageconnector.entity_matching.filtering.Filter`, and should be moved if we use it for ETL.

In [25]:
import re
from typing import Optional, Union

def get_year_from_date_value(datestring: str) -> Optional[Union[int, float]]:
    """
    Looks for a year mention in a date-like string by finding runs of 1-4 digits if the
    string contains "BCE", or runs of exactly 4 digits otherwise.

    Returns None if no year is found, the year if only one is found, the average of the
    two if exactly two are found, and the first year if more than two are found.

    Args:
        datestring (str): date-like text such as "c. 1958" or "333-345 BCE". Any other
            value is converted with str() before matching.

    Returns:
        Optional[Union[int, float]]: None when no year is found; an int when a single
            year (or the first of more than two) is returned, negative for BCE; a
            float when two years are averaged, e.g. 1550.5 for "1501-1600".
    """

    datestring = str(datestring)

    if "BCE" in datestring:
        digit_runs = re.findall(r"\d{1,4}", datestring.replace("BCE", "").strip())
        # BCE dates are recorded in Wikidata as negative years
        years = [-int(run) for run in digit_runs]
    else:
        # look for exactly 4 digits - this picks the year out of strings like
        # "about 1984ish" without trying to parse them as a full date
        years = [int(run) for run in re.findall(r"\d{4}", datestring)]

    # The matches are digit-only, so int() above cannot raise ValueError; no
    # exception handling is needed around the conversions.
    if not years:
        return None

    if len(years) == 2:
        # assume in the format "333-345 BCE" / "1983-1984"
        return (years[0] + years[1]) / 2

    # one match, or more than two matches: use the first
    return years[0]

In [26]:
# Extract a year (or the midpoint of a two-year range) from each sample.
list(map(get_year_from_date_value, sample_dates))

[1792, 1791, 1958, 1983, 1550.5]