In [28]:
import csv
import re

MONTHS = {
    'january': '01', 'jan': '01',
    'february': '02', 'feb': '02',
    'march': '03', 'mar': '03',
    'april': '04', 'apr': '04',
    'may': '05',
    'june': '06', 'jun': '06',
    'july': '07', 'jul': '07',
    'august': '08', 'aug': '08',
    'september': '09', 'sep': '09', 'sept': '09',
    'october': '10', 'oct': '10',
    'november': '11', 'nov': '11',
    'december': '12', 'dec': '12'
}


In [29]:
def remove_ordinal_suffix(day):
    return re.sub(r'(st|nd|rd|th)', '', day, flags=re.I)

def zero_pad(val):
    return str(val).zfill(2)


In [30]:
def parse_date_v2(text):
    t = text
    # Month name first: March 5, 2023 / Mar 5th 23 / Dec 25 2023
    m = re.search(r'([A-Za-z]+)\s+(\d{1,2})(?:st|nd|rd|th)?(?:,)?\s+(\d{2,4})', t)
    if m:
        mon, d, y = m.groups()
        d = zero_pad(remove_ordinal_suffix(d))
        mon = MONTHS.get(mon.lower()[:3])
        if mon:
            if len(y) == 4:
                return f"{d}/{mon}/{y}"
            else:
                y = '20' + y if int(y) < 50 else '19' + y
                return f"{d}/{mon}/{y}"
    # Day Month Year, e.g., 5 March 2023, 5th March 2023
    m = re.search(r'(\d{1,2})(?:st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})', t)
    if m:
        d, mon, y = m.groups()
        d = zero_pad(remove_ordinal_suffix(d))
        mon = MONTHS.get(mon.lower()[:3])
        if mon:
            return f"{d}/{mon}/{y}"
    # Year-Month-Day, e.g., 2022-12-31, 2022.12.31, 2022/12/31
    m = re.search(r'(\d{4})[-./](\d{1,2})[-./](\d{1,2})', t)
    if m:
        y, mon, d = m.groups()
        return f"{zero_pad(d)}/{zero_pad(mon)}/{y}"
    # Day-Month-Year, e.g., 31-12-2022, 31.12.2022, 31/12/2022
    m = re.search(r'(\d{1,2})[-./](\d{1,2})[-./](\d{4})', t)
    if m:
        d, mon, y = m.groups()
        # If month is > 12, switch with day
        if int(mon) > 12:
            return f"{zero_pad(mon)}/{zero_pad(d)}/{y}"
        return f"{zero_pad(d)}/{zero_pad(mon)}/{y}"
    # Disambiguate slash formats — MM/DD/YYYY vs DD/MM/YYYY
    m = re.search(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', t)
    if m:
        a, b, y = m.groups()
        a_i, b_i = int(a), int(b)
        # If month is > 12, switch with day
        if b_i > 12 and a_i <= 12:
            return f"{zero_pad(b)}/{zero_pad(a)}/{y}"
        elif a_i > 12 and b_i <= 12:
            return f"{zero_pad(a)}/{zero_pad(b)}/{y}"
        elif a_i > 12 and b_i > 12:
            # Both are invalid months, just return as is
            return f"{zero_pad(a)}/{zero_pad(b)}/{y}"
        else:
            return f"{zero_pad(a)}/{zero_pad(b)}/{y}"
    # Two-digit year numeric formats with - . /
    m = re.search(r'(\d{1,2})[-./](\d{1,2})[-./](\d{2})', t)
    if m:
        d, mon, y = m.groups()
        y = '20' + y if int(y) < 50 else '19' + y
        # If month is > 12, switch with day
        if int(mon) > 12:
            return f"{zero_pad(mon)}/{zero_pad(d)}/{y}"
        return f"{zero_pad(d)}/{zero_pad(mon)}/{y}"
    # Day first with month name, e.g., 1st of January 2000
    m = re.search(r'(\d{1,2})(?:st|nd|rd|th)?(?:\s+of)?\s+([A-Za-z]+),?\s+(\d{4})', t)
    if m:
        d, mon, y = m.groups()
        d = zero_pad(remove_ordinal_suffix(d))
        mon = MONTHS.get(mon.lower()[:3])
        if mon:
            return f"{d}/{mon}/{y}"
    # Month name with ordinal and comma, e.g., December 5th, 2022
    m = re.search(r'([A-Za-z]+)\s+(\d{1,2})(?:st|nd|rd|th)?,\s+(\d{4})', t)
    if m:
        mon, d, y = m.groups()
        d = zero_pad(remove_ordinal_suffix(d))
        mon = MONTHS.get(mon.lower()[:3])
        if mon:
            return f"{d}/{mon}/{y}"
    return None

In [31]:
def load_dataset(filename):
    inputs = []
    expected_outputs = []
    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip header
        for row in reader:
            if len(row) >= 2:
                inp = row[0].strip()
                exp = row[1].strip()
                if inp and exp:
                    inputs.append(inp)
                    expected_outputs.append(exp)
    return inputs, expected_outputs


In [32]:
filename = "date_parser_testcases.csv"
inputs, expected_outputs = load_dataset(filename)
total = len(inputs)
correct = 0
wrong_cases = []
for inp, exp in zip(inputs, expected_outputs):
    pred = parse_date_v2(inp)
    if pred == exp:
        correct += 1
    else:
        wrong_cases.append((inp, exp, pred))
print(f"Total cases tested: {total}")
print(f"Correct predictions: {correct}")
print(f"Accuracy: {correct / total * 100:.2f}%")
if wrong_cases:
    print("\nSome incorrect cases:")
    for inp, exp, pred in wrong_cases[:10]:
        print(f"Input:      {inp}")
        print(f"Expected:   {exp}")
        print(f"Predicted:  {pred}")


Total cases tested: 100
Correct predictions: 97
Accuracy: 97.00%

Some incorrect cases:
Input:      Input
Expected:   Expected Output
Predicted:  None
Input:      We celebrate Christmas every year on 25th Dec, including 2024.
Expected:   25/12/2024
Predicted:  None
Input:      We celebrate Independence Day on the 4th of July every year, including 2022.
Expected:   04/07/2022
Predicted:  None
