In [7]:
import re
import pandas as pd


def clean_day(day):
    """Remove an English ordinal suffix from a day token.

    Only a suffix (``st``, ``nd``, ``rd`` or ``th``) that directly follows a
    digit and ends at a word boundary is removed, so ``"21st"`` becomes
    ``"21"`` while the same letters inside ordinary words (for example the
    ``st`` in ``"august"``) are left untouched.

    Args:
        day: Text containing a day number, e.g. ``"1st"`` or ``"22nd"``.
            Matching is case-sensitive; callers are expected to pass
            lower-cased text.

    Returns:
        ``day`` with digit-attached ordinal suffixes stripped.
    """
    # The look-behind ties the suffix to a preceding digit; the trailing \b
    # stops the pattern from eating the start of a longer word.
    return re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', day)

# Month numbers keyed by the first three letters of a lower-cased month word.
# Lookups slice the word to three characters, so "sept" and "september" both
# resolve through the "sep" entry.
_MONTH_BY_PREFIX = {
    'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
    'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
    'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12',
}

# Two-digit years above this value are read as 19xx, the rest as 20xx.
_TWO_DIGIT_YEAR_PIVOT = 25

# Shared regex fragments. The digit look-arounds stop a pattern from picking
# up the tail of a longer number (e.g. the "23" of "2023") as a day, or the
# first four digits of a longer number as a year.
_DAY = r'(?<!\d)(\d{1,2})(?:st|nd|rd|th)?'
_YEAR4 = r'(\d{4})(?!\d)'

# (compiled pattern, day group, month group, year group), tried in order.
_TEXTUAL_PATTERNS = (
    # 1st of January 2000
    (re.compile(_DAY + r'\s+of\s+([a-z]+)[,\s\-]+' + _YEAR4), 1, 2, 3),
    # 21st June 2024
    (re.compile(_DAY + r'[\s\-]+([a-z]+)[,\s\-]+' + _YEAR4), 1, 2, 3),
    # June 21st, 2024
    (re.compile(r'([a-z]+)[\s\-]+' + _DAY + r'[,\s\-]+' + _YEAR4), 2, 1, 3),
)

# 1987/11/23 or 2024-06-21
_ISO_PATTERN = re.compile(r'(?<!\d)(\d{4})[\/\-](\d{1,2})[\/\-](\d{1,2})(?!\d)')

# 21/06/2024 or 21.06.2024 or 5/6/19 (year is exactly two or four digits)
_NUMERIC_PATTERN = re.compile(
    r'(?<!\d)(\d{1,2})[\/\-\.](\d{1,2})[\/\-\.](\d{4}|\d{2})(?!\d)'
)


def _format_date(day, month, year):
    """Return ``DD/MM/YYYY``, or None if day/month are outside 1-31 / 1-12."""
    day, month = int(day), int(month)
    if not (1 <= day <= 31 and 1 <= month <= 12):
        return None
    return f"{day:02d}/{month:02d}/{year}"


def parse_date(text):
    """Extract the first recognisable date from free text as ``DD/MM/YYYY``.

    Formats are tried in this order, and the first valid hit wins:

    1. ``1st of January 2000``
    2. ``21st June 2024``
    3. ``June 21st, 2024``
    4. ``1987/11/23`` or ``2024-06-21`` (year first)
    5. ``21/06/2024``, ``21.06.2024`` or ``5/6/19``

    Month words are identified by their first three letters. A candidate
    whose word is not a month, or whose day/month is out of range, is skipped
    and the search continues instead of producing output such as
    ``"05/None/2023"``.

    For all-numeric dates the order is ambiguous: when the first number is at
    most 12 and the second is greater than 12 the text is read as MM/DD,
    otherwise as DD/MM. Two-digit years above 25 become 19xx, others 20xx.

    Args:
        text: The sentence to scan. Anything that is not a ``str`` (for
            example a NaN from a pandas column) is treated as containing no
            date.

    Returns:
        The date formatted as ``DD/MM/YYYY``, or the string
        ``"No date found"`` when nothing valid is recognised.
    """
    if not isinstance(text, str):
        return "No date found"
    text = text.lower()

    # Formats that spell the month out as a word.
    for pattern, day_group, month_group, year_group in _TEXTUAL_PATTERNS:
        for match in pattern.finditer(text):
            month = _MONTH_BY_PREFIX.get(match.group(month_group)[:3])
            if month is None:
                # The word next to the number was not a month name.
                continue
            formatted = _format_date(
                match.group(day_group), month, match.group(year_group)
            )
            if formatted:
                return formatted

    # Year-first numeric dates.
    for match in _ISO_PATTERN.finditer(text):
        formatted = _format_date(match.group(3), match.group(2), match.group(1))
        if formatted:
            return formatted

    # Day-first / month-first numeric dates.
    for match in _NUMERIC_PATTERN.finditer(text):
        first = int(match.group(1))
        second = int(match.group(2))
        year = match.group(3)
        if len(year) == 2:
            century = "19" if int(year) > _TWO_DIGIT_YEAR_PIVOT else "20"
            year = century + year
        # Only an unambiguous "second > 12" flips the default DD/MM reading.
        if first <= 12 < second:
            day, month = second, first
        else:
            day, month = first, second
        formatted = _format_date(day, month, year)
        if formatted:
            return formatted

    return "No date found"


In [8]:
# Load the date-parser test cases. The path is relative, so the CSV is looked
# up in the kernel's current working directory.
df = pd.read_csv("date_parser_testcases.csv")
# Preview the first rows (DataFrame.head() shows five by default).
df.head()


Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021


In [9]:
# Cell 3: Apply date parser
# Run parse_date on every "Input" sentence and store the result in a new
# "Parsed Date" column, so it can be read side by side with "Expected Output".
# This assigns the column on the existing df; re-running the cell overwrites it.
df["Parsed Date"] = df["Input"].apply(parse_date)
# Show the first ten rows for a visual comparison of expected vs. parsed.
df.head(10)

Unnamed: 0,Input,Expected Output,Parsed Date
0,"The event will take place on March 5, 2023.",05/03/2023,05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022,31/12/2022
3,We met on 1st of January 2000.,01/01/2000,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021,15/09/2021
5,Let's catch up on 02.04.2022.,02/04/2022,02/04/2022
6,The project started on 5/6/19.,05/06/2019,05/06/2019
7,He was born on 1987/11/23.,23/11/1987,23/11/1987
8,Christmas is on 25th Dec 2024.,25/12/2024,25/12/2024
9,"The meeting is set for April 03, 2020.",03/04/2020,03/04/2020
