In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("date_parser_testcases.csv")

In [3]:
df

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021
...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023
96,The final date for submission is 30th November...,30/11/2022
97,"The annual conference is on 15th October 2023,...",15/10/2023
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990


In [4]:
import re

def parse_date(text):
    """
    Extract date from text and convert to DD/MM/YYYY format.
    
    This function tries to find various date patterns in text without using 
    any machine learning or external date parsing libraries.
    """
    
    # Dictionary for month name conversion
    months = {
        'january': '01', 'jan': '01',
        'february': '02', 'feb': '02', 
        'march': '03', 'mar': '03',
        'april': '04', 'apr': '04',
        'may': '05',
        'june': '06', 'jun': '06',
        'july': '07', 'jul': '07',
        'august': '08', 'aug': '08',
        'september': '09', 'sep': '09', 'sept': '09',
        'october': '10', 'oct': '10',
        'november': '11', 'nov': '11',
        'december': '12', 'dec': '12'
    }
    
    def fix_year(year_str):
        """Convert 2-digit years to 4-digit format"""
        if len(year_str) == 2:
            year = int(year_str)
            # Assume 00-49 means 2000s, 50-99 means 1900s
            if year <= 49:
                return str(2000 + year)
            else:
                return str(1900 + year)
        return year_str
        
    # ISO format: 2023-12-31
    match = re.search(r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b', text)
    if match:
        year, month, day = match.groups()
        return f"{day.zfill(2)}/{month.zfill(2)}/{year}"
    
    # European format with dashes: 31-12-2023  
    match = re.search(r'\b(\d{1,2})-(\d{1,2})-(\d{4})\b', text)
    if match:
        day, month, year = match.groups()
        return f"{day.zfill(2)}/{month.zfill(2)}/{year}"
    
    # Dot separated dates: 2023.12.31
    match = re.search(r'\b(\d{4})\.(\d{1,2})\.(\d{1,2})\b', text)
    if match:
        year, month, day = match.groups()
        return f"{day.zfill(2)}/{month.zfill(2)}/{year}"
        
    # Dot separated: 31.12.2023
    match = re.search(r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b', text)
    if match:
        day, month, year = match.groups()
        return f"{day.zfill(2)}/{month.zfill(2)}/{year}"
    
    # Slash format: 2023/12/31
    match = re.search(r'\b(\d{4})/(\d{1,2})/(\d{1,2})\b', text)
    if match:
        year, month, day = match.groups()
        return f"{day.zfill(2)}/{month.zfill(2)}/{year}"
    
    # Regular slash format - could be DD/MM/YYYY or MM/DD/YYYY
    # Need to figure out which is which
    match = re.search(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', text)
    if match:
        first, second, year = match.groups()
        first_val = int(first)
        second_val = int(second)
        
        # If first number > 12, it must be the day
        if first_val > 12:
            return f"{first.zfill(2)}/{second.zfill(2)}/{year}"
        # If second number > 12, it must be the day  
        elif second_val > 12:
            return f"{second.zfill(2)}/{first.zfill(2)}/{year}"
        # Both could be valid - assume DD/MM format
        else:
            return f"{first.zfill(2)}/{second.zfill(2)}/{year}"
    
    # Two digit years: 5/6/19
    match = re.search(r'\b(\d{1,2})/(\d{1,2})/(\d{2})\b', text)
    if match:
        first, second, year = match.groups()
        full_year = fix_year(year)
        first_val = int(first)
        second_val = int(second)
        
        if first_val > 12:
            return f"{first.zfill(2)}/{second.zfill(2)}/{full_year}"
        elif second_val > 12:
            return f"{second.zfill(2)}/{first.zfill(2)}/{full_year}"
        else:
            return f"{first.zfill(2)}/{second.zfill(2)}/{full_year}"
    
    # Named months: March 5, 2023
    match = re.search(r'\b([A-Za-z]+)\s+(\d{1,2}),?\s+(\d{4})\b', text.lower())
    if match:
        month_name, day, year = match.groups()
        if month_name in months:
            month_num = months[month_name]
            return f"{day.zfill(2)}/{month_num}/{year}"
    
    # Named months with ordinals: February 15th, 2022
    match = re.search(r'\b([A-Za-z]+)\s+(\d{1,2})(?:st|nd|rd|th),?\s+(\d{4})\b', text.lower())
    if match:
        month_name, day, year = match.groups()
        if month_name in months:
            month_num = months[month_name]
            return f"{day.zfill(2)}/{month_num}/{year}"
    
    # Ordinal day first: 1st of January 2000
    match = re.search(r'\b(\d{1,2})(?:st|nd|rd|th)\s+(?:of\s+)?([A-Za-z]+)(?:,?\s+|\s+)(\d{4})\b', text.lower())
    if match:
        day, month_name, year = match.groups()
        if month_name in months:
            month_num = months[month_name]
            return f"{day.zfill(2)}/{month_num}/{year}"
    
    # Simple day month year: 15 October 2023
    match = re.search(r'\b(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\b', text.lower())
    if match:
        day, month_name, year = match.groups()
        if month_name in months:
            month_num = months[month_name]
            return f"{day.zfill(2)}/{month_num}/{year}"
    
    # Ordinal with month: 25th Dec 2024
    match = re.search(r'\b(\d{1,2})(?:st|nd|rd|th)\s+([A-Za-z]+)\s+(\d{4})\b', text.lower())
    if match:
        day, month_name, year = match.groups()
        if month_name in months:
            month_num = months[month_name]
            return f"{day.zfill(2)}/{month_num}/{year}"
    
    # Special case: "including" pattern
    match = re.search(r'\b(\d{1,2})(?:st|nd|rd|th)\s+([A-Za-z]+),?\s+including\s+(\d{4})\b', text.lower())
    if match:
        day, month_name, year = match.groups()
        if month_name in months:
            month_num = months[month_name]
            return f"{day.zfill(2)}/{month_num}/{year}"
    
    # Another special case: "every year, including"
    match = re.search(r'\b(\d{1,2})(?:st|nd|rd|th)\s+of\s+([A-Za-z]+)\s+every\s+year,?\s+including\s+(\d{4})\b', text.lower())
    if match:
        day, month_name, year = match.groups()
        if month_name in months:
            month_num = months[month_name]
            return f"{day.zfill(2)}/{month_num}/{year}"
    
    return None


In [5]:
df['Parsed Output'] = df['Input'].apply(parse_date)

correct = 0
total = len(df)

for i, row in df.iterrows():
    if row['Parsed Output'] == row['Expected Output']:
        correct += 1
    else:
        print(f"Row {i} failed")

print(f"\n Accuracy: {correct}/{total} = {correct/total:.2%}")

# Display some results
print("\nFirst 10 results:")
print(df[['Input', 'Expected Output', 'Parsed Output']].head(10))

Row 50 failed

 Accuracy: 99/100 = 99.00%

First 10 results:
                                               Input Expected Output  \
0        The event will take place on March 5, 2023.      05/03/2023   
1                     Her birthday is on 07/08/1990.      07/08/1990   
2                        The deadline is 2022-12-31.      31/12/2022   
3                     We met on 1st of January 2000.      01/01/2000   
4  The concert is scheduled for 15th September, 2...      15/09/2021   
5                      Let's catch up on 02.04.2022.      02/04/2022   
6                     The project started on 5/6/19.      05/06/2019   
7                         He was born on 1987/11/23.      23/11/1987   
8                     Christmas is on 25th Dec 2024.      25/12/2024   
9             The meeting is set for April 03, 2020.      03/04/2020   

  Parsed Output  
0    05/03/2023  
1    07/08/1990  
2    31/12/2022  
3    01/01/2000  
4    15/09/2021  
5    02/04/2022  
6    05/06/2019  
7 

In [7]:
print("Cases that failed")
failed_count = 0

for i, row in df.iterrows():
    if row['Parsed Output'] != row['Expected Output']:
        failed_count += 1
        print(f"\nCase:")
        print(f"  Text: {row['Input']}")
        print(f"  Expected: {row['Expected Output']}")
        print(f"  Got: {row['Parsed Output']}")

if failed_count == 0:
    print("All cases passed")

Cases that failed

Case:
  Text: Input
  Expected: Expected Output
  Got: None
