# Part 2

In [1]:
import pandas as pd
import re
import datetime

In [2]:
# Load the labelled date-extraction test cases and preview the first rows.
DATE_TESTCASES_CSV = "data/date_parser_testcases.csv"
test_dates = pd.read_csv(DATE_TESTCASES_CSV)
test_dates.head()

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05-03-2023
1,Her birthday is on 07/08/1990.,07-08-1990
2,The deadline is 2022-12-31.,31-12-2022
3,We met on 1st of January 2000.,01-01-2000
4,"The concert is scheduled for 15th September, 2...",15-09-2021


In [3]:
def _parse_with_formats(candidate, formats):
    """Return ``candidate`` parsed with the first ``strptime`` format that fits.

    Raises:
        ValueError: if none of ``formats`` matches ``candidate``.
    """
    last_error = None
    for fmt in formats:
        try:
            return datetime.datetime.strptime(candidate, fmt)
        except ValueError as exc:
            last_error = exc
    raise last_error


def extract_dates(text):
    """Find the first recognisable date in ``text`` and return it as ``DD-MM-YYYY``.

    Patterns are tried in a fixed priority order: numeric dates with
    separators, compact digit runs, then dates that spell out a month name.
    The first pattern whose match parses to a valid calendar date wins; a
    match that fails to parse is skipped and the next pattern is tried.

    Ambiguous numeric dates are read day-first (``07/08/1990`` is 7 August);
    month-first is only used when the day-first reading is not a valid date.

    Args:
        text: Free text that may contain a date. Missing values
            (``None`` / NaN) are accepted.

    Returns:
        The date formatted with ``'%d-%m-%Y'``, or ``None`` when ``text`` is
        missing or no candidate could be parsed.
    """
    if pd.isna(text):
        return None

    text = str(text).strip()

    basic_patterns = [
        r"(\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})", # YYYY-MM-DD or YYYY/MM/DD or YYYY.MM.DD
        r"(\d{1,2}[-/\.]\d{1,2}[-/\.]\d{4})", # DD-MM-YYYY or DD/MM/YYYY or DD.MM.YYYY
        r"(\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2})", # DD-MM-YY or DD/MM/YY or DD.MM.YY
        # The compact forms must be a whole digit run: without the lookarounds
        # they match inside longer numbers (IDs, phone numbers) and a failed
        # 8-digit parse would be retried on its own 6-digit prefix.
        r"(?<!\d)(\d{8})(?!\d)", # YYYYMMDD or DDMMYYYY
        r"(?<!\d)(\d{6})(?!\d)", # DDMMYY or YYMMDD
    ]

    text_patterns = [
        r"(\d{1,2}(?:st|nd|rd|th)\s+of\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*,?\s*\d{2,4})", # 1st of Jan 2020 or 2nd of Feb 2021 etc.
        r"(\d{1,2}(?:st|nd|rd|th)?\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*,?\s*\d{2,4})", # 1st Jan 2020 or 2nd Feb 2021 etc.
        r"((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}(?:st|nd|rd|th)?\s*,?\s*\d{2,4})", # Jan 1st 2020 or Feb 2nd 2021 etc.
        r"(\d{1,2}(?:st|nd|rd|th)?\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z,]*\s*\d{2,4})", # 1st Jan 2020 or 2nd Feb 2021 etc.
    ]

    extended_patterns = [
        r"(\d{1,2}(?:st|nd|rd|th)?\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*.*?\d{4})", # 1st Jan ... 2020 (arbitrary text before the year)
        r"(the\s+\d{1,2}(?:st|nd|rd|th)?\s+of\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*.*?\d{4})", # the 1st of Jan ... 2020
    ]

    output_format = '%d-%m-%Y'

    for pattern in basic_patterns + text_patterns + extended_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue

        date_str = match.group(0).strip()

        try:
            # Work out which numeric layout was matched and which strptime
            # formats to try for it, in order of preference.
            if re.match(r'\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2}', date_str):
                formats = ('%Y/%m/%d',)
            elif re.match(r'\d{1,2}[-/\.]\d{1,2}[-/\.]\d{4}', date_str):
                formats = ('%d/%m/%Y', '%m/%d/%Y')
            elif re.match(r'\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2}', date_str):
                # %y already yields a full year (00-68 -> 2000-2068,
                # 69-99 -> 1969-1999), so no century fix-up is needed.
                formats = ('%d/%m/%y', '%m/%d/%y')
            elif re.match(r'\d{8}', date_str):
                if date_str[:4] > '1900':
                    # Looks like a leading year, but DDMMYYYY values such as
                    # 31122022 also start with "digits > 1900"; fall back to
                    # day-first when the year-first reading is invalid.
                    formats = ('%Y%m%d', '%d%m%Y')
                else:
                    formats = ('%d%m%Y',)
            elif re.match(r'\d{6}', date_str):
                formats = ('%d%m%y', '%y%m%d')
            else:
                formats = None

            if formats is not None:
                # Normalise '-' and '.' separators to '/'; a no-op for the
                # compact all-digit layouts.
                normalised = re.sub(r'[-\.]', '/', date_str)
                return _parse_with_formats(normalised, formats).strftime(output_format)

            # Month-name dates: strip ordinal suffixes, "of", commas and a
            # trailing full stop, then let pandas interpret the remainder.
            date_str = re.sub(r'\b(\d+)(?:st|nd|rd|th)\b', r'\1', date_str)
            date_str = re.sub(r'\s+of\s+', ' ', date_str)
            date_str = re.sub(r'\s*,\s*', ' ', date_str)
            date_str = re.sub(r'\s+', ' ', date_str)
            date_str = re.sub(r'\.+$', '', date_str)

            date_parts = re.findall(
                r'\d+|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*',
                date_str,
                re.IGNORECASE,
            )
            if len(date_parts) >= 3:
                # First two tokens are day/month (either order) and the last
                # token is the year; anything between them is filler text
                # picked up by the extended patterns.
                clean_date_str = f"{date_parts[0]} {date_parts[1]} {date_parts[-1]}"
                date_obj = pd.to_datetime(clean_date_str, errors='coerce')
                if pd.notnull(date_obj):
                    return date_obj.strftime(output_format)

            date_obj = pd.to_datetime(date_str, errors='coerce')
            if pd.notnull(date_obj):
                return date_obj.strftime(output_format)

        except Exception:
            # Deliberate best effort: a candidate that cannot be parsed must
            # not abort the search, so move on to the next pattern.
            continue

    return None

In [4]:
# Run the extractor over every test sentence and store the result next to the label.
predicted_dates = test_dates['Input'].apply(extract_dates)
test_dates['Predicted Output'] = predicted_dates

comparison_columns = ['Expected Output', 'Predicted Output']
print(test_dates[comparison_columns].head())

# Share of rows whose prediction equals the expected string exactly, as a percentage.
is_exact_match = test_dates['Expected Output'].eq(test_dates['Predicted Output'])
accuracy = is_exact_match.mean() * 100
print(f"\nAccuracy: {accuracy:.2f}%")

  Expected Output Predicted Output
0      05-03-2023       05-03-2023
1      07-08-1990       07-08-1990
2      31-12-2022       31-12-2022
3      01-01-2000       01-01-2000
4      15-09-2021       15-09-2021

Accuracy: 100.00%


# Part 3

In [5]:
import spacy
# Load the spaCy pipeline used for tokenising and tagging the pronoun sentences.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Load the labelled pronoun-transformation test cases and preview the first rows.
PRONOUN_TESTCASES_CSV = "data/pronoun_testcases.csv"
pronoun_testcases = pd.read_csv(PRONOUN_TESTCASES_CSV)
pronoun_testcases.head()

Unnamed: 0,input_text,target_gender,expected_output
0,He is going to the market.,female,She is going to the market.
1,His book is on the table.,female,Her book is on the table.
2,I saw him yesterday.,female,I saw her yesterday.
3,He hurt himself.,female,She hurt herself.
4,I called him last night.,female,I called her last night.


In [6]:
def transform_pronouns(text, target_gender=None):
    """Swap gendered third-person singular pronouns in ``text``.

    The text is tokenised with the notebook-level spaCy pipeline ``nlp``.
    Each pronoun token is replaced by its counterpart of the other gender,
    keeping the token's capitalisation (lower, Capitalised or UPPER). All
    other tokens and the whitespace between tokens are reproduced as-is;
    whitespace after the final token is dropped.

    ``her`` is ambiguous, so the token's fine-grained tag decides: a token
    tagged ``PRP$`` becomes ``his``, anything else becomes ``him``.
    Known limitation: ``his`` always becomes ``her``, even where ``hers``
    would be the grammatical choice.

    Args:
        text: Sentence(s) to transform.
        target_gender: ``None`` (default) swaps pronouns in both directions.
            ``'female'`` only rewrites male pronouns; ``'male'`` only rewrites
            female pronouns. Case and surrounding whitespace are ignored.

    Returns:
        The transformed text.

    Raises:
        ValueError: if ``target_gender`` is neither ``None``, ``'male'`` nor
            ``'female'``.
    """
    male_to_female = {
        'he': 'she',
        'him': 'her',
        'his': 'her',
        'himself': 'herself',
    }
    female_to_male = {
        'she': 'he',
        'hers': 'his',
        'herself': 'himself',
    }

    if target_gender is None:
        to_female = to_male = True
    else:
        gender = str(target_gender).strip().lower()
        if gender not in ('male', 'female'):
            raise ValueError(
                f"target_gender must be 'male', 'female' or None, got {target_gender!r}"
            )
        to_female = gender == 'female'
        to_male = gender == 'male'

    def match_case(original, replacement):
        """Give ``replacement`` the same capitalisation style as ``original``."""
        if original.isupper():
            return replacement.upper()
        if original[0].isupper():
            return replacement.capitalize()
        return replacement

    doc = nlp(text)
    last_index = len(doc) - 1
    pieces = []

    for i, token in enumerate(doc):
        word = token.text
        # Look pronouns up case-insensitively so unusual casing (e.g. "hER")
        # is handled instead of raising KeyError.
        key = word.lower()
        replacement = None

        if to_female and key in male_to_female:
            replacement = male_to_female[key]
        elif to_male and key in female_to_male:
            replacement = female_to_male[key]
        elif to_male and key == 'her':
            # Possessive determiner ("her book") vs. object pronoun ("saw her").
            replacement = 'his' if token.tag_ == 'PRP$' else 'him'

        pieces.append(word if replacement is None else match_case(word, replacement))

        if i < last_index:
            pieces.append(token.whitespace_)

    return "".join(pieces)

In [7]:
# Transform every test sentence and store the result next to the expected text.
predicted_texts = pronoun_testcases['input_text'].apply(transform_pronouns)
pronoun_testcases['predicted_output'] = predicted_texts

comparison_columns = ['expected_output', 'predicted_output']
print(pronoun_testcases[comparison_columns].head())

# Share of rows whose output equals the expected sentence exactly, as a percentage.
is_exact_match = pronoun_testcases['expected_output'].eq(pronoun_testcases['predicted_output'])
pronoun_accuracy = is_exact_match.mean() * 100
print(f"\nPronoun Transformation Accuracy: {pronoun_accuracy:.2f}%")

               expected_output             predicted_output
0  She is going to the market.  She is going to the market.
1    Her book is on the table.    Her book is on the table.
2         I saw her yesterday.         I saw her yesterday.
3            She hurt herself.            She hurt herself.
4     I called her last night.     I called her last night.

Pronoun Transformation Accuracy: 100.00%
