# Read the result of Task 1 and finish the Task 2

In [1]:
import os
import re

In [2]:
#load data of task1
test_label_path = "./submission/answer.txt"

In [3]:
def create_label_dict(label_path):
    label_dict = {}  # y
    with open(label_path, "r", encoding="utf-8-sig") as f:
        file_text = f.read().strip()  

    # (id, label, start, end, query) or (id, label, start, end, query, time_org, timefix)
    for line in file_text.split("\n"):
        sample = line.split("\t")  
        sample[2], sample[3] = int(sample[2]), int(sample[3])

        if sample[0] not in label_dict:
            label_dict[sample[0]] = [sample[1:]]
        else:
            label_dict[sample[0]].append(sample[1:])

    return label_dict

test_label_dict = create_label_dict(test_label_path)

In [4]:
# check num(ID)
print(len(list(test_label_dict.keys())))

560


In [5]:
def process_labels(test_label_dict, label_type, normalize_label_func):
    """
    Process labels for a specific label type in the test label dictionary.

    Parameters:
    - test_label_dict (dict): Dictionary containing labels for the test dataset.
    - label_type (str): Specific label type to be processed.
    - normalize_label_func (function): Function to normalize label text.

    Returns:
    - processed_label_dict (dict): Processed label dictionary with normalized label text.
    - label_ids (list): List of record IDs that have labels of the specified type.

    This function processes labels for a specific label type, normalizes the label text, and returns the processed label dictionary along with a list of record IDs containing labels of the specified type.
    
    """
    processed_label_dict = {}
    label_ids = []

    for record_id, labels in test_label_dict.items():
        normalized_labels = []

        for label in labels:
            if label[0] == label_type:
                normalized_text = normalize_label_func(label[3])
                normalized_label = label[:3] + [label[3], normalized_text]
                normalized_labels.append(normalized_label)
            else:
                normalized_labels.append(label)

        if any(label[0] == label_type for label in normalized_labels):
            label_ids.append(record_id)

        processed_label_dict[record_id] = normalized_labels
    return processed_label_dict, label_ids

def check_label_results(processed_label_dict, label_ids, label_type):
    """
    Check and print the original and normalized label results for a specific label type.

    Parameters:
    - processed_label_dict (dict): Processed label dictionary.
    - label_ids (list): List of record IDs containing labels of the specified type.
    - label_type (str): Specific label type to check.

    This function checks and prints the original and normalized label results for a specific label type in the processed label dictionary.


    """
    for record_id in label_ids:
        for label in processed_label_dict[record_id]:
            if label[0] == label_type:
                print(f"ID: {record_id}, Original: {label[3]}, {label_type}: {label[4]}")

# Check the Duration

In [6]:
def normalize_duration(label):
    """
    Normalize duration labels to a standard format (ISO 8601 Duration).

    Parameters:
    - label (str): Original duration label to be normalized.

    Returns:
    - str: Normalized duration label in ISO 8601 Duration format.

    This function takes an original duration label and normalizes it to the ISO 8601 Duration format. It supports various input formats like 'two years', '2-3 months', '3 w', etc., and converts them to the standardized format 'P2Y3M', representing 2 years and 3 months.

    """
    number_map = {
        'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
        'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
        'eleven': '11', 'twelve': '12', 'thirteen': '13', 'fourteen': '14',
        'fifteen': '15', 'sixteen': '16', 'seventeen': '17', 'eighteen': '18',
        'nineteen': '19', 'twenty': '20', 'thirty': '30', 'forty': '40', 'fifty': '50'
    }

    label_text = label.lower()
    standard_form = ''

    temp = label_text.split()
    words = [number_map.get(word, word) for word in temp]

    k = 0
    for word in words:
        if any(char.isdigit() for char in word):
            break
        k += 1

    if k == len(words):
        return '<unknown>'

    label_text = ' '.join(words[k:])

    match = re.match(r'(\d+|\w+)(-)?(\d+)?\s*(year|yr|month|week|day|w|m|y|d)s?', label_text)
    if match:
        num1, hyphen, num2, unit = match.groups()

        if hyphen and num2:
            avg_num = (float(num1) + float(num2)) / 2
            num1 = str(avg_num)

        unit_map = {'year': 'Y', 'yr': 'Y', 'month': 'M', 'week': 'W', 'day': 'D',
                    'y': 'Y', 'm': 'M', 'w': 'W', 'd': 'D'}
        unit = unit_map.get(unit, unit)

        return f'P{num1}{unit}'

    return '<unknown>'


In [7]:
processed_duration_dict, duration_ids = process_labels(test_label_dict, 'DURATION', normalize_duration)

In [8]:
print(duration_ids)

['1023', '1029', '1055', '180', '689', '887', '894', 'file30591']


In [9]:
#check the results of processed_test_label_dict
check_label_results(processed_duration_dict, duration_ids, 'DURATION')

ID: 1023, Original: 2 years, DURATION: P2Y
ID: 1029, Original: 18/12, DURATION: <unknown>
ID: 1055, Original: several years, DURATION: <unknown>
ID: 180, Original: 2 years, DURATION: P2Y
ID: 689, Original: 18 months, DURATION: P18M
ID: 887, Original: Now 4-5 month, DURATION: P4.5M
ID: 894, Original: 16 yr, DURATION: P16Y
ID: file30591, Original: 6 weeks, DURATION: P6W


In [10]:
### DURATION of validation data
# {'1023': [['DURATION', 365, 372, '2 years', 'P2Y']],
#  '180': [['DURATION', 363, 370, '2 years', 'P2Y']],
#  '689': [['DURATION', 372, 381, '18 months', 'P18M']],
#  '887': [['DURATION', 410, 419, '4-5 month', 'P4.5M']],
#  '894': [['DURATION', 358, 363, '16 yr', 'P16Y']],
#  'file30591': [['DURATION', 232, 239, '6 weeks', 'P6W']]}

# need more postprocess, delete the error DURATION

# Check the Labels Set

In [11]:
def normalize_set_label(original_text):
    """
    Normalize duration labels to a standard format (ISO 8601 Duration).

    Parameters:
    - label (str): Original duration label to be normalized.

    Returns:
    - str: Normalized duration label in ISO 8601 Duration format.

    This function takes an original duration label and normalizes it to the ISO 8601 Duration format. It supports various input formats like 'two years', '2-3 months', '3 w', etc., and converts them to the standardized format 'P2Y3M', representing 2 years and 3 months.

    """
    set_rules = {
        'twice': 'R2',
        'years': 'RP1D',
        # possible??
        'daily': 'R1D',
        'weekly': 'R1W',
        'monthly': 'R1M',
        'annually': 'R1Y',
    }

    if original_text in set_rules:
        return set_rules[original_text]
    else:
        return original_text
 

In [12]:
processed_set_dict, set_ids = process_labels(test_label_dict, 'SET', normalize_set_label)

In [13]:
#check the results of processed_test_label_dict
check_label_results(processed_set_dict, set_ids, 'SET')

In [14]:
### there is no SET in Validation data
# careful of SET, maybe we can use training data to generate prediction, then try to compare SET
normalize_set_label('twice')

'R2'

# Check the DATE

In [15]:
def standardize_date_format(date_str):
    """
    Standardize the date format from various representations to the 'YYYY-MM-DD' format.

    Parameters:
    - date_str (str): Original date string in various formats.

    Returns:
    - str: Standardized date string in 'YYYY-MM-DD' format or '0000-00-00' for invalid cases.

    This function takes a date string in various representations and standardizes it to the 'YYYY-MM-DD' format. It supports a wide range of date formats, including different delimiters, month representations, and variations. If the input date string cannot be parsed, '0000-00-00' is returned.

    """
    date_str = date_str.lower()
    # "30th of the 5th 2016" 
    if match := re.match(r'(\d{1,2})(?:st|nd|rd|th)? of the (\d{1,2})(?:st|nd|rd|th)? (\d{4})', date_str):
        day, month, year = match.groups()
        day = day.zfill(2)
        month = month.zfill(2)
        return f"{year}-{month}-{day}"    
    
    # remove the, th, rd...(the 12th of December 2013)
    date_str = re.sub(r'the\s*(\d{1,2})(st|nd|rd|th)?', r'\1', date_str)    
    
    # solve 12/.03.14 ---> 12.03.14
    date_str = re.sub(r'([/.-])([/.-])', r'/', date_str)
        
    # month (12 of December 2013)
    month_map = {
        'jan': '01', 'january': '01',
        'feb': '02', 'february': '02',
        'mar': '03', 'march': '03',
        'apr': '04', 'april': '04',
        'may': '05',
        'jun': '06', 'june': '06',
        'jul': '07', 'july': '07',
        'aug': '08', 'august': '08',
        'sep': '09', 'sept': '09', 'september': '09',
        'oct': '10', 'october': '10',
        'nov': '11', 'november': '11',
        'dec': '12', 'december': '12'
    }
    
    # YYYY-MM-DD (2063-05-29)
    if re.match(r'\d{4}-\d{2}-\d{2}', date_str):
        return date_str
    # DD-MM-YYYY
    elif match := re.match(r'(\d{2})-(\d{2})-(\d{4})', date_str):
        day, month, year = match.groups()
        return f"{year}-{month}-{day}"
    
    # DDMMYYYY (02/01/2013)
    elif match := re.match(r'(\d{2})/(\d{2})/(\d{4})', date_str):
        day, month, year = match.groups()
        return f"{year}-{month}-{day}"

    # YYYYMMDD (20630201)
    elif re.match(r'\d{8}', date_str):
        return f"{date_str[0:4]}-{date_str[4:6]}-{date_str[6:8]}"
    
    #  solve 17/32064 or 1/32064
    elif match := re.match(r'(\d{1,2})/(\d{1,2})(\d{4})', date_str):
        day, month, year = match.groups()
        day = day.zfill(2)
        month = month.zfill(2)
        return f"{year}-{month}-{day}"
    
    # solve DD.MM.YYYY or D/M/YYYY or D.M.YYYY 格式
    elif re.match(r'\d{1,2}[./,]\d{1,2}[./,]\d{2,4}', date_str):
        parts = re.split(r'[./,]', date_str)
        day, month = parts[0], parts[1]
        year = parts[2] if len(parts[2]) == 4 else f"20{parts[2]}"
        day = f"0{day}" if len(day) == 1 else day
        month = f"0{month}" if len(month) == 1 else month
        return f"{year}-{month}-{day}"     

    # solve YY
    elif match := re.match(r'(\d{1,2})[./-](\d{1,2})[./-](\d{2})', date_str):
        day, month, year = match.groups()
        day = day.zfill(2)
        month = month.zfill(2)
        year = f"20{year}"
        return f"{year}-{month}-{day}"        
    
    # only MM/YYYY 
    elif match := re.match(r'(\d{1,2})/(\d{4})', date_str):
        month, year = match.groups()
        month = f"0{month}" if len(month) == 1 else month
        return f"{year}-{month}"      

    # solve /4/2062
    elif match := re.match(r'/(\d{1,2})/(\d{4})', date_str):
        month, year = match.groups()
        month = f"0{month}" if len(month) == 1 else month
        return f"{year}-{month}"
    
    # special case 
    elif match := re.match(r'(\d{3})/(\d{1,2})/(\d{4})', date_str):
        day, month, year = match.groups()
        day = f"0{day}" if len(day) == 1 else day
        month = f"0{month}" if len(month) == 1 else month
        return f"{year}-{month}-{day}"
    
    #  MM/YY
    elif match := re.match(r'(\d{1,2})/(\d{2})', date_str):
        month, year = match.groups()
        month = f"0{month}" if len(month) == 1 else month
        year = f"20{year}"
        return f"{year}-{month}"

    # DD-MMM-YY or DD-MMM-YYYY 
    elif match := re.match(r'(\d{1,2})-([a-zA-Z]{3})-(\d{2,4})', date_str):
        day, month_str, year = match.groups()
        month = month_map.get(month_str.lower(), '01')
        day = day.zfill(2)
        year = year if len(year) == 4 else f"20{year}"
        return f"{year}-{month}-{day}"    

    # 21 6.62
    elif match := re.match(r'(\d{1,2})[ ./-](\d{1,2})[ ./-](\d{2})', date_str):
        day, month, year = match.groups()
        day = day.zfill(2)
        month = month.zfill(2)
        year = f"20{year}"
        return f"{year}-{month}-{day}"
    
    # 0408.13 -->2013.04.08
    elif match := re.match(r'(\d{2})(\d{2})\.(\d{2})', date_str):
        month, day, year = match.groups()
        year = f"20{year}"
        return f"{year}-{month}-{day}"
    
    # YYYY
    elif re.match(r'\d{4}', date_str):
        return f"{date_str}"
    
    # YYY
    elif re.match(r'\d{3}', date_str):
        year = f"2{date_str}"
        return f"{year}"
    
    # solve /7/62
    elif match := re.match(r'/(\d{1,2})/(\d{2})', date_str):
        month, year = match.groups()
        month = f"0{month}" if len(month) == 1 else month
        year = f"20{year}"
        return f"{year}-{month}"
    
    #  the DDth of MMMM YYYY
    elif match := re.match(r'the (\d{1,2})(?:st|nd|rd|th)? of ([a-zA-Z]+) (\d{4})', date_str):
        day, month_str, year = match.groups()
        month = month_map.get(month_str.lower()[:3], '01')
        day = f"0{day}" if len(day) == 1 else day
        return f"{year}-{month}-{day}"    
    
    # English month (03-Sep-2013)
    elif match := re.match(r'(\d{1,2})(?:st|nd|rd|th)?\s*of\s*([a-zA-Z]+)\s*(\d{2,4})', date_str):
        day, month_str, year = match.groups()
        month = month_map.get(month_str.lower()[:3], '01')
        day = f"0{day}" if len(day) == 1 else day
        year = year if len(year) == 4 else f"20{year}"
        return f"{year}-{month}-{day}"
    # September 2013
    elif match := re.match(r'([a-zA-Z]+)\s*(\d{2,4})', date_str):
        month_str, year = match.groups()
        month = month_map.get(month_str.lower()[:3], '01')
        return f"{year}-{month}"
    # 21 June 1962
    elif match := re.match(r'(\d{1,2})\s([a-zA-Z]+)\s(\d{2,4})', date_str):
        day, month_str, year = match.groups()
        month = month_map.get(month_str.lower()[:3], '01')
        day = f"0{day}" if len(day) == 1 else day
        year = year if len(year) == 4 else f"20{year}"
        return f"{year}-{month}-{day}"

    else:
        return '0000-00-00'
#        return date_str



In [16]:
print(standardize_date_format("the 12th of December 2013"))  
print(standardize_date_format("03-Sep-2013")) 
print(standardize_date_format("05/04/2013gbujvjv"))
print(standardize_date_format('0408.13'))

2013-12-12
2013-09-03
2013-04-05
2013-04-08


In [17]:
def normalize_date_label_for_record(record_id, label_dict):
    """
    Normalize date labels for a specific medical record.

    Parameters:
    - record_id (str): The ID of the medical record.
    - label_dict (dict): A dictionary containing the labels for different records.

    Returns:
    - list: A list of normalized labels for the specified medical record.

    This function takes a medical record ID and a dictionary of labels for different records. It returns a list of normalized labels for the specified medical record. Date labels are normalized using the 'standardize_date_format' function. If the record ID is not found in the label dictionary, an empty list is returned.

    """

    if record_id not in label_dict:
        return []

    normalized_labels = []

    # define now, previous, today, original
    special_dates = {
        "today": "2062-09-13",
        "previous": "2064-05-27",
        "now": "2062-11-27",
        "original": "2063-10-30"
    }

    for label in label_dict[record_id]:
        if label[0] == 'DATE':
            date_str = label[3].lower()  

            if date_str in special_dates:
                formatted_date = special_dates[date_str]  
            else:
                formatted_date = standardize_date_format(label[3]) 

            normalized_label = label[:3] + [label[3], formatted_date]
            normalized_labels.append(normalized_label)
        else:
            normalized_labels.append(label)  

    return normalized_labels


In [18]:
processed_date_dict, date_ids = process_labels(test_label_dict, 'DATE', standardize_date_format)


In [19]:
#check date_ids
#print(date_ids)

In [20]:
#check_label_results(processed_date_dict, date_ids, 'DATE')

In [21]:
#check the results of processed_test_label_dict
for record_id in date_ids:
    for label in processed_date_dict[record_id]:
        if label[0] == 'DATE' and label[4]=='0000-00-00':
            print(f"ID: {record_id}, Original: {label[3]}, DATE: {label[4]}")

ID: 1032, Original: Now, DATE: 0000-00-00
ID: 1050, Original: today, DATE: 0000-00-00
ID: 1055, Original: now, DATE: 0000-00-00
ID: 1057, Original: ED, DATE: 0000-00-00
ID: 1059, Original: now, DATE: 0000-00-00
ID: 1084, Original: today, DATE: 0000-00-00
ID: 250, Original: 26th of November, DATE: 0000-00-00
ID: 583, Original: original, DATE: 0000-00-00
ID: 591, Original: 69, DATE: 0000-00-00
ID: 672, Original: now, DATE: 0000-00-00
ID: 689, Original: Now, DATE: 0000-00-00
ID: 697, Original: now, DATE: 0000-00-00
ID: 721, Original: today, DATE: 0000-00-00
ID: 722, Original: today, DATE: 0000-00-00
ID: 724, Original: today, DATE: 0000-00-00
ID: 735, Original: original, DATE: 0000-00-00
ID: 751, Original: 3, DATE: 0000-00-00
ID: 751, Original: 52, DATE: 0000-00-00
ID: 768, Original: originally, DATE: 0000-00-00
ID: 768, Original: original, DATE: 0000-00-00
ID: 820, Original: now, DATE: 0000-00-00
ID: 875, Original: now, DATE: 0000-00-00
ID: 878, Original: now, DATE: 0000-00-00
ID: 893, Or

In [22]:
## in validation dataset, today, previous, now and original are constants
    # special_dates = {
    #     "today": "2062-09-13",
    #     "previous": "2064-05-27",
    #     "now": "2062-11-27",
    #     "original": "2063-10-30"
    # }
    
# we need to fix it

# Check the TIME

In [None]:

def standardize_time(time_str):
    """
    Standardize the format of time strings.

    Parameters:
    - time_str (str): The input time string.

    Returns:
    - str: The standardized time string in the format "HH:MM" or "00:00" if the input is invalid.

    This function takes an input time string and standardizes its format. It handles various formats, including those with or without minutes, seconds, and AM/PM indicators. The resulting string is in the format "HH:MM" or "00:00" if the input is invalid.

    """

    # check "00:00:00" it needs to fix
    if time_str == "00:00:00":
        return "00:00:00"
    elif re.match(r'\d{2}:\d{2}:\d{2}', time_str):
        return time_str[:5] 

    # fix '.' to ':', 13.55-->13:55remove 
    time_str = re.sub(r'\.', ':', time_str)
    
    # remove ' ', ' am'-->'am'
    time_str = re.sub(r'[^\d:apm]', '', time_str, flags=re.IGNORECASE)
        
    match = re.match(r'(\d{1,2}):?(\d{2})?([ap]m)?', time_str, re.IGNORECASE)
    if match:
        hour, minute, period = match.groups()
        hour = int(hour)

        minute = minute if minute else '00'

        if period:
            if period.lower() == 'pm' and hour < 12:
                hour += 12
            # elif period.lower() == 'am' and hour == 12:
            #     hour = 0

        return f"{str(hour).zfill(2)}:{minute}"
    else:
        
        return time_str


In [23]:
def standardize_datetime_format(datetime_str):
    """
    Standardize the format of datetime strings.

    Parameters:
    - datetime_str (str): The input datetime string.

    Returns:
    - str: The standardized datetime string in the format "YYYY-MM-DDTHH:MM" or "T00:00" if date part is not present.


    This function takes an input datetime string and standardizes its format. It handles various formats, including those with date and time, only date, or only time. The resulting string is in the format "YYYY-MM-DDTHH:MM" or "T00:00" if the date part is not present.

    """

    date_part, time_part = '', ''
    
    # remove hours or hrs, and use : to tag time    
    datetime_str = re.sub(r'(\d{2})(\d{2})\s*(hours|hrs)', r'\1:\2', datetime_str, flags=re.IGNORECASE)
    datetime_str = re.sub(r'(\d{2})[.: -](\d{2})\s*(hours|hrs)', r'\1:\2', datetime_str, flags=re.IGNORECASE)

    # 'a.m.'  'p.m.'  'am'  'pm'，
    datetime_str = re.sub(r'a\.m\.', 'am', datetime_str, flags=re.IGNORECASE)
    datetime_str = re.sub(r'p\.m\.', 'pm', datetime_str, flags=re.IGNORECASE)

    # remove space of 'am' and 'pm'
    datetime_str = re.sub(r'\s+(am|pm)', r'\1', datetime_str, flags=re.IGNORECASE)  
    
    # add space to 'at' and 'on' 
    datetime_str = datetime_str.replace('at', ' at ').replace('on', ' on ')
    
    #  solve "11.25" to 11:25
    #datetime_str = re.sub(r'\s(\d{2})\.(\d{2})\s', r' \1:\2 ', datetime_str)
    datetime_str = re.sub(r'(^|\s)(\d{2})\.(\d{2})($|\s)', r'\1\2:\3\4', datetime_str)

    # solve 'art' to 'at'
    datetime_str = re.sub(r'(^|\s)([a-zA-Z])([a-zA-Z])([a-zA-Z])($|\s)', 
                          lambda m: ' at ' if {'a', 't'}.issubset({m.group(2).lower(), m.group(3).lower(), m.group(4).lower()}) else m.group(0), 
                          datetime_str, flags=re.IGNORECASE)
    
    datetime_str = re.sub(r'(^|\s)([a-zA-Z])([a-zA-Z])([a-zA-Z])($|\s)', 
                          lambda m: ' on ' if {'o', 'n'}.issubset({m.group(2).lower(), m.group(3).lower(), m.group(4).lower()}) else m.group(0), 
                          datetime_str, flags=re.IGNORECASE)
    # solve 'o' to 'on'
    datetime_str = re.sub(r'\s[aA]\s', ' at ', datetime_str)
    datetime_str = re.sub(r'\s[oO]\s', ' on ', datetime_str)
    
    datetime_str = datetime_str.replace(', ', ' ')
    
    # fix ' 1 '
    datetime_str = re.sub(r'\s\d\s', ' ', datetime_str)
    
    parts = datetime_str.split()

    if len(parts) == 1:
        # date
        if ':' in parts[0] or 'am' in parts[0].lower() or 'pm' in parts[0].lower():
            time_part = parts[0]
        else:
            date_part = parts[0]
    elif len(parts) == 2:
        # date time
        if ':' in parts[0] or 'am' in parts[0].lower() or 'pm' in parts[0].lower():
            time_part, date_part = parts
        elif 'at' in parts[0]:
            time_part = parts[1]
        elif 'on' in parts[0]:
            date_part = parts[1]
        elif '.' in parts[1]:
            time_part, date_part = parts
        else:
            date_part, time_part = parts
    elif len(parts) == 3:
        # date at time, time on date
        if parts[1] == 'at' or parts[1] == '@':
            if ':' in parts[0] or 'am' in parts[0].lower() or 'pm' in parts[0].lower():
                time_part, date_part = parts[0], parts[2]
            else:
                date_part, time_part = parts[0], parts[2]
        elif parts[0] == 'at':
            if ':' in parts[1] or 'am' in parts[1].lower() or 'pm' in parts[1].lower():
                time_part, date_part = parts[1], parts[2]
            else:
                date_part, time_part = parts[1], parts[2]                
        elif parts[1] == 'on' or  parts[1] == 'and':
            if ':' in parts[2] or 'am' in parts[2].lower() or 'pm' in parts[2].lower():
                date_part, time_part = parts[0], parts[2]
            else:
                time_part, date_part = parts[0], parts[2]
        elif ':' in parts[0] or 'am' in parts[0].lower() or 'pm' in parts[0].lower():
            date_part = ' '.join(parts[1:])
            time_part = ' '.join(parts[0])
        elif ':' in parts[-1] or 'am' in parts[-1].lower() or 'pm' in parts[-1].lower():
            date_part = ' '.join(parts[:-1])
            time_part = ' '.join(parts[-1])
               
    elif len(parts) >= 4:
        # at and on (at 3.36 pm on 04.08.13)
        if 'at' in parts and 'on' in parts:
            at_index = parts.index('at')
            on_index = parts.index('on')
            if at_index < on_index:
                # 'at' front
                time_part = ' '.join(parts[at_index + 1:on_index])
                date_part = ' '.join(parts[on_index + 1:])
            else:
                # 'on' front
                date_part = ' '.join(parts[on_index + 1:at_index])
                time_part = ' '.join(parts[at_index + 1:])
        elif 'on' in parts:
            # only 'on'
            on_index = parts.index('on')
            after_on = ''.join(parts[on_index + 1:])  
            before_on = ''.join(parts[:on_index])    

            if any(x in parts[on_index - 1].lower() for x in [':', 'am', 'pm']):
                time_part = ' '.join(parts[:on_index])
                date_part = ' '.join(parts[on_index + 1:])
                
            elif after_on.isdigit() and len(after_on) == 4:
                time_part = after_on
                date_part = ' '.join(parts[:on_index])
            elif before_on.isdigit() and len(before_on) == 4:
                time_part = before_on
                date_part = ' '.join(parts[on_index + 1:])
            
            else:
                date_part = ' '.join(parts[:on_index])
                time_part = ' '.join(parts[on_index + 1:])
        elif 'at' in parts:
            # only 'at'
            at_index = parts.index('at')
            after_at = ''.join(parts[at_index + 1:])  
            before_at = ''.join(parts[:at_index])    
            
            if any(x in parts[at_index + 1].lower() for x in [':', 'am', 'pm']):
                time_part = ' '.join(parts[at_index + 1:])
                date_part = ' '.join(parts[:at_index])
            elif after_at.isdigit() and len(after_at) == 4:
                time_part = after_at
                date_part = ' '.join(parts[:at_index])
            elif before_at.isdigit() and len(before_at) == 4:
                time_part = before_at
                date_part = ' '.join(parts[at_index + 1:])            
            else:
                date_part = ' '.join(parts[at_index + 1:])
                time_part = ' '.join(parts[:at_index])
                
        elif ':' in parts[0] or 'am' in parts[0].lower() or 'pm' in parts[0].lower():
            date_part = ' '.join(parts[1:])
            time_part = ' '.join(parts[0])
        elif ':' in parts[-1] or 'am' in parts[-1].lower() or 'pm' in parts[-1].lower():
            date_part = ' '.join(parts[:-1])
            time_part = ' '.join(parts[-1])

    if date_part != '':
        standardized_date = standardize_date_format(date_part.strip())
        standardized_time = standardize_time(time_part.strip()) if time_part else "00:00"
        result = f"{standardized_date}T{standardized_time}"
    else:
        standardized_time = standardize_time(time_part.strip()) if time_part else "00:00"
        result = f"T{standardized_time}"
        
    return result


In [25]:
print(standardize_datetime_format("1300 hours on the 16th of December 2013"))
print(standardize_datetime_format("4.35pm"))

2013-12-16T13:00
T16:35


In [26]:
def normalize_time_labels_for_records(record_ids, label_dict):
    normalized_results = {}
    for record_id in record_ids:
        if record_id in label_dict:
            normalized_labels = []
            for label in label_dict[record_id]:
                if label[0] == 'TIME':
                    formatted_date = standardize_datetime_format(label[3])
                    normalized_label = label[:3] + [label[3], formatted_date]
                    normalized_labels.append(normalized_label)
                else:
                    normalized_labels.append(label)  
            normalized_results[record_id] = normalized_labels
    return normalized_results

In [27]:
record_ids = test_label_dict.keys()

In [28]:
#print(record_ids)

In [29]:
processed_time_dict = normalize_time_labels_for_records(record_ids, test_label_dict)

In [33]:
# # #check time result
# for record_id, labels in processed_time_dict.items():
#     for label in labels:
#         if label[0] == 'TIME':
#             print(f"ID: {record_id}, Original: {label[3]}, TIME: {label[4]}")

# Process all labels for Task 2

In [34]:
def process_all_labels(label_dict):
    """
    Process and normalize all labels in the given label dictionary.

    Parameters:
    - label_dict (dict): A dictionary containing label information for multiple records.

    Returns:
    - dict: A dictionary containing processed and normalized label information.

    This function takes a dictionary of label information for multiple records and processes all labels, normalizing them based on their types (DURATION, SET, DATE, TIME). The resulting dictionary contains the processed and normalized label information for each record.

    """

    processed_label_dict = {}

    special_dates = {
        "today": "2062-09-13",
        "previous": "2064-05-27",
        "now": "2062-11-27",
        "original": "2063-10-30"
    }
    
    for record_id, labels in label_dict.items():
        processed_labels = []
        for label in labels:
            label_type = label[0]
            original_text = label[3]
            normalized_label = None

            if label_type == 'DURATION':
                normalized_label = normalize_duration(original_text)
                
            elif label_type == 'SET':
                normalized_label = normalize_set_label(original_text)
                
            elif label_type == 'DATE':
                if original_text in special_dates:                    
                    normalized_label = special_dates[original_text]
                else:
                    normalized_label = standardize_date_format(original_text)
                    
            elif label_type == 'TIME':
                normalized_label = standardize_datetime_format(original_text)

            if label_type in ['DURATION','SET','DATE','TIME']:
                processed_label = label[:3] + [original_text, normalized_label]
            else:
                processed_label = label

            processed_labels.append(processed_label)

        processed_label_dict[record_id] = processed_labels

    return processed_label_dict

In [35]:
# do it all
processed_test_label_dict = process_all_labels(test_label_dict)

In [36]:
#double check 
for record_id in processed_test_label_dict.keys():
    for label in processed_test_label_dict[record_id]:
        if label[0] == 'DATE' and label[4]=='0000-00-00':
            print(f"ID: {record_id}, Original: {label[3]}, DATE: {label[4]}")

ID: 1032, Original: Now, DATE: 0000-00-00
ID: 1057, Original: ED, DATE: 0000-00-00
ID: 250, Original: 26th of November, DATE: 0000-00-00
ID: 591, Original: 69, DATE: 0000-00-00
ID: 689, Original: Now, DATE: 0000-00-00
ID: 751, Original: 3, DATE: 0000-00-00
ID: 751, Original: 52, DATE: 0000-00-00
ID: 768, Original: originally, DATE: 0000-00-00
ID: 893, Original: LV, DATE: 0000-00-00
ID: file20864, Original: REC, DATE: 0000-00-00


In [37]:
#double check 
for record_id in processed_test_label_dict.keys():
    for label in processed_test_label_dict[record_id]:
        if label[0] == 'SET':
            print(f"ID: {record_id}, Original: {label[3]}, SET: {label[4]}")

In [38]:
#double check 
for record_id in processed_test_label_dict.keys():
    for label in processed_test_label_dict[record_id]:
        if label[0] == 'DURATION':
            print(f"ID: {record_id}, Original: {label[3]}, DURATION: {label[4]}")

ID: 1023, Original: 2 years, DURATION: P2Y
ID: 1029, Original: 18/12, DURATION: <unknown>
ID: 1055, Original: several years, DURATION: <unknown>
ID: 180, Original: 2 years, DURATION: P2Y
ID: 689, Original: 18 months, DURATION: P18M
ID: 887, Original: Now 4-5 month, DURATION: P4.5M
ID: 894, Original: 16 yr, DURATION: P16Y
ID: file30591, Original: 6 weeks, DURATION: P6W


In [39]:
# #double check 
# for record_id in processed_test_label_dict.keys():
#     for label in processed_test_label_dict[record_id]:
#         if label[0] == 'TIME':
#             print(f"ID: {record_id}, Original: {label[3]}, TIME: {label[4]}")

# Write output answer file

In [40]:
def write_processed_labels_to_file(processed_label_dict, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        for sample_id, labels in processed_label_dict.items():
            for label in labels:
                output_line = f"{sample_id}\t{label[0]}\t{label[1]}\t{label[2]}\t{label[3]}"

                if len(label) > 4:
                    output_line += f"\t{label[4]}"

                output_line += "\n"
                f.write(output_line)



In [41]:

output_path = "./submission/final_answer.txt"

write_processed_labels_to_file(processed_test_label_dict, output_path)