In [None]:
import numpy as np
import re 
import json
import ast
import random

### Examples of the LLM-Generated Function Set for each dataset

### Rayyan
- Each sub-function of the form `Dataset_Detect_attr` is generated by an LLM.
- We only combine these sub-functions behind a uniform input/output interface:
    - Rayyan_Row_Correction: takes the attribute name and a dirty cell value as input and tries to replace it with the clean value
    - Rayyan_Row_Generate: takes the attribute name and a clean cell value as input and injects noise using regex-based rules
    - Rayyan_Row_Detect: takes the attribute name and a cell value as input and uses regex to judge whether the cell is dirty

In [None]:
def Rayyan_Detect_atitle(cell):
    """Report whether an article-title cell contains a noise character.

    A cell counts as dirty as soon as it holds at least one '�' character or
    one combining diacritical mark in the range U+0300-U+036F.

    :param cell: cell text to inspect.
    :return: True if such a character occurs anywhere in ``cell``, else False.
    """
    return re.search(r'[�\u0300-\u036F]', cell) is not None
def Rayyan_Generate_atitle(cell):
    """Make an article title dirty by inserting one noise character.

    The inserted character is '�' or one of five combining marks, and it is
    placed at a random offset between 0 and ``len(cell)`` inclusive.

    :param cell: clean cell text.
    :return: ``cell`` with exactly one extra character.
    """
    noise_candidates = ['�', '\u0301', '\u0300', '\u0302', '\u0303', '\u0304']

    # Character is drawn before the offset; keeping this order keeps seeded
    # runs reproducible.
    noise = random.choice(noise_candidates)
    cut = random.randint(0, len(cell))

    return ''.join((cell[:cut], noise, cell[cut:]))
def Rayyan_Clean_atitle(cell):
    """Strip noise characters from a title-like cell.

    Removes every '�' character first, then every combining diacritical mark
    in the range U+0300-U+036F.

    :param cell: possibly dirty cell text.
    :return: the text with those characters removed.
    """
    for noise_pattern in (r'�', r'[\u0300-\u036F]'):
        cell = re.sub(noise_pattern, '', cell)
    return cell
def Rayyan_Detect_jtitle(cell):
    """Report whether a journal-title cell contains a noise character.

    :param cell: cell text to inspect.
    :return: True if ``cell`` contains '�' or a combining diacritical mark
        (U+0300-U+036F), else False.
    """
    noise_finder = re.compile(r'[�\u0300-\u036F]')
    hit = noise_finder.search(cell)
    return hit is not None
def Rayyan_Generate_jtitle(cell):
    """Make a journal title dirty by inserting one noise character.

    :param cell: clean cell text.
    :return: ``cell`` with one '�' or combining mark inserted at a random
        offset (0..len(cell), both ends included).
    """
    special_chars = ['�', '\u0301', '\u0300', '\u0302', '\u0303', '\u0304']

    # Two RNG draws, in this order: the character, then the insertion index.
    picked = random.choice(special_chars)
    index = random.randint(0, len(cell))

    # Inserting into a character list is equivalent to slicing around index.
    characters = list(cell)
    characters.insert(index, picked)
    return ''.join(characters)
def Rayyan_Clean_jtitle(cell):
    """Strip noise characters from a journal-title cell.

    :param cell: possibly dirty cell text.
    :return: ``cell`` without '�' characters and without combining
        diacritical marks (U+0300-U+036F).
    """
    # Inner call drops '�'; outer call drops the combining marks.
    return re.sub(r'[\u0300-\u036F]', '', re.sub(r'�', '', cell))
def Rayyan_Detect_author(cell):
    """Report whether an author-list cell contains a noise character.

    :param cell: cell text to inspect.
    :return: True when '�' or a combining diacritical mark (U+0300-U+036F)
        appears in ``cell``; False otherwise.
    """
    if re.search(r'[�\u0300-\u036F]', cell):
        return True
    return False
def Rayyan_Generate_author(cell):
    """Make an author list dirty by inserting one noise character.

    :param cell: clean cell text.
    :return: ``cell`` with one '�' or combining mark inserted at a random
        offset between 0 and ``len(cell)`` inclusive.
    """
    pool = ['�', '\u0301', '\u0300', '\u0302', '\u0303', '\u0304']

    # The character is drawn before the offset (order matters for seeded runs).
    extra = random.choice(pool)
    offset = random.randint(0, len(cell))

    head, tail = cell[:offset], cell[offset:]
    return f"{head}{extra}{tail}"
def Rayyan_Detect_issn(cell):
    """Report whether a journal_issn cell matches one of two dirty shapes.

    Dirty shape 1: three letters, a dash, then one or two digits ("Mon-DD").
    Dirty shape 2: 13 characters where the first is anything other than '9'
    and the remaining 12 are digits.

    :param cell: cell text to inspect.
    :return: True if either shape matches the whole cell, else False.
    """
    if re.match(r'^[A-Za-z]{3}-\d{1,2}$', cell) is not None:
        return True
    return re.match(r'^[^9]\d{12}$', cell) is not None
def Rayyan_Correct_issn(cell):
    """Repair a dirty journal_issn cell.

    Two dirty shapes are handled, the same ones Rayyan_Detect_issn looks for:

    * "Mon-DD" (three letters, dash, one or two digits) is swapped to
      "DD-Mon".
    * A 13-character value whose first character is not '9' and whose other
      12 characters are digits gets its first character replaced by '9'.

    Both patterns are anchored to the whole cell. Previously the date pattern
    was unanchored, so a value such as "Mar-1234" was silently truncated to
    "12-Mar" even though the detector never flags it.

    :param cell: cell text to repair.
    :return: the repaired text, or ``cell`` unchanged if neither shape matches.
    """
    date_match = re.match(r'^([A-Za-z]{3})-(\d{1,2})$', cell)
    if date_match:
        return f"{date_match.group(2)}-{date_match.group(1)}"

    # 13-character value not starting with '9': force the leading character to '9'.
    if re.match(r'^[^9]\d{12}$', cell):
        return f"9{cell[1:]}"

    return cell  # No pattern matched: return the original cell.
def Rayyan_Generate_issn(cell):
    """Inject noise into a clean journal_issn cell.

    * "DD-Mon" (one or two digits, dash, three letters) is reversed to
      "Mon-DD".
    * A 13-digit value starting with '9' gets its leading '9' replaced by a
      random digit from 0-8.

    The date pattern is anchored to the whole cell. Previously it was
    unanchored, so "12-March" was truncated to "Mar-12".

    :param cell: clean cell text.
    :return: the dirty text, or ``cell`` unchanged if neither shape matches.
    """
    date_match = re.match(r'^(\d{1,2})-([A-Za-z]{3})$', cell)
    if date_match:
        return f"{date_match.group(2)}-{date_match.group(1)}"

    if re.match(r'^9\d{12}$', cell):
        # One RNG draw over nine candidates, exactly as before.
        return f"{random.choice(['0', '1', '2', '3', '4', '5', '6', '7', '8'])}{cell[1:]}"

    return cell
def Rayyan_Detect_jissue(cell):
    """Report whether a volume/issue cell is blank.

    :param cell: cell text to inspect.
    :return: True if ``cell`` is empty or consists only of whitespace.
    """
    return re.match(r'^\s*$', cell) is not None
def Rayyan_Generate_jissue(cell):
    """Produce the dirty form of a volume/issue cell.

    :param cell: ignored.
    :return: always the empty string.
    """
    blank = ""
    return blank
def Rayyan_Correct_jissue(cell):
    """Produce the corrected form of a blank volume/issue cell.

    :param cell: ignored.
    :return: always the string "-1".
    """
    placeholder = "-1"
    return placeholder
def Rayyan_Detect_pagination(cell):
    """Report whether an article_pagination cell matches a known dirty shape.

    Clean shape: "<Mon>-NN" (English month abbreviation, dash, two digits).
    Dirty shapes: "NN-<Mon>" and "NN-N" (two digits, dash, one digit).

    :param cell: cell text to inspect.
    :return: True only for the two dirty shapes; False for the clean shape
        and for anything else.
    """
    # Clean cells are rejected up front.
    if re.match(r'^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}$', cell):
        return False

    month_last = re.match(r'^\d{2}-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)$', cell)
    digit_last = re.match(r'^\d{2}-\d$', cell)
    return bool(month_last or digit_last)
def Rayyan_Correct_pagination(cell):
    """Repair a dirty article_pagination cell into the "<Mon>-NN" shape.

    * "<Mon>-NN" is already clean and returned unchanged.
    * "NN-<Mon>" is swapped to "<Mon>-NN".
    * "NN-M", where M is a month number written as '1'..'12', becomes
      "<Mon>-NN".

    Fixes over the previous version:

    * A numeric suffix that is not a month number (e.g. "12-0") used to raise
      KeyError; it is now returned unchanged.
    * The numeric pattern captured a single digit only, so the '10', '11' and
      '12' entries of the month table could never be reached. One or two
      digits are now accepted.

    :param cell: cell text to repair.
    :return: the repaired text, or ``cell`` unchanged if no rule applies.
    """
    clean_pattern = re.compile(r'^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}$')
    swapped_pattern = re.compile(r'^(\d{2})-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)$')
    numeric_pattern = re.compile(r'^(\d{2})-(\d{1,2})$')

    month_map = {
        '1': 'Jan', '2': 'Feb', '3': 'Mar', '4': 'Apr', '5': 'May', '6': 'Jun',
        '7': 'Jul', '8': 'Aug', '9': 'Sep', '10': 'Oct', '11': 'Nov', '12': 'Dec'
    }

    # Already clean: nothing to do.
    if clean_pattern.match(cell):
        return cell

    # "NN-<Mon>": swap the two halves.
    match = swapped_pattern.match(cell)
    if match:
        return f"{match.group(2)}-{match.group(1)}"

    # "NN-M": translate the month number, but only if it is a known key.
    match = numeric_pattern.match(cell)
    if match:
        month_name = month_map.get(match.group(2))
        if month_name is not None:
            return f"{month_name}-{match.group(1)}"

    # No rule applies.
    return cell
def Rayyan_Generate_pagination(cell):
    """Inject noise into a clean "<Mon>-NN" article_pagination cell.

    The month abbreviation is replaced by its number ('1'..'12') and moved
    after the two digits, giving "NN-M". With probability 1/2, four random
    digits (1000-9999) are additionally prefixed to the two-digit part.

    Fixes over the previous version:

    * A three-letter prefix that is not a month abbreviation (e.g. "Xyz-12")
      used to produce a value with an empty month ("12-"); such cells are now
      returned unchanged.
    * The pattern is anchored to the whole cell, so trailing text is no
      longer silently dropped.

    :param cell: clean cell text.
    :return: the dirty text, or ``cell`` unchanged if it is not in the clean
        shape.
    """
    match = re.match(r'^([A-Za-z]{3})-(\d{2})$', cell)
    if not match:
        return cell  # Not in the clean shape.

    month, year = match.groups()

    month_to_num = {
        'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6',
        'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }
    month_num = month_to_num.get(month)
    if month_num is None:
        return cell  # Three letters, but not a month abbreviation.

    # Coin flip first, then (only on True) the four-digit prefix.
    if random.choice([True, False]):
        year = str(random.randint(1000, 9999)) + year

    return f"{year}-{month_num}"
def Rayyan_Detect_jcreate(cell):
    """Report whether an article_jcreated_at cell looks like "YY/MM/DD".

    The cell must start with three two-digit groups separated by '/'. It is
    flagged when the second group is in 1..12 and the third is in 1..31; the
    first group is not range-checked.

    NOTE(review): a value written as MM/DD/YY whose fields happen to fall in
    these ranges is flagged as well; the check cannot tell the two layouts
    apart.

    The previous bare ``except:`` is narrowed to TypeError, which is what
    ``re.match`` raises for a non-string cell. Other exceptions, including
    KeyboardInterrupt, are no longer swallowed.

    :param cell: cell text to inspect; non-string input yields False.
    :return: True if the cell matches the dirty layout, else False.
    """
    try:
        match = re.match(r'(\d{2})/(\d{2})/(\d{2})', cell)
    except TypeError:
        # Non-string cell (e.g. None or a float): treated as not dirty.
        return False

    if not match:
        return False

    month = int(match.group(2))
    day = int(match.group(3))
    return (1 <= month <= 12) and (1 <= day <= 31)
def Rayyan_Correct_jcreate(cell): ## Dirty -> Clean
    """Reorder a "YY/MM/DD" cell into "MM/DD/YY".

    :param cell: dirty cell text.
    :return: the reordered text, or '' when the cell is empty or does not
        start with three two-digit groups separated by '/'.
    """
    if cell == '':
        return ''

    parsed = re.match(r'(\d{2})/(\d{2})/(\d{2})', cell)
    if parsed is None:
        return ''

    year, month, day = parsed.groups()
    # The year goes through int() and is zero-padded back to two characters.
    year = '{:02}'.format(int(year))
    return f'{month}/{day}/{year}'
def Rayyan_Generate_jcreate(cell): ## Clean -> Dirty
    """Reorder an "MM/DD/YY" cell into "YY/MM/DD".

    :param cell: clean cell text.
    :return: the reordered text, or '' when the cell is falsy or does not
        start with three two-digit groups separated by '/'.
    """
    parsed = re.match(r'(\d{2})/(\d{2})/(\d{2})', cell) if cell else None
    if not parsed:
        return ''

    month, day, year = parsed.groups()
    # Month and day go through int() and are zero-padded to two characters.
    return '{}/{:02}/{:02}'.format(year, int(month), int(day))
def Rayyan_Row_Detect(x,cell):
    """Route a Rayyan cell to the detector that belongs to its attribute.

    :param x: attribute (column) name.
    :param cell: cell value to inspect.
    :return: the detector's result, or False for an attribute that has no
        detector.
    """
    detectors = (
        ('article_title', Rayyan_Detect_atitle),
        ('journal_title', Rayyan_Detect_jtitle),
        ('journal_issn', Rayyan_Detect_issn),
        # Volume and issue share one detector -- modify when label budget shrinks.
        ('article_jvolumn', Rayyan_Detect_jissue),
        ('article_jissue', Rayyan_Detect_jissue),
        ('article_jcreated_at', Rayyan_Detect_jcreate),
        ('article_pagination', Rayyan_Detect_pagination),
        ('author_list', Rayyan_Detect_author),
    )
    for attribute, detect in detectors:
        if x == attribute:
            return detect(cell)
    return False
# Rayyan_Row_Detect('article_jissue','')
def Rayyan_Row_Generate(x,cell): ## Input should be detected to clean, except jcreate_at
    """Route a Rayyan cell to the noise generator that belongs to its attribute.

    :param x: attribute (column) name.
    :param cell: cell value to make dirty.
    :return: the generator's result, or False for an attribute that has no
        generator.
    """
    generators = (
        ('article_title', Rayyan_Generate_atitle),
        ('journal_title', Rayyan_Generate_jtitle),
        ('journal_issn', Rayyan_Generate_issn),
        # Volume and issue share one generator -- modify when label budget shrinks.
        ('article_jvolumn', Rayyan_Generate_jissue),
        ('article_jissue', Rayyan_Generate_jissue),
        # Input should be clean for this one.
        ('article_jcreated_at', Rayyan_Generate_jcreate),
        ('article_pagination', Rayyan_Generate_pagination),
        ('author_list', Rayyan_Generate_author),
    )
    for attribute, generate in generators:
        if x == attribute:
            return generate(cell)
    return False
def Rayyan_Row_Correction(x,cell): ## Input should be detected to clean, except jcreate_at
    """Route a Rayyan cell to the corrector that belongs to its attribute.

    The three text attributes (article_title, journal_title, author_list) all
    go through Rayyan_Clean_atitle.

    :param x: attribute (column) name.
    :param cell: cell value to repair.
    :return: the corrector's result, or False for an attribute that has no
        corrector.
    """
    correctors = (
        ('journal_issn', Rayyan_Correct_issn),
        # Volume and issue share one corrector -- modify when label budget shrinks.
        ('article_jvolumn', Rayyan_Correct_jissue),
        ('article_jissue', Rayyan_Correct_jissue),
        ('article_jcreated_at', Rayyan_Correct_jcreate),
        ('article_pagination', Rayyan_Correct_pagination),
        ('article_title', Rayyan_Clean_atitle),
        ('journal_title', Rayyan_Clean_atitle),
        ('author_list', Rayyan_Clean_atitle),
    )
    for attribute, correct in correctors:
        if x == attribute:
            return correct(cell)
    return False

### Beers

In [None]:
def Beer_Detection_Ounces(cell):
    """Report whether an ounces cell is anything other than a run of digits.

    :param cell: cell text to inspect.
    :return: False if the cell consists of one or more digits only, True
        otherwise (including for the empty string).
    """
    return re.match(r'^\d+$', cell) is None
def correct_dirty_cell(cell):
    """Extract the first number from a cell.

    The first run of digits, optionally followed by '.' and more digits, is
    returned. When that text ends with ".0" it is reduced to its integer form
    ("12.0" -> "12").

    :param cell: cell text, e.g. "12.0 oz.".
    :return: the extracted number as a string, or ``cell`` unchanged if it
        contains no digits.
    """
    found = re.search(r'(\d+(\.\d+)?)', cell)
    if found is None:
        return cell

    number = found.group(1)
    return str(int(float(number))) if number.endswith('.0') else number
def Beer_Generation_Ounces(cell):
    """Make an ounces cell dirty by appending a unit descriptor.

    The cell is first normalised with correct_dirty_cell, then one of five
    descriptors (" oz.", " ounce", ...) is appended at random.

    :param cell: cell text.
    :return: normalised value followed by the chosen descriptor.
    """
    suffixes = [" oz.", " ounce", " OZ.", " oz. Alumi-Tek", " oz. Silo Can"]
    # Normalisation happens before the RNG draw.
    number_text = correct_dirty_cell(cell)
    return number_text + random.choice(suffixes)

def Beer_Detection_abv(cell):
    """Report whether an abv cell is dirty.

    :param cell: cell text to inspect.
    :return: True if the cell contains '%' anywhere, or is entirely digits,
        a dot, and four or more decimal digits; False otherwise.
    """
    return re.search(r'%|^\d+\.\d{4,}$', cell) is not None
def Beer_Generation_abv(cell, actions=("append_percent",)):
    """Make an abv cell dirty.

    With the default ``actions`` the result is always ``cell + "%"``, as
    before. Passing "alter_float" in ``actions`` enables a second kind of
    noise: for a value with exactly three decimal digits, the last digit is
    moved up or down by one (wrapping modulo 10) before "%" is appended.

    Fixes over the previous version:

    * "alter_float" was never in the choice list, so the branch was dead; it
      is now reachable through the backward-compatible ``actions`` parameter.
    * ``int(d) + random.choice([-1, 1]) % 10`` applied the modulo to the
      random step only, which could yield "10"; the sum is now wrapped.
    * Every path returns a string. The old "alter_float" path could fall off
      the end and return None.

    :param cell: clean cell text.
    :param actions: sequence of action names to draw from; known names are
        "append_percent" and "alter_float".
    :return: the dirty cell text.
    """
    # Drawn even for a single candidate, as before.
    action = random.choice(actions)

    if action == "alter_float" and "." in cell:
        parts = cell.split(".")
        decimals = parts[1]
        if len(decimals) == 3 and decimals[2] in "0123456789":
            step = random.choice([-1, 1])
            last_digit = str((int(decimals[2]) + step) % 10)
            return parts[0] + "." + decimals[:2] + last_digit + "%"

    # Default action, and fallback when "alter_float" does not apply.
    return cell + "%"
    
def Beer_Detection_city(cell):
    """Report whether a city cell ends with a two-capital-letter word.

    :param cell: cell text to inspect.
    :return: True if the cell ends with two uppercase letters preceded by a
        word boundary (e.g. "Portland OR"), else False.
    """
    return re.search(r'\b[A-Z]{2}$', cell) is not None
def Beer_Generation_city(cell):
    """Make a city cell dirty by appending a random US state abbreviation.

    :param cell: clean cell text.
    :return: ``cell``, a space, and one of the 50 two-letter abbreviations.
    """
    us_states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
    suffix = random.choice(us_states)
    return cell + " " + suffix
def Beer_Detection_state(cell):
    """Report whether a state cell is the empty string.

    :param cell: cell value to inspect.
    :return: True only when ``cell`` equals "".
    """
    is_missing = (cell == "")
    return is_missing
def Beer_Generation_state(cell):
    """Produce the dirty form of a state cell.

    :param cell: ignored.
    :return: always the empty string.
    """
    blank = ""
    return blank
def Beer_Row_Detection(x,y):
    """Route a Beers cell to the detector that belongs to its attribute.

    :param x: attribute (column) name.
    :param y: cell value to inspect.
    :return: the detector's result, or False for an attribute that has no
        detector.
    """
    if x == 'ounces':
        return Beer_Detection_Ounces(y)
    if x == 'abv':
        return Beer_Detection_abv(y)
    if x == 'city':
        return Beer_Detection_city(y)
    if x == 'state':
        return Beer_Detection_state(y)
    return False
def Beer_Row_Generation(x,y):
    """Route a Beers cell to the noise generator that belongs to its attribute.

    :param x: attribute (column) name.
    :param y: cell value to make dirty.
    :return: the generator's result, or False for an attribute that has no
        generator.
    """
    if x == 'ounces':
        # The ounces generator normalises the value before adding noise.
        return Beer_Generation_Ounces(y)
    if x == 'abv':
        return Beer_Generation_abv(y)
    if x == 'city':
        return Beer_Generation_city(y)
    if x == 'state':
        return Beer_Generation_state(y)
    return False


### imdb

In [None]:
def imdb_year_detection(cell):
    """
    Decide whether a startYear cell is dirty.

    A clean cell consists of exactly four digits; anything else is dirty.

    :param cell: A string representing a cell from the startYear column.
    :return: True if the cell is dirty, False if it is clean.
    """
    four_digits = re.match(r'^\d{4}$', cell)
    return four_digits is None
def imdb_year_generation(clean_cell):
    """
    Turn a clean year into a dirty one by keeping only its last two
    characters (e.g. "1999" -> "99").

    :param clean_cell: A string representing a clean cell (four-digit year).
    :return: The last two characters of ``clean_cell``; shorter input is
        returned whole.
    """
    last_two = clean_cell[-2:]
    return last_two
def imdb_runtime_detection(cell):
    """Decide whether a runtimeMinutes cell is dirty.

    :param cell: cell text to inspect.
    :return: False if the cell consists of one or more digits only, True
        otherwise.
    """
    return re.match(r'^\d+$', cell) is None

def imdb_runtime_generation(clean_cell):
    """Turn a runtime in minutes into a dirty "<hours> h" string.

    The minutes are divided by 60 and rounded to a randomly chosen precision
    of 1, 2 or 3 decimal places.

    :param clean_cell: integer number of minutes as a string.
    :return: e.g. "1.5 h" for "90".
    :raises ValueError: if ``clean_cell`` cannot be parsed by ``int``.
    """
    hours = int(clean_cell) / 60
    decimals = random.choice([1, 2, 3])
    return "{} h".format(round(hours, decimals))
def imdb_director_detection(cell):
    """Report whether a director cell contains a noise character.

    :param cell: cell text to inspect.
    :return: True if ``cell`` contains '�' or a combining diacritical mark
        (U+0300-U+036F), else False.
    """
    return re.search(r'[�\u0300-\u036F]', cell) is not None
def imdb_director_generation(cell):
    """Make a director cell dirty by inserting one noise character.

    :param cell: clean cell text.
    :return: ``cell`` with one '�' or combining mark inserted at a random
        offset between 0 and ``len(cell)`` inclusive.
    """
    candidates = ['�', '\u0301', '\u0300', '\u0302', '\u0303', '\u0304']

    # Two RNG draws, in this order: the character, then the offset.
    inserted = random.choice(candidates)
    where = random.randint(0, len(cell))

    return ''.join([cell[:where], inserted, cell[where:]])
def imdb_title_detection(cell):
    """Report whether a title-like cell contains a lowercase 'x'.

    Uses the ``in`` operator instead of calling ``__contains__`` directly;
    for strings the result is identical.

    NOTE(review): any cell containing 'x' is flagged, including legitimate
    values such as "Matrix".

    :param cell: cell text to inspect.
    :return: True if 'x' occurs in ``cell``, else False.
    """
    return 'x' in cell
def imdb_title_generation(input_string):
    """Make a title-like cell dirty by overwriting one character with 'x'.

    One character is picked at random from the input and every occurrence of
    it is replaced by 'x'.

    :param input_string: clean cell text.
    :return: the modified text; a falsy input is returned unchanged.
    """
    if input_string:
        # Randomly pick the character to overwrite.
        victim = random.choice(input_string)
        # str.replace swaps every occurrence of that character for 'x'.
        return input_string.replace(victim, 'x')
    return input_string
def Imdb_Row_Detect(x,cell):
    """Route an IMDb cell to the detector that belongs to its attribute.

    titleType and title share the same detector.

    :param x: attribute (column) name.
    :param cell: cell value to inspect.
    :return: the detector's result, or False for an attribute that has no
        detector.
    """
    if x == 'titleType':
        return imdb_title_detection(cell)
    if x == 'title':
        return imdb_title_detection(cell)
    if x == 'startYear':
        return imdb_year_detection(cell)
    if x == 'runtimeMinutes':
        return imdb_runtime_detection(cell)
    if x == 'director':
        return imdb_director_detection(cell)
    return False
    
def Imdb_Row_Generate(x,cell):
    """Route an IMDb cell to the noise generator that belongs to its attribute.

    titleType and title share the same generator.

    :param x: attribute (column) name.
    :param cell: cell value to make dirty.
    :return: the generator's result, or False for an attribute that has no
        generator.
    """
    if x == 'titleType':
        return imdb_title_generation(cell)
    if x == 'title':
        return imdb_title_generation(cell)
    if x == 'startYear':
        return imdb_year_generation(cell)
    if x == 'runtimeMinutes':
        return imdb_runtime_generation(cell)
    if x == 'director':
        return imdb_director_generation(cell)
    return False

### Tax

In [None]:
def Tax_Detect_fname(cell):
    """Report whether a name cell contains two consecutive single quotes.

    :param cell: cell text to inspect.
    :return: True if "''" occurs anywhere in ``cell``, else False.
    """
    return re.search(r"''", cell) is not None
def Tax_Generate_fname(cell):
    """Make a name cell dirty by doubling one of its single quotes.

    :param cell: clean cell text.
    :return: ``cell`` with an extra "'" inserted at the position of a
        randomly chosen existing single quote, or ``cell`` unchanged if it
        contains none.
    """
    quote_spots = [idx for idx in range(len(cell)) if cell[idx] == "'"]
    if len(quote_spots) == 0:
        return cell

    spot = random.choice(quote_spots)
    # Joining the two halves with "'" inserts the extra quote at `spot`.
    return "'".join((cell[:spot], cell[spot:]))
def Tax_Correct_fname(dirty_cell):
    """Collapse each doubled single quote in a name cell into one.

    Occurrences are handled left to right without overlap, so three quotes
    in a row become two.

    :param dirty_cell: dirty cell text.
    :return: the text with every "''" replaced by "'".
    """
    # Splitting on "''" and re-joining with "'" equals a non-overlapping replace.
    pieces = dirty_cell.split("''")
    return "'".join(pieces)
def Tax_Detect_city(cell):
    """Report whether a cell ends with the "-*" marker.

    :param cell: cell text to inspect.
    :return: True if ``cell`` ends with "-*", else False.
    """
    return re.search(r'-\*$', cell) is not None
def Tax_Generate_city(cell,num_samples=5):
    """Make a cell dirty by appending the "-*" marker.

    :param cell: clean cell text.
    :param num_samples: accepted but not used by this function.
    :return: ``cell`` followed by "-*".
    """
    marker = '-*'
    return cell + marker
def Tax_Correct_city(cell):
    """Remove a trailing "-*" marker from a cell.

    The pattern is now a raw string. The previous non-raw '-\\*$' literal
    contained an invalid escape sequence, which triggers a DeprecationWarning
    (SyntaxWarning from Python 3.12) when the cell is compiled.

    :param cell: dirty cell text.
    :return: ``cell`` without the "-*" at its end; unchanged if absent.
    """
    return re.sub(r'-\*$', '', cell)
def Tax_Detect_zip(cell):
    """Report whether a zip cell equals the value '1907'.

    :param cell: cell value to inspect.
    :return: True only when ``cell`` equals '1907'.
    """
    is_known_bad = (cell == '1907')
    return is_known_bad
def Tax_Generate_zip(cell):
    """Produce the dirty form of a zip cell.

    :param cell: ignored.
    :return: always the string '1907'.
    """
    dirty_zip = '1907'
    return dirty_zip
def Tax_Row_Detection(x,cell):
    """Route a Tax cell to the detector that belongs to its attribute.

    :param x: attribute (column) name.
    :param cell: cell value to inspect.
    :return: the detector's result, or False for an attribute that has no
        detector.
    """
    if x in ['f_name','l_name']:
        return Tax_Detect_fname(cell)
    if x in ['city','state','single_exemp','child_exemp']:
        return Tax_Detect_city(cell)
    if x in ['zip']:
        return Tax_Detect_zip(cell)
    return False
def Tax_Row_Generate(x,cell):
    """Route a Tax cell to the noise generator that belongs to its attribute.

    :param x: attribute (column) name.
    :param cell: cell value to make dirty.
    :return: the generator's result, or False for an attribute that has no
        generator.
    """
    if x in ['f_name','l_name']:
        return Tax_Generate_fname(cell)
    if x in ['city','state','single_exemp','child_exemp']:
        return Tax_Generate_city(cell)
    if x in ['zip']:
        return Tax_Generate_zip(cell)
    return False
def Tax_Row_Correction(x,cell):
    """Route a Tax cell to the corrector that belongs to its attribute.

    Only the name columns and the "-*"-marked columns have a corrector; zip
    has none.

    :param x: attribute (column) name.
    :param cell: cell value to repair.
    :return: the corrector's result, or False for an attribute that has no
        corrector.
    """
    if x in ['f_name','l_name']:
        return Tax_Correct_fname(cell)
    if x in ['city','state','single_exemp','child_exemp']:
        return Tax_Correct_city(cell)
    return False
def Tax_Row_Correction_pd(row):
    """Apply the Tax correctors to every cell of one row.

    The previous body referenced the undefined names ``x`` and ``cell`` and
    never used ``row``, so on a fresh kernel it raised NameError. It now
    iterates over the row and corrects each cell by column name.

    NOTE(review): assumes ``row`` is mapping-like (a dict or a pandas Series)
    offering ``.items()`` and ``.copy()`` -- confirm against the caller.
    Columns without a corrector are left unchanged; whether the caller
    expects the per-cell ``False`` sentinel instead should also be confirmed.

    :param row: mapping of column name to cell value.
    :return: a copy of ``row`` with name columns and "-*"-marked columns
        corrected.
    """
    corrected = row.copy()
    for x, cell in row.items():
        if x in ['f_name','l_name']:
            corrected[x] = Tax_Correct_fname(cell)
        elif x in ['city','state','single_exemp','child_exemp']:
            corrected[x] = Tax_Correct_city(cell)
    return corrected

### Hospital

In [1]:
def replace_random_char_with_x(input_string):
    """Overwrite one randomly chosen character of a string with 'x'.

    Every occurrence of the chosen character is replaced.

    :param input_string: text to modify.
    :return: the modified text; a falsy input is returned unchanged.
    """
    if input_string:
        chosen = random.choice(input_string)
        return input_string.replace(chosen, 'x')
    return input_string
def Hospital_Detect(cell):
    """Report whether a cell contains an 'x' in a suspicious position.

    Case-insensitively, the cell is flagged when an 'x' starts a word, or
    when an 'x' sits between two characters that are not letters a-z.

    :param cell: cell text to inspect.
    :return: True if such an 'x' is found, else False.
    """
    suspicious_x = re.search(r'\bx|\bx\b|[^a-z]x[^a-z]', cell, re.IGNORECASE)
    return suspicious_x is not None
def Hospital_Row_Detection(x,cell):
    """Run the Hospital detector on a cell; the same check applies to every attribute.

    :param x: attribute (column) name; not used.
    :param cell: cell value to inspect.
    :return: the result of Hospital_Detect for ``cell``.
    """
    verdict = Hospital_Detect(cell)
    return verdict
def Hospital_Row_Generation(x,cell):
    """Make a Hospital cell dirty; the same noise applies to every attribute.

    :param x: attribute (column) name; not used.
    :param cell: cell value to make dirty.
    :return: the result of replace_random_char_with_x for ``cell``.
    """
    noisy = replace_random_char_with_x(cell)
    return noisy

In [None]:
### Flights
def is_clean_time_format(time_str):
    """Report whether a string is in the clean "H:MM a.m." / "H:MM p.m." form.

    The hour has one or two digits, the minutes two digits, followed by one
    space and either "a.m." or "p.m.".

    :param time_str: text to inspect.
    :return: True if the whole string has that form, else False.
    """
    return re.match(r'^\d{1,2}:\d{2} (a\.m\.|p\.m\.)$', time_str) is not None
def Flight_Row_Generation(cell):
    """Make a Flights time cell dirty using one of seven random variants.

    Variants (chosen uniformly via ``np.random.choice``):
      0. remove spaces and dots, drop the last character, append 'Dec 1'
      1. prefix '11/30 '
      2. replace the cell with 'Not Available'
      3. remove spaces and dots, drop the last two characters, append 'noon'
      4. append ' (-00:00)'
      5. prefix '12/02/2011 '
      6. prefix 'Dec 02 '

    :param cell: clean cell text.
    :return: the dirty cell text.
    """
    variant = np.random.choice([0,1,2,3,4,5,6])

    if variant == 0:
        squeezed = cell.replace(' ','').replace('.','')
        return squeezed[:-1] + 'Dec 1'
    if variant == 1:
        return '11/30 ' + cell
    if variant == 2:
        return 'Not Available'
    if variant == 3:
        squeezed = cell.replace(' ','').replace('.','')
        return squeezed[:-2] + 'noon'
    if variant == 4:
        return cell + ' (-00:00)'
    if variant == 5:
        return '12/02/2011 ' + cell
    return 'Dec 02 ' + cell
def Flights_Row_Detection(x,cell):
    """Run the Flights time-format check on a cell, whatever its attribute.

    NOTE(review): this returns True when the cell IS in the clean time
    format, whereas the other *_Row_Detection functions in this notebook
    return True for dirty cells -- confirm the polarity the caller expects.

    :param x: attribute (column) name; not used.
    :param cell: cell value to inspect.
    :return: the result of is_clean_time_format for ``cell``.
    """
    matches_clean_format = is_clean_time_format(cell)
    return matches_clean_format
