In [None]:
import pandas as pd
import re

# Read the merged prediction results from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/merged_results.csv')

In [None]:
# Display the loaded predictions frame for a quick visual check
df

Unnamed: 0,index,prediction
0,0,2.63 inch
1,1,6.0 inch
2,2,70.0 centimetre
3,3,7.0 inch
4,4,5.0 inch
...,...,...
131182,131283,1.500 pound
131183,131284,100.0 gram
131184,131285,6.0 ton
131185,131286,955.0 gram


In [None]:
# Allowed units for each entity name.
# Entities that measure the same kind of quantity share one unit list; every entry
# still gets its own set object, so changing one entity's units never affects another.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_VOLUME_UNITS = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
)

entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': set(_VOLUME_UNITS),
}



Cleaning completed and saved to 'final_test.csv'. Number of invalid entity_value types: 1623.


In [None]:
# Function to validate that a unit string is one of the allowed units
def is_valid_unit(value, entity_unit_map):
    """Return True when ``value`` is exactly one of the allowed units.

    Parameters
    ----------
    value : str
        Candidate unit text, e.g. ``'inch'`` or ``'cubic foot'``. Surrounding
        whitespace is ignored; the comparison is case-sensitive.
    entity_unit_map : dict
        Mapping of entity name to a collection of allowed unit strings.

    Returns
    -------
    bool
        True if the stripped ``value`` is a member of any entity's unit
        collection, False otherwise (including for non-string input).
    """
    if not isinstance(value, str):
        return False

    unit = value.strip()
    # Exact membership, not substring containment: a substring test would accept
    # 'inches', 'megawatt' (contains 'watt') or 'button' (contains 'ton'),
    # none of which is an allowed unit.
    return any(unit in units for units in entity_unit_map.values())

# Patterns used by clean_prediction, compiled once instead of on every call.
# Bracketed range such as '[123.0, 555.0] unit'
_RANGE_PATTERN = re.compile(r'\[(\d+\.\d+|\d+),\s*(\d+\.\d+|\d+)\]\s*([a-zA-Z\s]+)')
# Bracketed single value such as '[2000.0] unit'
_SINGLE_VALUE_PATTERN = re.compile(r'\[(\d+\.\d+|\d+)\]\s*([a-zA-Z\s]+)')
# Plain value such as '2.63 inch'
_VALID_VALUE_PATTERN = re.compile(r'(\d+\.\d+|\d+)\s*([a-zA-Z\s]+)')
# The standalone word 'to' and everything after it. The word boundaries matter:
# without them the pattern also matches the 'to' inside 'ton' and turns
# '6.0 ton' into '6.0', which is then rejected for having no unit.
_TO_SUFFIX_PATTERN = re.compile(r'\s*\bto\b.*')


def _format_if_valid(number, unit):
    """Return ('<number> <unit>', True) for an allowed unit, else ('', False)."""
    unit = unit.strip()
    if is_valid_unit(unit, entity_unit_map):
        return f'{number} {unit}', True
    return '', False


# Function to clean prediction values
def clean_prediction(value):
    """Normalise one raw prediction into ``'<number> <unit>'`` form.

    Accepted input shapes (matched from the start of the string):

    * ``'[a, b] unit'`` - a bracketed range; the larger bound is kept as a float.
    * ``'[a] unit'``    - a bracketed single value; the number is kept as a float.
    * ``'a unit'``      - a plain value; the number text is kept exactly as written.

    The standalone word ``to`` and everything after it is dropped before matching,
    and the unit is checked with ``is_valid_unit`` against ``entity_unit_map``.

    Parameters
    ----------
    value : object
        Raw prediction. Anything that is not a string (None, NaN, pd.NA, a bare
        number) is treated as invalid instead of raising.

    Returns
    -------
    tuple[str, bool]
        ``(cleaned_value, True)`` when the prediction is usable, otherwise
        ``('', False)``.
    """
    # Non-strings cannot carry a unit; rejecting them here also avoids the
    # TypeError that re.sub would raise on them.
    if not isinstance(value, str):
        return '', False

    # Leading whitespace would defeat the start-anchored .match() calls below.
    value = _TO_SUFFIX_PATTERN.sub('', value.strip())
    if not value:
        return '', False  # Invalid value (empty)

    # Handle ranges: keep the larger of the two bounds
    range_match = _RANGE_PATTERN.match(value)
    if range_match:
        low, high = float(range_match.group(1)), float(range_match.group(2))
        return _format_if_valid(max(low, high), range_match.group(3))

    # Handle single values with brackets
    single_match = _SINGLE_VALUE_PATTERN.match(value)
    if single_match:
        return _format_if_valid(float(single_match.group(1)), single_match.group(2))

    # Handle correctly formatted single values without brackets; the number is
    # passed through as text so its original formatting (e.g. '1.500') survives
    plain_match = _VALID_VALUE_PATTERN.match(value)
    if plain_match:
        return _format_if_valid(plain_match.group(1), plain_match.group(2))

    # Handle invalid patterns (e.g., '13.23.0 centimetre', a number with no unit, etc.)
    return '', False  # Invalid value

In [None]:
# Running tally of predictions that clean_prediction rejected
invalid_count = 0


# Per-value wrapper around clean_prediction that also maintains the tally
def clean_and_count_invalid(value):
    """Clean a single prediction and record it in ``invalid_count`` if rejected.

    Parameters
    ----------
    value : object
        Raw prediction, forwarded unchanged to ``clean_prediction``.

    Returns
    -------
    str
        The cleaned value reported by ``clean_prediction``.

    Side effects
    ------------
    Increments the module-level ``invalid_count`` by one whenever
    ``clean_prediction`` flags the value as invalid.
    """
    global invalid_count
    cleaned, ok = clean_prediction(value)
    if ok:
        return cleaned
    invalid_count += 1
    return cleaned

# Run every prediction in df through the cleaner; rejected values are tallied in invalid_count
df['prediction'] = df['prediction'].apply(func=clean_and_count_invalid)

# Write the cleaned frame to a new CSV file, leaving the row index out of it
df.to_csv(path_or_buf='final_test.csv', index=False)

# Report how many predictions were rejected
print(f"Cleaning completed and saved to 'final_test.csv'. Number of invalid entity_value types: {invalid_count}.")


In [None]:
# Run sanity.py in a shell, passing the test CSV and the cleaned predictions CSV.
# NOTE(review): what sanity.py checks is not visible in this notebook. The output path
# presumably points at the 'final_test.csv' saved earlier, which assumes the working
# directory is /content; confirm.
!python sanity.py --test_filename /content/test.csv --output_filename /content/final_test.csv