In [None]:
# Mount Google Drive at /content/drive so the CSV files read by later cells
# (paths under /content/drive/MyDrive) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re

# Unit normalization dictionary: maps every accepted (lowercase) spelling or
# abbreviation to the canonical unit name emitted by parse_into_valid_form.
# Key order is irrelevant; matching always prefers the longest spelling.
unit_normalization_map = {
    'lbs': 'pound', 'lb': 'pound', 'pounds': 'pound', 'pound': 'pound',
    'kg': 'kilogram', 'kgs': 'kilogram', 'kilograms': 'kilogram', 'kilogram': 'kilogram',
    'g': 'gram', 'grams': 'gram', 'gram': 'gram',
    'mg': 'milligram', 'milligrams': 'milligram', 'milligram': 'milligram',
    'oz': 'ounce', 'ounces': 'ounce', 'ounce': 'ounce',
    'ton': 'ton', 'tons': 'ton',
    'cm': 'centimetre', 'centimetres': 'centimetre', 'centimeter': 'centimetre',
    'centimeters': 'centimetre', 'centimetre': 'centimetre',
    'mv': 'millivolt', 'millivolts': 'millivolt', 'millivolt': 'millivolt',
    'mm': 'millimetre', 'millimeters': 'millimetre', 'millimetres': 'millimetre',
    'millimetre': 'millimetre', 'millimeter': 'millimetre',
    'm': 'metre', 'meters': 'metre', 'metres': 'metre', 'metre': 'metre', 'meter': 'metre',
    'ft': 'foot', 'feet': 'foot', 'foot': 'foot',
    'yard': 'yard', 'yards': 'yard',
    'kv': 'kilovolt', 'kilovolts': 'kilovolt', 'kilovolt': 'kilovolt',
    'v': 'volt', 'volts': 'volt', 'volt': 'volt',
    'w': 'watt', 'watts': 'watt', 'watt': 'watt',
    'kw': 'kilowatt', 'kilowatts': 'kilowatt', 'kilowatt': 'kilowatt',
    'ml': 'millilitre', 'milliliters': 'millilitre', 'millilitres': 'millilitre',
    'milliliter': 'millilitre', 'millilitre': 'millilitre',
    'litres': 'litre', 'litre': 'litre', 'liters': 'litre', 'liter': 'litre', 'l': 'litre',
    'gallons': 'gallon', 'gallon': 'gallon',
    'pints': 'pint', 'pint': 'pint',
    'quarts': 'quart', 'quart': 'quart',
    'cups': 'cup', 'cup': 'cup',
    'fluid ounces': 'ounce', 'fluid ounce': 'ounce',
    'cl': 'centilitre', 'centilitres': 'centilitre', 'centilitre': 'centilitre',
    'centiliters': 'centilitre', 'centiliter': 'centilitre',
    'dl': 'decilitre', 'deciliters': 'decilitre', 'decilitre': 'decilitre',
    'decilitres': 'decilitre', 'deciliter': 'decilitre',
    'imperial gallons': 'gallon', 'imperial gallon': 'gallon',
    'in': 'inch', 'inch': 'inch', 'inches': 'inch',
    "''": 'inch', '"': 'inch',
    "'": 'foot'
}


def _unit_key_pattern(unit_key):
    """Return a regex fragment for ``unit_key`` that tolerates any run of whitespace between its words."""
    return r'\s+'.join(re.escape(part) for part in unit_key.split())


# Alternation of every known spelling, longest first, so that e.g. "mg" wins
# over "g" and "millilitre" wins over "m". Built once at definition time: if
# unit_normalization_map is edited later, re-run this cell to refresh it.
_UNIT_ALTERNATION = '|'.join(
    _unit_key_pattern(key)
    for key in sorted(unit_normalization_map, key=len, reverse=True)
)

# An unsigned decimal number: "12", "12.5" or ".5".
_NUMBER_PATTERN = r'(?:\d+(?:\.\d+)?|\.\d+)'
_NUMBER_RE = re.compile(_NUMBER_PATTERN)

# A number followed (optionally across whitespace/punctuation) by a whole unit.
# The lookbehind stops a match from starting in the middle of a number; the
# lookahead stops a unit from matching as the prefix of a longer word.
_QUANTITY_RE = re.compile(
    rf'(?<![\d.])({_NUMBER_PATTERN})[^a-z\d\'"]*({_UNIT_ALTERNATION})(?![a-z])'
)

# A unit spelling standing on its own (not embedded inside another word).
_UNIT_TOKEN_RE = re.compile(rf'(?<![a-z])({_UNIT_ALTERNATION})(?![a-z])')


def parse_into_valid_form(entity_value):
    """Normalize a free-form quantity string into ``"<float> <unit>"``.

    The text is lower-cased and commas are treated as decimal points. The
    first number that is directly followed by a known unit spelling is used.
    If no such pair exists, the first number and the first standalone unit
    token found anywhere in the text are paired instead (e.g. ``"cm 12"``).

    Units are matched as whole tokens, longest spelling first, so ``"5mg"``
    yields milligram rather than gram and unrelated words containing a unit
    letter (``"5 items"``) are rejected.

    Args:
        entity_value: Raw text such as ``"12,5 cm"`` or ``"5 lbs"``.

    Returns:
        A string like ``"12.5 centimetre"``, or ``""`` when the input is not
        a string or no number/unit pair can be found.
    """
    if not isinstance(entity_value, str):
        return ""

    # Same normalization as before: case-fold, trim, comma as decimal separator.
    text = entity_value.lower().strip().replace(',', '.')

    quantity = _QUANTITY_RE.search(text)
    if quantity:
        number_text, unit_text = quantity.groups()
    else:
        # Lenient fallback for text where the unit does not follow the number.
        number_match = _NUMBER_RE.search(text)
        unit_match = _UNIT_TOKEN_RE.search(text)
        if not number_match or not unit_match:
            return ""
        number_text, unit_text = number_match.group(0), unit_match.group(1)

    # Collapse inner whitespace so "fluid   ounces" finds the "fluid ounces" key.
    unit = unit_normalization_map.get(' '.join(unit_text.split()))
    if unit is None:
        return ""

    try:
        number = float(number_text)
    except ValueError:
        return ""

    return f"{number} {unit}"

# Prediction CSVs to merge (hard-coded Google Drive paths, in priority order:
# when two files share an "index" value the earlier file's row is kept).
csv_files = [
    '/content/drive/MyDrive/output_1.csv',
    '/content/drive/MyDrive/output_2.csv',
    '/content/drive/MyDrive/output_3.csv',
    '/content/drive/MyDrive/output_4.csv',
    '/content/drive/MyDrive/output_5.csv',
    '/content/drive/MyDrive/output_8a.csv',
    '/content/drive/MyDrive/output_9.csv',
]

# Load each CSV into its own DataFrame
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

# Stack the frames, keep the first row seen per "index" value, renumber rows
merged_df = (
    pd.concat(dfs)
    .drop_duplicates(subset='index', keep='first')
    .reset_index(drop=True)
)

# Normalize every prediction (stringified first) into "<number> <unit>" form
merged_df['prediction'] = merged_df['prediction'].apply(
    lambda raw_prediction: parse_into_valid_form(str(raw_prediction))
)

# Discard any column whose name starts with "Unnamed"
unnamed_columns = merged_df.columns.str.match("^Unnamed")
merged_df = merged_df.loc[:, ~unnamed_columns]


In [None]:
# Display the merged frame (the bare last expression is rendered as the cell output).
merged_df

Unnamed: 0,index,prediction
0,0,
1,1,
2,2,
3,3,
4,4,
...,...,...
130995,131096,
130996,131097,
130997,131098,
130998,131099,


In [None]:
# Write the merged predictions to CSV without the row-number column.
# The confirmation message is built from the same path variable, so it always
# names the file that was actually written (it previously said
# "final_output.csv" while writing "final_output_3.csv").
# NOTE(review): cells further down modify merged_df after this save; re-run
# this cell afterwards if those changes are meant to be in the file.
output_path = 'final_output_3.csv'
merged_df.to_csv(output_path, index=False)
print(f"Final output saved to {output_path}")

Final output saved to final_output.csv


In [None]:
import pandas as pd

# Pad merged_df (created by an earlier cell) with blank rows for every index
# after the current largest one, up to but excluding FINAL_INDEX_END.
# Presumably this is one past the largest index expected in the final
# output - TODO confirm against the test set.
FINAL_INDEX_END = 131288

# Use the largest index rather than the last row's value: the last row is only
# the largest when the frame happens to be sorted, and starting from a smaller
# value would create duplicate index entries.
last_value = merged_df['index'].max()

# New rows carry only an 'index'; every other column is an empty string.
new_rows = pd.DataFrame({'index': range(last_value + 1, FINAL_INDEX_END)})
for col in merged_df.columns:
    if col != 'index':
        new_rows[col] = ""

# Append the padding rows. Re-running this cell is a no-op because the range
# above is empty once the largest index has reached FINAL_INDEX_END - 1.
merged_df = pd.concat([merged_df, new_rows], ignore_index=True)

print(merged_df)


         index prediction
0            0           
1            1           
2            2           
3            3           
4            4           
...        ...        ...
131182  131283        NaN
131183  131284        NaN
131184  131285        NaN
131185  131286        NaN
131186  131287        NaN

[131187 rows x 2 columns]


In [None]:
import pandas as pd
import numpy as np

# Assumes merged_df already exists and has a 'prediction' column.

# Blank out missing predictions so every row holds a string.
# fillna is the dedicated pandas API for this and avoids the generic
# value-matching path of replace(np.nan, ...).
merged_df['prediction'] = merged_df['prediction'].fillna("")

print(merged_df)


         index prediction
0            0           
1            1           
2            2           
3            3           
4            4           
...        ...        ...
131182  131283           
131183  131284           
131184  131285           
131185  131286           
131186  131287           

[131187 rows x 2 columns]
