## **Post Processing**

In [69]:
def valid(val):
    """Convert *val* to a number based on its text form.

    The value is first rendered with ``str``. If that text contains a decimal
    point it is parsed as ``float``; otherwise it is parsed as ``int``.

    Raises:
        ValueError: If the text is not a valid literal for the chosen parser.
    """
    text = str(val)
    parser = float if '.' in text else int
    return parser(text)

In [95]:
import re

# Each pattern captures two groups: (number, unit). The number is an integer or
# a decimal; optional whitespace may separate it from the unit spelling.
# Alternatives are tried left to right, so a short alias listed early (e.g. "g"
# before "grams?") wins over the longer spellings that follow it.

# Shared by width / depth / height. "foot" is listed explicitly: "feet?" only
# matches "fee"/"feet", so the singular spelling was previously never extracted.
_LENGTH_PATTERN = r'(\d+(?:\.\d+)?)\s*(mm|mms|millimeters?|milimetres?|millimetre|centimeters?|centimetres?|centimetre|cm|cms|meters?|metres?|metre|m|ms|kilometers?|kilometres?|kilometre|km|kms|inches?|inch|"|ft|foot|feet?|\'|yards?|yard|yd|yds)'

# Shared by item_weight / maximum_weight_recommendation.
_WEIGHT_PATTERN = r'(\d+(?:\.\d+)?)\s*(g|gs|grams?|grammes?|gram|kg|kgs|kilograms?|kilogrammes?|kilogram|mg|mgs|milligrams?|milligrammes?|milligram|µg|micrograms?|microgrammes?|microgram|oz|ozs|ounces?|ounce|lb|lbs|pounds?|pound|t|tons?|tonne|tonnes?)'

# Attribute name -> regex used to pull "<number><unit>" out of free text.
additional_patterns = {
    'width': _LENGTH_PATTERN,
    'depth': _LENGTH_PATTERN,
    'height': _LENGTH_PATTERN,

    'item_weight': _WEIGHT_PATTERN,
    'maximum_weight_recommendation': _WEIGHT_PATTERN,

    'voltage': r'(\d+(?:\.\d+)?)\s*(V|Vs|volts?|volt|kV|kVs|kilovolts?|kilovolt|mV|mVs|millivolts?|millivolt|µV|µVs|microvolts?|microvolt)',
    'wattage': r'(\d+(?:\.\d+)?)\s*(W|Ws|watts?|watt|kW|kWs|kilowatts?|kilowatt|MW|MWs|megawatts?|megawatt)',

    'item_volume': r'(\d+(?:\.\d+)?)\s*(ml|mls|milliliters?|millilitres?|millilitre|milliliter|l|ls|liters?|litres?|litre|liter|cl|cls|centiliters?|centilitres?|centilitre|centiliter|dl|dls|deciliters?|decilitres?|decilitre|deciliter|µl|µls|microliters?|microlitres?|microlitre|microliter|gal|gals|gallons?|gallon|imperial\s*gallons?|imperial\s*gallon|cups?|cup|fl\s*oz|fl\s*ozs|fluid\s*ounces?|fluid\s*ounce|pt|pts|pints?|pint|qt|qts|quarts?|quart|cubic\s*foot|cubic\s*feet|cubic\s*inch(?:es)?)',
}

def _alias_table(canonical_to_aliases):
    """Invert ``{canonical: (alias, ...)}`` into a flat ``{alias: canonical}`` dict."""
    return {
        alias: canonical
        for canonical, aliases in canonical_to_aliases.items()
        for alias in aliases
    }


_LENGTH_ATTRS = ('width', 'depth', 'height', 'length')
_WEIGHT_ATTRS = ('item_weight', 'maximum_weight_recommendation', 'weight')

_LENGTH_UNITS = _alias_table({
    'millimetre': ('mm', 'mms', 'millimetre', 'millimeter', 'millimetres',
                   'millimeters', 'milimetres'),
    'centimetre': ('cm', 'cms', 'centimetre', 'centimeter', 'centimetres', 'centimeters'),
    'metre': ('m', 'ms', 'metre', 'meter', 'metres', 'meters'),
    'kilometre': ('km', 'kms', 'kilometre', 'kilometer', 'kilometres', 'kilometers'),
    'inch': ('in', 'inch', 'inches', '"'),
    'foot': ('ft', 'foot', 'feet', "'"),
    'yard': ('yd', 'yds', 'yard', 'yards'),
})

_WEIGHT_UNITS = _alias_table({
    'gram': ('g', 'gs', 'gram', 'grams', 'gramme', 'grammes'),
    'milligram': ('mg', 'mgs', 'milligram', 'milligrams', 'milligramme', 'milligrammes'),
    'kilogram': ('kg', 'kgs', 'kilogram', 'kilograms', 'kilogramme', 'kilogrammes'),
    'microgram': ('µg', 'microgram', 'micrograms', 'microgramme', 'microgrammes'),
    'ounce': ('oz', 'ozs', 'ounce', 'ounces'),
    'pound': ('lb', 'lbs', 'pound', 'pounds'),
    'ton': ('t', 'ton', 'tons', 'tonne', 'tonnes'),
})

_VOLTAGE_UNITS = _alias_table({
    'millivolt': ('mv', 'mvs', 'millivolt', 'millivolts'),
    'kilovolt': ('kv', 'kvs', 'kilovolt', 'kilovolts'),
    'volt': ('v', 'vs', 'volt', 'volts'),
})

_WATTAGE_UNITS = _alias_table({
    'kilowatt': ('kw', 'kws', 'kilowatt', 'kilowatts'),
    'watt': ('w', 'ws', 'watt', 'watts'),
})
# Megawatts are not kept as a unit; they are converted to kilowatts.
_MEGAWATT_ALIASES = frozenset(('mw', 'mws', 'megawatt', 'megawatts'))

_VOLUME_UNITS = _alias_table({
    'millilitre': ('ml', 'mls', 'millilitre', 'milliliter', 'millilitres', 'milliliters'),
    'litre': ('l', 'ls', 'litre', 'liter', 'litres', 'liters'),
    'centilitre': ('cl', 'cls', 'centilitre', 'centiliter', 'centilitres', 'centiliters'),
    'decilitre': ('dl', 'dls', 'decilitre', 'deciliter', 'decilitres', 'deciliters'),
    'microlitre': ('µl', 'µls', 'microlitre', 'microliter', 'microlitres', 'microliters'),
    'gallon': ('gal', 'gals', 'gallon', 'gallons'),
    'cup': ('cup', 'cups'),
    'pint': ('pt', 'pts', 'pint', 'pints'),
    'quart': ('qt', 'qts', 'quart', 'quarts'),
})
# Compared with all spaces removed, so "fl oz", "floz" and "fl  oz" all match.
_FLUID_OUNCE_ALIASES = frozenset(('floz', 'flozs', 'fluidounce', 'fluidounces'))


def standardize_unit(value, unit, attr):
    """Parse *value* and map *unit* to the canonical unit name for *attr*.

    Args:
        value: Numeric text (or number); parsed with ``valid``.
        unit (str): Unit spelling as found in the text. It is lower-cased and
            its whitespace is collapsed before lookup.
        attr (str): Attribute the measurement belongs to. This selects the
            alias table and the fallback unit.

    Returns:
        tuple: ``(number, unit_name)``. Unrecognised units fall back to
        ``'metre'`` for length attributes, ``'kilogram'`` for weight
        attributes and ``'litre'`` for ``item_volume``. For voltage, wattage,
        unknown attributes, and "cubic ..." volumes that are neither feet nor
        inches, the normalised input unit is returned unchanged. Megawatt
        values are multiplied by 1000 and reported in ``'kilowatt'``.

    Raises:
        ValueError: Propagated from ``valid`` when *value* is not numeric.
    """
    # Lower-casing makes "mW" and "MW" indistinguishable; both are treated as
    # megawatt below.
    unit = ' '.join(unit.lower().split())
    amount = valid(value)

    if attr in _LENGTH_ATTRS:
        return amount, _LENGTH_UNITS.get(unit, 'metre')

    if attr in _WEIGHT_ATTRS:
        return amount, _WEIGHT_UNITS.get(unit, 'kilogram')

    if attr == 'voltage':
        return amount, _VOLTAGE_UNITS.get(unit, unit)

    if attr == 'wattage':
        if unit in _MEGAWATT_ALIASES:
            return amount * 1000, 'kilowatt'
        return amount, _WATTAGE_UNITS.get(unit, unit)

    if attr == 'item_volume':
        if 'cubic' in unit:
            # "feet" must be checked explicitly: it contains neither "foot"
            # nor "ft", so "cubic feet" used to slip through unnormalised.
            if 'foot' in unit or 'feet' in unit or 'ft' in unit:
                return amount, 'cubic foot'
            if 'in' in unit:  # also covers "inch" / "inches"
                return amount, 'cubic inch'
            return amount, unit
        if unit.replace(' ', '') in _FLUID_OUNCE_ALIASES:
            return amount, 'fluid ounce'
        if 'imperial' in unit:
            return amount, 'imperial gallon'
        return amount, _VOLUME_UNITS.get(unit, 'litre')

    return amount, unit


In [96]:
def extract_product_info(text, key):
    """Extract the first measurement for *key* from *text*.

    The regex registered for *key* in ``additional_patterns`` is searched
    case-insensitively in ``str(text)``. The first match is normalised with
    ``standardize_unit``.

    Args:
        text: The text to search; converted with ``str`` first.
        key (str): The attribute to extract (e.g., 'width', 'item_weight').

    Returns:
        str: ``"<value> <unit>"`` for the first match, or None when *key* is
        None, no pattern is registered for *key*, nothing matches, or the
        match cannot be normalised.
    """
    if key is None:
        return None

    # An unregistered attribute is treated as "nothing to extract" rather than
    # raising KeyError and aborting a whole batch run.
    pattern = additional_patterns.get(key)
    if pattern is None:
        return None

    match = re.search(pattern, str(text), re.IGNORECASE)
    if match is None:
        return None

    try:
        value, unit = match.groups()
        value, unit = standardize_unit(value, unit, key)
    except Exception:
        # Best effort: report the offending match and skip it. Unlike a bare
        # ``except``, this lets KeyboardInterrupt / SystemExit propagate.
        print(match.groups(), key)
        return None
    return f"{value} {unit}"
  

In [97]:
import pandas as pd

def get_entity_name_by_index(df, index, max_index=150000):
    """Return the fourth-column value of the first row whose first column equals *index*.

    The lookup is by value in the first column, not by row position.

    Args:
        df (pandas.DataFrame): Table to search. The first column holds the
            index values and the fourth column holds the value to return
            (presumably the entity name of test.csv; confirm against the data).
        index (int): The index value to look up.
        max_index (int): Exclusive upper bound accepted for *index*.

    Returns:
        The fourth-column value of the first matching row, or None (after
        printing a notice) if no row has that index value.

    Raises:
        ValueError: If *index* is not in ``[0, max_index)``.
    """
    if not (0 <= index < max_index):
        # Report the bound that is actually enforced. len(df) is unrelated,
        # because index values need not be contiguous row positions.
        raise ValueError(
            f"Invalid index value: {index}. Index must be between 0 and {max_index - 1}."
        )

    matches = df[df.iloc[:, 0] == index]
    if matches.empty:
        print(f"SN value '{index}' not found.")
        return None
    return matches.iloc[0, 3]


# Module-level table that process_csv passes to get_entity_name_by_index to
# resolve the attribute for each row. It is read from disk when this cell runs.
df_for_entity = pd.read_csv("resources/dataset/test.csv")

# Example lookup (kept commented out):
# entity_name = get_entity_name_by_index(df_for_entity, 1)

# print(entity_name)  # Output: width

In [None]:
import pandas as pd

def process_csv(input_file, output_file):
    """Extract one measurement per input row and write the results to a new CSV.

    For each row of *input_file*, the first column is used as the sample
    index and the second column as the text to search. The attribute to
    extract is looked up from the module-level ``df_for_entity`` by that index.

    Args:
        input_file (str): The path to the input CSV file.
        output_file (str): The path to the output CSV file. It is written with
            the columns ``index`` and ``prediction``.
    """
    df = pd.read_csv(input_file)

    results = []
    # itertuples gives plain positional access and keeps each column's dtype.
    # iterrows would turn every row into a Series, which can upcast an integer
    # index to float and needs deprecated positional ``row[0]`` indexing.
    for record in df.itertuples(index=False, name=None):
        sample_index, text = record[0], record[1]
        entity_name = get_entity_name_by_index(df_for_entity, sample_index)
        results.append([sample_index, extract_product_info(text, entity_name)])

    new_df = pd.DataFrame(results, columns=['index', 'prediction'])
    new_df.to_csv(output_file, index=False)

# Example usage: runs immediately when this cell executes. Reads "thanos.csv"
# and writes the index/prediction pairs to "unit_out.csv".
input_file = "thanos.csv"
output_file = "unit_out.csv"
process_csv(input_file, output_file)

In [99]:
import pandas as pd

def create_ultimate_csv(unit_out_file, ultimate_file, index_df):
    """Write one prediction row for every index listed in *index_df*.

    Predictions are read from *unit_out_file*. For each value in the first
    column of *index_df*, in order, the output holds that index and the
    prediction of the first row in *unit_out_file* with the same index. The
    prediction is left empty when there is no such row.

    Args:
        unit_out_file (str): Path to a CSV with ``index`` and ``prediction`` columns.
        ultimate_file (str): Path of the CSV to write (columns ``index``, ``prediction``).
        index_df (pd.DataFrame): DataFrame whose first column lists the indices to emit.
    """
    unit_out_df = pd.read_csv(unit_out_file)

    # Build the index -> prediction lookup once. setdefault keeps the first
    # occurrence, which is the row the previous per-index filtering selected.
    first_prediction = {}
    for idx, prediction in zip(unit_out_df['index'].values,
                               unit_out_df['prediction'].values):
        first_prediction.setdefault(idx, prediction)

    results = [
        [idx, first_prediction.get(idx)]
        for idx in index_df.iloc[:, 0].values
    ]

    ultimate_df = pd.DataFrame(results, columns=['index', 'prediction'])
    ultimate_df.to_csv(ultimate_file, index=False)

# Example usage: runs immediately when this cell executes. Note that the output
# is written to "infinity.csv", not "ultimate.csv".
unit_out_file = "unit_out.csv"
ultimate_file = "infinity.csv"

# The first column of this DataFrame supplies the indices, in output order.
index_df = pd.read_csv("resources/dataset/test.csv")  # Replace with the actual file path
create_ultimate_csv(unit_out_file, ultimate_file, index_df)
