In [1]:
# Environment setup: remove the preinstalled TensorFlow build, install the
# CPU-only TensorFlow package in its place, then install EasyOCR.
!pip uninstall -y tensorflow
!pip install tensorflow-cpu
!pip install easyocr


Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0
Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting ml-dtypes<0.5.0,>=0.3.1 (from tensorflow-cpu)
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting tensorboard<2.18,>=2.17 (from tensorflow-cpu)
  Downloading tensorboard-2.17.1-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.2.0 (from tensorflow-cpu)
  Downloading keras-3.5.0-py3-none-any.whl.metadata (5.8 kB)
Collecting namex (from keras>=3.2.0->tensorflow-cpu)
  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting optree (from keras>=3.2.0->tensorflow-cpu)
  Downloading optree-0.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.

In [None]:
import os
import pandas as pd
from PIL import Image
import re
import easyocr
# Build the English EasyOCR reader once at module level; predictor() calls
# reader.readtext() on every image, so it is shared rather than rebuilt.
reader = easyocr.Reader(['en'])
# Alternative configuration kept for reference (explicit GPU + quantization):
# reader = easyocr.Reader(["en"],gpu=True,quantize=True,)

In [3]:
# Mount Google Drive at /content/drive; the dataset CSV and the prediction
# output file used by the main cell both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Abbreviation map: short spelling of a unit -> its full unit name.
# Grouped by physical quantity; key order matches the original literal.
unit_abbreviation_map = dict([
    # length
    ('cm', 'centimetre'), ('mm', 'millimetre'), ('m', 'metre'),
    ('in', 'inch'), ('ft', 'foot'), ('yd', 'yard'),
    # weight
    ('g', 'gram'), ('kg', 'kilogram'), ('mg', 'milligram'),
    ('oz', 'ounce'), ('lb', 'pound'), ('t', 'ton'),
    # voltage
    ('kV', 'kilovolt'), ('V', 'volt'), ('mV', 'millivolt'),
    # wattage
    ('kW', 'kilowatt'), ('W', 'watt'),
    # volume
    ('cl', 'centilitre'), ('ml', 'millilitre'), ('l', 'litre'),
    ('fl oz', 'fluid ounce'), ('gal', 'gallon'),
    ('pt', 'pint'), ('qt', 'quart'),
    ('cu ft', 'cubic foot'), ('cu in', 'cubic inch'),
])

# Unit families shared by several entities. Each entity below gets its own
# set built from these tuples, so no two entities share one set object.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')

# Entity-unit map: entity name -> set of full unit names allowed for it.
# Some allowed units (e.g. 'microgram', 'cup', 'decilitre') have no entry in
# unit_abbreviation_map and can therefore only be matched by their full name.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
        'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
        'millilitre', 'pint', 'quart',
    },
}

In [5]:
# Collect every spelling (full name or abbreviation) accepted for an entity
def get_entity_units(entity_name):
    """Return the full unit names and abbreviations valid for ``entity_name``.

    Args:
        entity_name: Key looked up in ``entity_unit_map``.

    Returns:
        A new set holding the entity's full unit names plus every key of
        ``unit_abbreviation_map`` whose value is one of those names. An
        empty set is returned when the entity is not in ``entity_unit_map``.
    """
    full_names = entity_unit_map.get(entity_name)
    if full_names is None:
        return set()

    # Start from a copy so the shared map entry is never mutated.
    accepted = set(full_names)
    for short_form, long_form in unit_abbreviation_map.items():
        if long_form in full_names:
            accepted.add(short_form)
    return accepted

# Function to extract entity values with abbreviation handling
def extract_entity_values(text, entity_name):
    """Find ``<number> <unit>`` pairs in ``text`` that are valid for an entity.

    Matching is case-insensitive, so spellings such as ``"500 ML"``,
    ``"2 Kg"`` or ``"220 v"`` are recognised, and the unit is always
    reported by its full lower-case name. Numbers written with comma
    thousands separators (``"1,500 W"``) are read as a single value instead
    of only their trailing digit group.

    Args:
        text: String to scan (one OCR text fragment).
        entity_name: Entity whose allowed units are obtained from
            ``get_entity_units``.

    Returns:
        A list of ``(value, full_unit_name)`` tuples in order of appearance,
        with ``value`` as a float. Empty when the entity has no known units
        or nothing matches.
    """
    valid_units = get_entity_units(entity_name)
    if not valid_units:
        return []

    # Lower-cased abbreviation -> full name, so a match in any letter case
    # can be normalised. Lower-casing the map's keys produces no collisions.
    canonical = {abbr.lower(): full for abbr, full in unit_abbreviation_map.items()}

    # Longest alternatives first so multi-word units ('fluid ounce') and
    # longer abbreviations ('mm') are tried before their prefixes ('m').
    alternatives = sorted({unit.lower() for unit in valid_units},
                          key=lambda unit: (-len(unit), unit))
    unit_pattern = '|'.join(map(re.escape, alternatives))

    # Either a comma-grouped number (1,000 / 12,345.6) or a plain one
    # (12 / 12.5). The lookbehind stops a match from starting in the middle
    # of a digit run.
    number_pattern = r'(?<!\d)(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)'

    # NOTE: ignoring case also accepts upper-case single-letter units, so a
    # fragment like "3 M" is read as 3 metre.
    pattern = re.compile(number_pattern + r'\s*(' + unit_pattern + r')\b',
                         re.IGNORECASE)

    extracted = []
    for value, unit in pattern.findall(text):
        unit_key = unit.lower()
        # Abbreviations map to their full name; full names map to themselves.
        full_unit = canonical.get(unit_key, unit_key)
        extracted.append((float(value.replace(',', '')), full_unit))

    return extracted

# Pick one "<value> <unit>" string out of the candidates for an entity
def select_entity_value(extracted_values, entity_name):
    """Choose a single candidate according to per-entity heuristics.

    Magnitudes are compared as plain numbers; no unit conversion is done.

    Rules:
        * ``height`` and ``maximum_weight_recommendation``: largest magnitude.
        * ``width``: second-largest magnitude (largest if only one candidate).
        * ``depth``: smallest magnitude.
        * anything else: the first candidate's magnitude.

    Args:
        extracted_values: Sequence of ``(value, unit)`` tuples.
        entity_name: Entity name that selects the rule.

    Returns:
        ``"<value> <unit>"`` for the first candidate whose magnitude equals
        the chosen one, or ``""`` when there are no candidates.
    """
    if not extracted_values:
        return ""

    magnitudes = [pair[0] for pair in extracted_values]

    if entity_name in ('height', 'maximum_weight_recommendation'):
        chosen = max(magnitudes)
    elif entity_name == 'depth':
        chosen = min(magnitudes)
    elif entity_name == 'width':
        # Second entry from the top of the ordering; duplicates count separately.
        chosen = sorted(magnitudes)[-2] if len(magnitudes) > 1 else max(magnitudes)
    else:
        chosen = magnitudes[0]

    # Report the first candidate carrying the chosen magnitude.
    return next(
        (f"{number} {unit}" for number, unit in extracted_values if number == chosen),
        "",
    )

# Run extraction over every text fragment, then pick one value for the entity
def process_texts(texts, entity_name):
    """Extract candidates from all ``texts`` and select one for the entity.

    Args:
        texts: Iterable of strings; each is passed to ``extract_entity_values``.
        entity_name: Entity name forwarded to extraction and selection.

    Returns:
        Whatever ``select_entity_value`` returns for the pooled candidates,
        which are kept in the order the texts were given.
    """
    candidates = [
        pair
        for snippet in texts
        for pair in extract_entity_values(snippet, entity_name)
    ]
    return select_entity_value(candidates, entity_name)

In [6]:
# Predictor function integrating OCR and entity extraction
from io import BytesIO
import requests

def predictor(image_link, category_id, entity_name, timeout=30):
    """Download an image, OCR it and return the predicted entity value.

    Args:
        image_link: URL of the image to download.
        category_id: Accepted for interface compatibility; not used here.
        entity_name: Entity to extract, forwarded to ``process_texts``.
        timeout: Seconds allowed for the HTTP request, so one unresponsive
            host cannot stall a long batch run indefinitely.

    Returns:
        The string produced by ``process_texts`` (may be ``""``), or
        ``None`` when the image could not be downloaded, opened or read.
    """
    # Used only to identify the image in the failure message.
    image_name = os.path.basename(image_link)

    try:
        response = requests.get(image_link, timeout=timeout)
        # Turn HTTP error statuses into exceptions instead of trying to
        # decode an error page as an image.
        response.raise_for_status()

        image = Image.open(BytesIO(response.content))
        # NOTE(review): the PIL image is passed straight to EasyOCR, as in
        # the original code - confirm this works for non-JPEG images.
        results = reader.readtext(image, detail=0)
    except (requests.RequestException, OSError, ValueError) as exc:
        # Best effort per image: report the failure and let the caller
        # continue with the next row instead of aborting the batch.
        print(f"Image {image_name} could not be processed: {exc}")
        return None

    # With detail=0 the OCR result is used directly as the list of texts.
    return process_texts(results, entity_name)

In [None]:
# Main execution
if __name__ == "__main__":

    # Half-open row range [START_ROW, END_ROW) of test.csv handled by this
    # run. The output file name is derived from the same two numbers so the
    # slice and the name cannot drift apart (the slice end used to read
    # 850000 while the file was named ...-85000).
    START_ROW = 81764
    END_ROW = 85000

    dataset_dir = '/content/drive/MyDrive/template-new/dataset'
    output_filename = f'{dataset_dir}/t{START_ROW}-{END_ROW}.csv'

    # Load test CSV and keep only this run's rows
    test = pd.read_csv(f'{dataset_dir}/test.csv')
    test_subset = test.iloc[START_ROW:END_ROW]

    # Write headers to the CSV file (creates the file, replacing any old one)
    with open(output_filename, mode='w') as f:
        f.write("index,prediction\n")

    # Run predictions and append each result to the CSV file. The file is
    # reopened per row so every finished prediction is written out before
    # the next image is processed.
    for index, row in test_subset.iterrows():
        prediction = predictor(row['image_link'], row['group_id'], row['entity_name'])
        print("predicted index: ", index)
        # predictor() returns None on failure; store an empty prediction
        # rather than the literal text "None".
        if prediction is None:
            prediction = ""
        with open(output_filename, mode='a') as f:
            f.write(f"{row['index']},{prediction}\n")

predicted index:  81764
predicted index:  81765
predicted index:  81766
predicted index:  81767
predicted index:  81768
predicted index:  81769
predicted index:  81770
predicted index:  81771
predicted index:  81772
predicted index:  81773
predicted index:  81774
predicted index:  81775
predicted index:  81776
predicted index:  81777
predicted index:  81778
predicted index:  81779
predicted index:  81780
predicted index:  81781
predicted index:  81782
predicted index:  81783
predicted index:  81784
predicted index:  81785
predicted index:  81786
predicted index:  81787
predicted index:  81788
predicted index:  81789
predicted index:  81790
predicted index:  81791
predicted index:  81792
predicted index:  81793
predicted index:  81794
predicted index:  81795
predicted index:  81796
predicted index:  81797
predicted index:  81798
predicted index:  81799
predicted index:  81800
predicted index:  81801
predicted index:  81802
predicted index:  81803
predicted index:  81804
predicted index: