In [20]:
# Show the GPU, driver and CUDA versions visible to this runtime.
!nvidia-smi

Mon Sep 16 09:48:55 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 561.09         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   63C    P8              6W /   75W |     485MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [21]:
# Install the Python dependencies into the kernel's environment.
# NOTE(review): only ultralytics is version-pinned; pint and pytesseract float to the latest release.
%pip install -q pint ultralytics==8.0.196 pytesseract

In [22]:
! apt install tesseract-ocr
! apt install libtesseract-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [23]:
import os
import re

import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import pint
from pytesseract import image_to_string
import ultralytics
import json

In [24]:
# Print the Ultralytics environment report (library, Python, torch and device details).
ultralytics.checks()

Ultralytics YOLOv8.0.196 🚀 Python-3.10.12 torch-2.3.0+cu121 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
Setup complete ✅ (12 CPUs, 7.6 GB RAM, 107.5/1006.9 GB disk)


In [25]:
# Shared objects used by the helper functions and by the final inference loop.

# Unit registry that turns cleaned OCR strings into quantities (see findLargest).
ureg = pint.UnitRegistry()

# Detector loaded from the local checkpoint (see predict).
model = ultralytics.YOLO('model/best.pt')

# Rows for which predictions are generated in the last cell.
df = pd.read_csv('dataset/sample_test.csv')

In [26]:
def load_image(image_link, size, timeout=30):
    """Download an image and centre it on a white ``size`` x ``size`` RGB canvas.

    Args:
        image_link: URL of the image to fetch.
        size: Edge length, in pixels, of the square canvas that is returned.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would hang the whole run.

    Returns:
        A new RGB ``PIL.Image`` of exactly ``size`` x ``size`` pixels with the
        downloaded picture pasted in the middle.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
    """
    response = requests.get(image_link, timeout=timeout)
    # Fail here on HTTP errors rather than handing an error page to PIL.
    response.raise_for_status()

    # The context manager releases the decoder/file handle once the pixels
    # have been copied onto the canvas.
    with Image.open(BytesIO(response.content)) as img:
        # thumbnail() shrinks in place so the picture fits inside size x size.
        img.thumbnail((size, size), Image.LANCZOS)
        new_img = Image.new("RGB", (size, size), "white")
        offset = ((new_img.width - img.width) // 2, (new_img.height - img.height) // 2)
        new_img.paste(img, offset)
    return new_img


def predict(img, conf, class_name):
    """Run the detector for one class and save a crop of every detection.

    Args:
        img: Image to run the model on.
        conf: Confidence threshold passed to the model.
        class_name: Detector class to keep; must be a name known to
            ``classToID``.

    Returns:
        ``(output_image, detections)``: the annotated image as an RGB
        ``PIL.Image`` and the detections decoded from ``results.tojson()``.

    Raises:
        ValueError: If ``class_name`` has no class id.

    Side effects:
        Deletes every file under ``tmp/`` and then writes the new crops there
        via ``save_crop``; callers read them back as
        ``tmp/<class_name>/cropped.jpg``, ``cropped2.jpg``, ...
    """
    class_id = classToID(class_name)
    if class_id is None:
        # Fail loudly instead of passing classes=[None] to the model.
        raise ValueError(f"Unknown detection class: {class_name!r}")

    results = model(img, conf=conf, verbose=False, classes=[class_id])[0]

    # plot() returns a BGR array; reverse the channel axis so PIL receives RGB.
    output_image = Image.fromarray(results.plot()[..., ::-1])
    detections = json.loads(results.tojson())

    # Remove crops left by the previous call so file numbering restarts.
    os.makedirs('tmp', exist_ok=True)
    for root, _dirs, files in os.walk('tmp'):
        for file in files:
            os.remove(os.path.join(root, file))

    results.save_crop(save_dir='tmp', file_name='cropped')

    return output_image, detections


def classToID(class_name):
    """Return the numeric detector class id for ``class_name``.

    The id is the position of the name in the ordered tuple below. Names that
    are not listed yield ``None``.
    """
    ordered_names = ('dimension', 'voltage', 'volume', 'wattage', 'weight')
    for class_id, known_name in enumerate(ordered_names):
        if class_name == known_name:
            return class_id
    return None


def entityToClass(class_name):
    """Map a dataset entity name onto the detector class that covers it.

    Entity names without an entry in the table are returned unchanged.
    """
    groups = (
        ('dimension', ['width', 'depth', 'height']),
        ('weight', ['item_weight', 'maximum_weight_recommendation']),
        ('volume', ['item_volume']),
    )
    for detector_class, entity_names in groups:
        if class_name in entity_names:
            return detector_class
    return class_name


def cleanText(data):
    """Normalise raw OCR strings before unit parsing.

    Each string is lower-cased, commas become dots, the right double quotation
    mark becomes "in", and every character outside the allowed set (letters,
    digits, "-" and ".") is dropped, which also removes all whitespace.

    Returns a new list with one cleaned string per input string.
    """
    def _normalise(raw):
        lowered = raw.lower()
        dotted = lowered.replace(",", ".")
        with_inches = dotted.replace("”", "in")
        return re.sub(r"[^A-Za-z-\d.”]" , '', with_inches)

    return [_normalise(entry) for entry in data]


def findLargest(data, unit):
    """Pick the largest quantity among ``data`` for the given entity.

    Each string is parsed with the shared ``ureg`` registry and converted to
    the reference unit of the entity so that values in different units can be
    compared. Strings that cannot be parsed or converted are skipped.

    Args:
        data: Iterable of cleaned OCR strings such as ``"10cm"``.
        unit: Entity name (a key of ``unit_map``), e.g. ``"width"``.

    Returns:
        ``"<value> <unit>"`` for the largest positive quantity, using the
        value and unit as they were parsed (not the converted ones), or ``""``
        when nothing usable was found or ``unit`` is not a known entity.
    """
    unit_map = {"width": "centimetre", "depth": "centimetre", "height": "centimetre", "item_weight": "kilogram", "maximum_weight_recommendation": "kilogram", "item_volume": "millilitre", "voltage": "volt", "wattage": "watt"}

    # Resolve the comparison unit once; an unknown entity can never match.
    target_unit = unit_map.get(unit)
    if target_unit is None:
        return ""

    converted_max_value = 0
    max_value = 0
    max_unit = ""

    for item in data:
        try:
            quantity = ureg.Quantity(item.lower())
            converted_magnitude = quantity.to(target_unit).magnitude
            if converted_magnitude > converted_max_value:
                converted_max_value = converted_magnitude
                max_value = quantity.magnitude
                max_unit = quantity.units
        except Exception:
            # Best effort: OCR noise is expected, so skip anything that does
            # not parse or convert. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt/SystemExit stop a long run.
            continue

    if max_value == 0:
        return ""
    # NOTE(review): str(units) is the registry's own spelling of the unit,
    # which may differ from the spellings used as values in unit_map -
    # confirm against the expected output format.
    return str(float(max_value)) + " " + str(max_unit)


def benchmark(n, seed):
    """Estimate the pipeline's F1 score on a random sample of the training set.

    For each sampled row the image is downloaded, the detector is run for the
    row's entity class, every saved crop is OCR'd, and the largest parsed
    quantity is compared with the row's ``entity_value``.

    Counting rules:
        * no detections at all  -> false negative
        * prediction == target  -> true positive
        * anything else         -> false positive

    Args:
        n: Number of rows to sample from ``dataset/train.csv``.
        seed: ``random_state`` for the sampling, so runs are repeatable.

    Returns:
        ``2*tp / (2*tp + fp + fn)``, or ``0.0`` when no row was scored.
    """
    samples = pd.read_csv('dataset/train.csv').sample(n, random_state=seed)

    tp, fp, fn = 0, 0, 0

    for _, row in samples.iterrows():
        class_name = entityToClass(row['entity_name'])

        img = load_image(row['image_link'], size = 1056)

        _, detections = predict(img, conf = 0.3, class_name = class_name)

        if not detections:
            fn += 1
            continue

        data = []
        # Crops are read back as cropped.jpg, cropped2.jpg, cropped3.jpg, ...
        for crop_no in range(1, len(detections) + 1):
            suffix = "" if crop_no == 1 else str(crop_no)
            file_dir = "tmp/" + class_name + f"/cropped{suffix}.jpg"

            # Close each crop after OCR so long runs do not leak file handles.
            with Image.open(file_dir) as cropped_img:
                data.append(image_to_string(cropped_img))

        text = findLargest(cleanText(data), row['entity_name'])

        if text == row['entity_value']:
            tp += 1
        else:
            fp += 1

    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # Nothing was scored (e.g. n == 0); avoid ZeroDivisionError.
        return 0.0
    return 2 * tp / denominator


In [27]:
# F1 score on a 1000-row sample of the training set, drawn with seed 123.
print(benchmark(1000, 123))

0.20948970456580127


In [28]:
# Run the detect -> crop -> OCR -> unit-selection pipeline over every row of `df`
# and write one prediction per row to test_out.csv.

# Rows are collected as plain dicts and turned into a DataFrame once at the end,
# instead of growing the frame one `.loc` assignment at a time.
records = []

# NOTE(review): `index` is the row label yielded by iterrows(), not a column read
# from the CSV - confirm this is the identifier the output file is expected to carry.
for index, row in df.iterrows():
    class_name = entityToClass(row['entity_name'])

    img = load_image(row['image_link'], size = 1056)

    img, detections = predict(img, conf = 0.3, class_name = class_name)

    data = []
    # Crops are read back as cropped.jpg, cropped2.jpg, cropped3.jpg, ...
    for crop_no in range(1, len(detections) + 1):
        suffix = "" if crop_no == 1 else str(crop_no)
        file_dir = "tmp/" + class_name + f"/cropped{suffix}.jpg"

        # Close each crop after OCR so long runs do not leak file handles.
        with Image.open(file_dir) as cropped_img:
            data.append(image_to_string(cropped_img))

    # No detections means an empty prediction for this row.
    text = findLargest(cleanText(data), row['entity_name']) if data else ""

    records.append({'index': index, 'prediction': text})

output_df = pd.DataFrame(records, columns=['index', 'prediction'])

output_df.to_csv('test_out.csv', index=False)