## **ENTITY VALUE EXTRACTION**

### 0. SETTING UP

In [8]:
import os
import cv2

In [9]:
# DATABASE LOCATION
dataset_path = "data/"

# CSV splits stored directly under the dataset root (which already ends in "/").
train_path = f"{dataset_path}train.csv"
test_path = f"{dataset_path}test.csv"
sample_test_path = f"{dataset_path}sample_test.csv"

In [10]:
# test if paths exist
import os

# Report each expected CSV that is not present on disk.
for path in (train_path, test_path, sample_test_path):
    if os.path.exists(path):
        continue
    print(f"Missing CSV File: {path}")

### 1. HELPER FUNCTIONS

In [67]:
def display_image(image):
    """Show `image` with matplotlib in a 10x5 figure with the axes hidden."""
    figure_size = (10, 5)
    plt.figure(figsize=figure_size)
    plt.imshow(image)
    plt.axis('off')
    plt.show()

In [87]:
def find_all_indices(s, substring):
    """Return the start offset of every occurrence of `substring` in `s`.

    `substring` is matched literally (it is regex-escaped first) and the
    scan is left to right over non-overlapping matches, so "aa" in "aaaa"
    yields [0, 2].
    """
    literal = re.escape(substring)
    return [hit.start() for hit in re.finditer(literal, s)]

### 2. TRAINING

In [12]:
import re
from PIL import Image
import matplotlib.pyplot as plt

from paddleocr import PaddleOCR, draw_ocr

from constants import unit_variations, entity_unit_map

In [124]:
# Sample used while developing the pipeline: the image file, the entity
# whose units `extract` looks up by default, and the font file path.
image_entity = "item_weight"
font_path = "fonts/latin.ttf"
image_path = f"{dataset_path}special_images/image_2.jpg"

#### (i) PREPROCESSING

In [141]:
image = cv2.imread(image_path)
# cv2.imread reports a missing or unreadable file by returning None rather
# than raising, so fail here instead of later inside cvtColor or the OCR call.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

In [142]:
def preprocess_image(image):
    """Prepare an OpenCV image for the OCR stage.

    The only step at the moment is a BGR -> RGB colour conversion.

    Args:
        image: Image array in BGR channel order (presumably the output of
            cv2.imread -- confirm against the caller).

    Returns:
        The converted array produced by cv2.cvtColor.
    """
    # Placeholder: additional preprocessing steps belong here.
    pre_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return pre_image

In [143]:
# Rebind `image` to the array returned by preprocess_image; re-running this
# cell without reloading the image applies the conversion a second time.
image = preprocess_image(image)

In [None]:
print("Preprocessed Image")
# `pre_image` exists only inside preprocess_image; the preprocessed result
# lives in the notebook-level `image`.
display_image(image)

#### (ii) TEXT RECOGNITION

In [None]:
# Shared PaddleOCR engine for the cells below, built with lang='en' and
# use_angle_cls=True (presumably enables the text-angle classifier -- see
# the PaddleOCR docs).
ocr = PaddleOCR(use_angle_cls=True, lang='en')

In [145]:
def get_ocr(image_array):
    """Run the shared `ocr` engine on `image_array` and return its raw result.

    Args:
        image_array: Image to recognise; passed straight to `ocr.ocr`.

    Returns:
        Whatever `ocr.ocr` returns for that image.
    """
    # Use the argument; the previous body read an undefined global instead.
    return ocr.ocr(image_array)

In [146]:
def show_ocr(result):
    """Draw the OCR detections in `result` over the notebook-level `image`.

    Args:
        result: OCR output whose first element is a list of
            (box, (text, score)) entries, or None when nothing was detected.
    """
    # PaddleOCR reports an image with no detected text as None; treat it as
    # an empty detection list instead of failing on iteration.
    detections = result[0] or []

    boxes = [item[0] for item in detections]
    texts = [item[1][0] for item in detections]
    scores = [item[1][1] for item in detections]

    im_show = draw_ocr(image, boxes, texts, scores, font_path="fonts/latin.ttf")
    display_image(im_show)

In [None]:
# Recognise text in the preprocessed image. `pre_image` is local to
# preprocess_image and undefined here, so use the notebook-level `image`.
result = ocr.ocr(image)

In [None]:
# Visual check: pass the OCR result to show_ocr for display.
print("OCR Output:")
show_ocr(result)

#### (iii) POST PROCESSING

In [134]:
def _trailing_numbers(prefix):
    """Parse the numeric run that ends `prefix`, ignoring spaces.

    Scans backwards from the end of `prefix` while the characters are
    numeric, '.', ',' or '-'. A ',' is treated as a decimal point and a '-'
    starts a new value, so "10-20" yields two numbers.

    Returns:
        The parsed values as floats in ascending order. Tokens that are
        empty, hold only dots, or are not valid floats are dropped.
    """
    compact = prefix.replace(' ', '')
    tokens = [""]

    for char in reversed(compact):
        if char.isnumeric():
            tokens[-1] = char + tokens[-1]
        elif char in ('.', ','):
            tokens[-1] = '.' + tokens[-1]
        elif char == '-':
            tokens.append("")
        else:
            # First character that cannot belong to a number ends the run.
            break

    values = []
    for token in tokens:
        if not token.replace('.', ''):
            continue
        try:
            values.append(float(token))
        except ValueError:
            # e.g. "1.000.5" (from "1,000.5") or a non-decimal numeric
            # character; skip the token rather than abort the extraction.
            continue

    return sorted(values)


def extract(result, entity_name=None):
    """Collect "<number> <unit>" measurements from OCR output.

    For every recognised text line, each spelling of each unit allowed for
    the entity is searched for, and the number (or '-'-separated pair of
    numbers) written immediately before it is parsed.

    Args:
        result: OCR output; `result[0]` is a list of (location, line)
            pairs where `line[0]` is the recognised text, or None when no
            text was detected.
        entity_name: Key into `entity_unit_map`. Defaults to the
            notebook-level `image_entity`.

    Returns:
        A set of strings: "<value> <unit>" for a single number, or
        "[<low>, <high>] <unit>" built from the two smallest values when
        several were found.
    """
    if entity_name is None:
        entity_name = image_entity

    extracted_measurements = set()

    # PaddleOCR reports an image with no detected text as None.
    for _location, line in (result[0] or []):
        text = line[0]

        for unit in entity_unit_map[entity_name]:
            for rep in unit_variations[unit]:
                for index in find_all_indices(text, rep):
                    # A unit at the very start has no number before it.
                    if index <= 0:
                        continue

                    numbers = _trailing_numbers(text[:index])
                    if not numbers:
                        continue

                    if len(numbers) > 1:
                        measurement = f"[{numbers[0]}, {numbers[1]}] {unit}"
                    else:
                        measurement = f"{numbers[0]} {unit}"
                    extracted_measurements.add(measurement)

    return extracted_measurements

### 3. VALIDATION

In [5]:
import pandas as pd

In [12]:
from IPython.display import clear_output

In [6]:
# Inputs for the validation pass. `image_folder` is not read by any cell
# visible in this section (presumably the download target -- confirm).
image_folder = "data/train_images"
# CSV loaded into the `validation` DataFrame below.
validation_path = "data/downloaded_train.csv"

In [7]:
# Load the validation table. Judging by the printed row in the output below,
# its columns are image_link, group_id, entity_name, entity_value, downloaded.
validation = pd.read_csv(validation_path)

In [13]:
# Walk the validation rows and print each one whose `downloaded` flag is
# falsy. clear_output(wait=True) defers clearing until the next output
# arrives, so the last printed row stays visible.
for index, row in validation.iterrows():
    if row["downloaded"]:
        continue

    print(row)

    # Placeholder: the prediction step goes here.

    clear_output(wait = True)

image_link      https://m.media-amazon.com/images/I/DzP2RMRQO0...
group_id                                                   898898
entity_name                                           item_weight
entity_value                                       100.0 kilogram
downloaded                                                  False
Name: 7754, dtype: object
