In [3]:
import os
import cv2
import re
import easyocr
import spacy
from ultralytics import YOLO
from happytransformer import HappyTextToText, TTSettings

In [4]:
import easyocr
reader = easyocr.Reader(['en'], gpu=True, verbose=True)

In [11]:
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

In [12]:
# Load NLP models
nlp = spacy.load("en_core_web_sm")
happy_tt = HappyTextToText("T5", "prithivida/grammar_error_correcter_v1")

# Unwanted words/phrases to remove
EXCLUDED_WORDS = {"ingredients", "may contain", "flavour", "natural", "artificial", "colour"}

01/30/2025 19:23:29 - INFO - happytransformer.happy_transformer -   Using device: cuda:0


In [13]:
def correct_text(text):
    """
    Uses a grammar correction model (T5) to fix OCR errors and improve readability.
    """
    args = TTSettings(num_beams=5, min_length=1)
    corrected = happy_tt.generate_text(text, args)
    return corrected.text.strip()

In [14]:
def clean_and_split_text(text):
    """
    Uses NLP to tokenize text properly and extract only meaningful words.
    """
    doc = nlp(text)
    words = [token.text.lower() for token in doc if token.is_alpha]  # Keeps only valid words
    return words

In [15]:
def extract_ingredients_from_image(image, detections):
    """
    Extracts clean ingredient lists from detected areas using OCR.
    """
    extracted_texts = []

    for detection in detections:
        x1, y1, x2, y2 = map(int, detection['bbox'])

        # Ensure bounding box is within image bounds
        h, w, _ = image.shape
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(w, x2), min(h, y2)

        cropped_region = image[y1:y2, x1:x2]

        # Skip invalid regions
        if cropped_region.size == 0:
            continue

        results = reader.readtext(cropped_region)

        for result in results:
            text = result[1]  # Extract recognized text
            confidence = result[2]

            # Filter out low-confidence detections
            if confidence < 0.5:
                continue

            # Remove standalone numbers (e.g., INS 621)
            text = re.sub(r'\b\d+\b', '', text)

            # Correct OCR mistakes using NLP
            corrected_text = correct_text(text)

            # Clean and split text into structured ingredients
            words = clean_and_split_text(corrected_text)

            # Remove unwanted words (e.g., "ingredients", "flavour")
            words = [word for word in words if word not in EXCLUDED_WORDS]

            extracted_texts.extend(words)

    return list(set(extracted_texts))  # Remove duplicates

In [18]:
def process_images_with_yolo(model, image_dir, output_size=(800, 600)):
    """
    Process all images in a directory, detect ingredient lists, and extract text.
    """
    results = {}

    for file_name in os.listdir(image_dir):
        file_path = os.path.join(image_dir, file_name)
        image = cv2.imread(file_path)

        if image is None:
            continue

        predictions = model(file_path)
        detections = []

        for pred in predictions[0].boxes:
            bbox = pred.xyxy[0].tolist()
            confidence = float(pred.conf[0])
            cls = int(pred.cls[0])
            label = model.names[cls]

            if label == 'ingredients' and confidence > 0.5:
                detections.append({'bbox': bbox, 'confidence': confidence, 'class': label})

        if detections:
            extracted_texts = extract_ingredients_from_image(image, detections)
            results[file_name] = extracted_texts

            # Draw bounding boxes
            for detection in detections:
                x1, y1, x2, y2 = map(int, detection['bbox'])
                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(image, detection['class'], (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            # Resize image
            resized_image = cv2.resize(image, output_size)
            output_path = os.path.join(image_dir, f"processed_{file_name}")
            cv2.imwrite(output_path, resized_image)

    return results

In [22]:
if __name__ == "__main__":
    # Load the trained YOLO model
    model = YOLO(r"F:\University\2_Year_02\2_Year_02_Sem1\0_Data_Science\Component_1_Nelishka\Yolo_11_x\runs\detect\train\weights\best.pt")

    # Directory containing images to be checked
    image_dir = r"C:\Users\nelis\Desktop\labels"

    # Process images and extract ingredient lists
    extracted_ingredients = process_images_with_yolo(model, image_dir)

    # Print structured ingredients
    for image_name, ingredients in extracted_ingredients.items():
        print(f"Image: {image_name}")
        print(ingredients)  # Outputs like ['flour', 'eggs', 'milk', 'butter']


image 1/1 C:\Users\nelis\Desktop\labels\in1057.jpg: 608x640 1 ingredients, 100.3ms
Speed: 9.8ms preprocess, 100.3ms inference, 5.4ms postprocess per image at shape (1, 3, 608, 640)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Image: in1057.jpg
['confectionisser', 'freshisser', 'carbox', 'crumbissers', 'sugar', 'gu', 'acid', 'vegetable', 'milkisser', 'milk', 'isser', 'coffee', 'diglycerides', 'dig', 'cellulose', 'fat', 'solid', 'selective', 'diglyceridesisser', 'sodiumisserrate', 'willingness', 'confection', 'carageisseran', 'per', 'crumbs', 'f', 'guar', 'vegetableisser', 'concentrate', 'fresh', 'solidisser', 'crumb', 'meth']
