In [183]:
import logging
import re

import cv2
import numpy as np
import pytesseract
import torch
from PIL import Image, ImageFile
from transformers import BertForSequenceClassification, AutoTokenizer

# Checkpoint directories of the two fine-tuned BERT classifiers
model_path_1 = r"C:\Users\LENOVO\Desktop\fine_tuned_bert_____"
model_path_2 = r"C:\Users\LENOVO\Desktop\fine_tuned_bert___"


def _load_classifier(checkpoint_dir):
    """Load the sequence classifier and its tokenizer from one checkpoint directory.

    Returns a ``(model, tokenizer)`` pair, loaded in that order.
    """
    classifier = BertForSequenceClassification.from_pretrained(checkpoint_dir)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    return classifier, tokenizer


model_1, tokenizer_1 = _load_classifier(model_path_1)
model_2, tokenizer_2 = _load_classifier(model_path_2)

# Substrings whose first occurrence marks the start of the ingredient list in the OCR text
KEYWORDS = [
    "key ingredients",
    "key ingredient",
    "ingredient",
    "ingredients",
    "content",
    "component",
    "composition",
]

# Function to preprocess image for OCR
def preprocess_image_ocr(image_path, blur_ksize=3, clip_limit=2.0, tile_grid_size=(100, 100)):
    """Load an image from disk and prepare it for OCR.

    The image is converted to grayscale, median-blurred, and contrast-enhanced
    with CLAHE.

    Args:
        image_path: Path of the image file to read.
        blur_ksize: Aperture size passed to ``cv2.medianBlur``.
        clip_limit: ``clipLimit`` passed to ``cv2.createCLAHE``.
        tile_grid_size: ``tileGridSize`` passed to ``cv2.createCLAHE``.

    Returns:
        The enhanced single-channel image produced by ``clahe.apply``.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    image = cv2.imread(image_path)
    # cv2.imread does not raise on a missing/undecodable file; it returns None,
    # which would otherwise surface as an opaque assertion inside cvtColor.
    if image is None:
        raise ValueError(
            f"Could not read image {image_path!r} (missing file or unsupported format)"
        )

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    denoised = cv2.medianBlur(gray, blur_ksize)

    # Apply CLAHE for better contrast enhancement
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    enhanced = clahe.apply(denoised)

    return enhanced

# Function to clean extracted text
import re

def clean_extracted_text(text):
    """Turn raw OCR text into a list of cleaned, non-empty lines.

    For every newline-separated line:
      * characters other than letters, digits, whitespace, ``,``, ``-``, ``%``
        and ``/`` are dropped;
      * runs of whitespace collapse to a single space;
      * one leading and then one trailing word of at most two alphabetic
        characters is removed;
      * the line is discarded if nothing is left.

    Args:
        text: Text as returned by the OCR engine.

    Returns:
        List of cleaned lines, in their original order.
    """

    def _is_short_alpha(token):
        # One- or two-letter purely alphabetic token at a line edge.
        return len(token) <= 2 and token.isalpha()

    kept = []
    for raw in text.split("\n"):
        # Keep only the characters that can belong to an ingredient name
        filtered = re.sub(r"[^a-zA-Z0-9\s,\-%/]", "", raw)
        tokens = re.sub(r"\s+", " ", filtered).strip().split()

        # Trim the left edge first, then the right edge of whatever remains
        if tokens and _is_short_alpha(tokens[0]):
            tokens = tokens[1:]
        if tokens and _is_short_alpha(tokens[-1]):
            tokens = tokens[:-1]

        if tokens:
            kept.append(" ".join(tokens))

    return kept

# Function to classify an ingredient using both models
def is_cosmetic_ingredient(ingredient):
    """Classify a text line with both fine-tuned models and combine their votes.

    Each model's logits are turned into softmax probabilities; the two
    probability arrays are averaged and the arg-max class is taken.

    Args:
        ingredient: Text to classify.

    Returns:
        Truthy when the averaged prediction is class 1, falsy otherwise.
    """

    def _class_probs(tokenizer, model):
        # Tokenize, run one model without gradient tracking, and return float32 probabilities.
        encoded = tokenizer(ingredient, truncation=True, padding=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            result = model(**encoded)
        return torch.nn.functional.softmax(result.logits, dim=-1).numpy().astype(np.float32)

    first = _class_probs(tokenizer_1, model_1)
    second = _class_probs(tokenizer_2, model_2)

    # Simple two-model ensemble: mean of the class probabilities
    blended = (first + second) / 2.0

    # Label of the first (only) row in the batch
    winner = np.argmax(blended, axis=1)[0]
    return winner == 1

# Function to extract and filter cosmetic ingredients
def extract_cosmetic_ingredients(image_path):
    """Run OCR on an image and return the lines classified as cosmetic ingredients.

    Steps: preprocess the image, run Tesseract, clean the text, keep the lines
    from the first one containing any entry of ``KEYWORDS`` onwards (all lines
    when no keyword is present), then keep only the lines for which
    ``is_cosmetic_ingredient`` is truthy.

    This is a best-effort call: any exception raised along the way is logged
    with its traceback and an empty list is returned, so a failure is visible
    in the log instead of being indistinguishable from "nothing found".

    Args:
        image_path: Path of the image file to process.

    Returns:
        List of cleaned OCR lines classified as cosmetic ingredients; ``[]``
        when none are found or when processing fails.
    """
    # Module-wide Pillow setting, kept from the original for backward compatibility.
    # NOTE(review): the image is read through cv2 in this pipeline, so this flag
    # probably has no effect here - confirm before removing.
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    try:
        # Preprocess image for OCR
        processed_image = preprocess_image_ocr(image_path)

        # Perform OCR
        custom_config = r'--oem 3 --psm 6'  # Best for dense text
        extracted_text = pytesseract.image_to_string(processed_image, config=custom_config)

        # Clean extracted text
        ingredient_lines = clean_extracted_text(extracted_text)
        if not ingredient_lines:
            return []

        # Index of the first line mentioning a keyword; 0 (i.e. every line) when none does
        start_index = 0
        for i, line in enumerate(ingredient_lines):
            lowered = line.lower()  # lower-case once per line, not once per keyword
            if any(keyword.lower() in lowered for keyword in KEYWORDS):
                start_index = i
                break

        # The keyword line itself is kept, together with everything below it
        relevant_lines = ingredient_lines[start_index:]

        # Filter only cosmetic ingredients from the relevant lines
        return [line for line in relevant_lines if is_cosmetic_ingredient(line)]

    except Exception:
        # Stay best-effort for callers, but leave a trace of what went wrong.
        logging.getLogger(__name__).warning(
            "Ingredient extraction failed for %r", image_path, exc_info=True
        )
        return []

In [202]:
# Run the full pipeline on a sample label photo
image_path = r"C:\Users\LENOVO\Desktop\DSGP_\New folder\Untitled32.jpg"
cosmetic_ingredients = extract_cosmetic_ingredients(image_path)

# Report the result, one extracted line per output row
if not cosmetic_ingredients:
    print("No cosmetic ingredients found.")
else:
    print("Extracted Cosmetic Ingredients:")
    print(*cosmetic_ingredients, sep="\n")


Extracted Cosmetic Ingredients:
Dimethicone, Dimethicone and Cyclopentasiloxane, ae
Cydopentasiloxane, Isopropy Myristate, Phenyl
- Trimethicone, Argania Spinosa Kernal Oil, Tocopheryl
Acetate, Octylmethoxycinamate, Pertume


In [203]:
# Display the list returned by extract_cosmetic_ingredients (one entry per OCR line)
cosmetic_ingredients

['Dimethicone, Dimethicone and Cyclopentasiloxane, ae',
 'Cydopentasiloxane, Isopropy Myristate, Phenyl',
 '- Trimethicone, Argania Spinosa Kernal Oil, Tocopheryl',
 'Acetate, Octylmethoxycinamate, Pertume']

In [204]:
# Re-join wrapped OCR lines: lines accumulate into one chunk until a line
# ending in a comma closes it; a trailing unfinished chunk is kept as well.
processed_ingredients = []
pending = []

for line in cosmetic_ingredients:
    line = line.strip()
    pending.append(line)
    if line.endswith(","):
        processed_ingredients.append(" ".join(pending).strip())
        pending = []

if pending:
    processed_ingredients.append(" ".join(pending).strip())

# Break each chunk at its commas and drop the empty pieces
cleaned_ingredients = [
    piece.strip()
    for chunk in processed_ingredients
    for piece in chunk.split(',')
    if piece.strip()
]

# Single comma-separated line with every ingredient
print(", ".join(cleaned_ingredients))

Dimethicone, Dimethicone and Cyclopentasiloxane, ae Cydopentasiloxane, Isopropy Myristate, Phenyl - Trimethicone, Argania Spinosa Kernal Oil, Tocopheryl Acetate, Octylmethoxycinamate, Pertume


In [205]:
# One ingredient per output row (prints nothing for an empty list)
if cleaned_ingredients:
    print(*cleaned_ingredients, sep="\n")

Dimethicone
Dimethicone and Cyclopentasiloxane
ae Cydopentasiloxane
Isopropy Myristate
Phenyl - Trimethicone
Argania Spinosa Kernal Oil
Tocopheryl Acetate
Octylmethoxycinamate
Pertume
