In [7]:
import os
import cv2
import pytesseract
import pandas as pd


# Location of the Tesseract executable used by pytesseract.
# Defaults to the standard Windows install path; set the TESSERACT_CMD
# environment variable to point at a different binary without editing
# the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)


In [8]:
# Define your dataset folder
dataset_folder = "C:\\Users\\LENOVO\\Desktop\\only labels"

# Extensions treated as images. Compared against the lowercased file name so
# that files such as "photo.JPG" or "scan.PNG" are not silently skipped.
image_extensions = ('.jpg', '.png', '.jpeg')

# Load image file paths. os.listdir() returns entries in arbitrary order, so
# the list is sorted to keep row order stable between runs.
image_files = sorted(
    os.path.join(dataset_folder, f)
    for f in os.listdir(dataset_folder)
    if f.lower().endswith(image_extensions)
)

# Create labels (all images are cosmetic-related = 1)
labels = [1] * len(image_files)

# Store image paths and labels in a DataFrame
data = pd.DataFrame({"image_path": image_files, "label": labels})

print(f"Number of images found: {len(image_files)}")



Number of images found: 377


In [9]:
def preprocess_image(image_path):
    """Load an image from disk and binarize it for OCR.

    Steps: read as grayscale, resize to a fixed 800x800 (the aspect ratio is
    not preserved), apply a 5x5 Gaussian blur, then Otsu thresholding.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded image as returned by ``cv2.threshold``.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    # cv2.imread does not raise on failure; it returns None. Without this
    # check the failure surfaces later as an opaque error inside cv2.resize.
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(
            f"Could not read image (missing file or unsupported format): {image_path}"
        )

    # Resize image for better OCR compatibility
    # NOTE(review): a fixed square size stretches non-square labels; confirm
    # this is intended.
    img = cv2.resize(img, (800, 800))

    # Smooth noise before thresholding
    img = cv2.GaussianBlur(img, (5, 5), 0)

    # Otsu picks the threshold automatically, so the 0 passed here is ignored;
    # the call returns (threshold_used, image).
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return binary


In [10]:
def extract_text_from_image(image_path):
    """Return the raw OCR text for the image at ``image_path``.

    The image is first passed through ``preprocess_image`` and the result is
    handed to Tesseract with the English language model.

    NOTE(review): a later cell defines another function with this same name
    that returns a set of tokens instead of a string; after that cell runs,
    this definition is no longer reachable under this name.
    """
    return pytesseract.image_to_string(preprocess_image(image_path), lang='eng')

# Apply OCR to extract text from all images
data['extracted_text'] = data['image_path'].apply(extract_text_from_image)

# Print the first few rows to verify
print(data.head())

# Remove rows whose OCR text is empty or whitespace-only.
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
has_text = data['extracted_text'].str.strip() != ""
data = data[has_text].copy()
if data.empty:
    raise ValueError("No valid text was extracted. Check your OCR or image quality.")


                                          image_path  label  \
0  C:\Users\LENOVO\Desktop\only labels\20241103_1...      1   
1  C:\Users\LENOVO\Desktop\only labels\20241103_1...      1   
2  C:\Users\LENOVO\Desktop\only labels\20241103_1...      1   
3  C:\Users\LENOVO\Desktop\only labels\202411203_...      1   
4  C:\Users\LENOVO\Desktop\only labels\202421103_...      1   

                                      extracted_text  
0  Ingredients\nAya Papaya (Caricg Papaya) Extrac...  
1  Ingredients\nAqua, Bee Honey (Apis mellifera) ...  
2  NGHEDIENTS: NATER (AQUA) GLYCERIN, STEARIC ACH...  
3  Ingredients\n\n% Srabery (FragariaL,) Extract ...  
4  nd\n\nNRDENTS. WATER (AQUA, STEARIC ACID, CLYC...  


In [20]:
# Keep only the text columns; errors="ignore" lets this cell be re-run after
# the columns have already been removed.
data = data.drop(["label", "image_path"], axis="columns", errors="ignore")


In [31]:
# Show the current frame.
# NOTE(review): the recorded output includes a `cleaned_text` column, but no
# cell visible in this notebook creates it; presumably it came from a cell
# that was deleted or run out of order. Confirm the notebook survives
# Restart Kernel -> Run All.
print(data)

                                        extracted_text  \
0    Ingredients\nAya Papaya (Caricg Papaya) Extrac...   
1    Ingredients\nAqua, Bee Honey (Apis mellifera) ...   
2    NGHEDIENTS: NATER (AQUA) GLYCERIN, STEARIC ACH...   
3    Ingredients\n\n% Srabery (FragariaL,) Extract ...   
4    nd\n\nNRDENTS. WATER (AQUA, STEARIC ACID, CLYC...   
..                                                 ...   
370  inh hngredionts Aqua, Hibiscus (Hibiscus 0 oe\...   
371                                            “yt\n\n   
372             EDITS\n\nMeta peta Cos rc,\nPons tus\n   
373  Wp yin ant Si ae\nHi aru Patt, Sin Ce\nef Ge C...   
374  Se lot"\n= Na hn ayy\n\neit?\nhaat (lt Th Bd n...   

                                          cleaned_text  
0    ingredients aya papaya caricg papaya extract s...  
1    ingredients aqua, bee honey apis mellifera ext...  
2    nghedients nater aqua glycerin, stearic ach mr...  
3    ingredients  srabery fragarial, extract soo eh...  
4    nd nrdents wa

In [37]:

import re



# Normalize text into a set of unique tokens
def preprocess_text(text):
    """Turn a string into a set of lowercase alphanumeric tokens.

    Every character other than ASCII letters, digits and whitespace is
    deleted (not replaced by a space), the remainder is lowercased, and the
    result is split on whitespace. Duplicates collapse because a set is
    returned.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return {token for token in alnum_only.lower().split()}

# Tokenize every row's cleaned text into a set of words.
# NOTE(review): relies on a `cleaned_text` column that no visible cell
# creates; confirm where it is produced.
data['cleaned_tokens'] = data['cleaned_text'].map(preprocess_text)

# Function to calculate similarity
def calculate_similarity(input_tokens, dataset_tokens):
    """Return the best fractional overlap between the input and any dataset entry.

    For each token set in ``dataset_tokens`` the number of tokens shared with
    ``input_tokens`` is counted; the largest count is divided by the number of
    input tokens.

    Args:
        input_tokens: Set of tokens extracted from the input.
        dataset_tokens: Iterable of token sets to compare against.

    Returns:
        A value between 0 and 1. Returns 0 when ``input_tokens`` is empty, and
        0.0 when ``dataset_tokens`` is empty.
    """
    if not input_tokens:  # Avoid dividing by zero below
        return 0
    # default=0 keeps an empty dataset from raising ValueError in max().
    best_overlap = max(
        (len(input_tokens.intersection(tokens)) for tokens in dataset_tokens),
        default=0,
    )
    return best_overlap / len(input_tokens)

# OCR an input image and tokenize the result
def extract_text_from_image(image_path):
    """Run Tesseract on ``image_path`` and return the text as a token set.

    The path is passed straight to ``pytesseract.image_to_string`` with no
    ``lang`` argument and no OpenCV preprocessing. The raw OCR text is printed
    and then converted with ``preprocess_text``.

    NOTE(review): this reuses the name of a function defined in an earlier
    cell, which returns the raw OCR string of a preprocessed image. Once this
    cell runs, that earlier definition is shadowed.
    """
    # Function-local import kept from the original cell; the module is also
    # imported in the notebook's first cell.
    import pytesseract
    raw_text = pytesseract.image_to_string(image_path)
    print("OCR Output:", raw_text)  # Debug: Check OCR output
    tokens = preprocess_text(raw_text)
    return tokens

# Determine if input is cosmetic
def is_cosmetic(image_path, data, threshold=0.5):
    """Decide whether the image's OCR text resembles the reference labels.

    Args:
        image_path: Path of the image to classify.
        data: Object whose ``'cleaned_tokens'`` entry is an iterable of token
            sets to compare against.
        threshold: Similarity that must be strictly exceeded for the image to
            count as cosmetic. Defaults to 0.5.

    Returns:
        True when the similarity from ``calculate_similarity`` is greater than
        ``threshold``; False otherwise, including when no tokens could be
        extracted from the image.
    """
    input_tokens = extract_text_from_image(image_path)
    if not input_tokens:  # Nothing to compare against the dataset
        print("No valid tokens found in the image text.")  # Debug message
        return False  # Default to non-cosmetic
    similarity = calculate_similarity(input_tokens, data['cleaned_tokens'])
    return similarity > threshold

# Try the classifier on a single image.
# NOTE(review): this file sits in the same folder the reference dataset was
# loaded from, so a high similarity is presumably expected here; confirm with
# an image from outside the dataset.
image_path = "C:\\Users\\LENOVO\\Desktop\\only labels\\20241103_12746228.png"
result = is_cosmetic(image_path, data)
if result:
    print("Cosmetic")
else:
    print("Other")


OCR Output: Ingredients
Aqua, Papaya (Carica Papaya) Extract, Sodium
Laureth Sulphate, Vitamin E , Coco Betaine,
Sodium Cocoyl Isethionate, Sodium Benzoate
EDTA, Frgarnce

Cosmetic
