In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the processed dataset
file_path = "/content/finalized_.csv"  # Ensure the correct path in Colab

# Rows without a text or a label cannot be tokenized or trained on, and a cell that
# pandas parsed as a number would be rejected by the tokenizer. Clean both up here so
# the failure does not surface later, deep inside `Dataset.map`.
df = (
    pd.read_csv(file_path)
    .dropna(subset=["text", "label"])
    .assign(text=lambda frame: frame["text"].astype(str))
)

# Check class distribution
print(df['label'].value_counts())

# Train-test split (80-20). Stratifying on the label keeps the class ratio printed
# above identical in both splits, so the evaluation report is not skewed by an
# unlucky draw on an imbalanced dataset.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Load tokenizer (BERT Base Uncased)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Convert texts to datasets (Hugging Face `datasets`)
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Batch mapping that holds the raw strings under the "text" key
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        per example.
    """
    # No padding here on purpose: padding every example to 128 tokens would make
    # the DataCollatorWithPadding handed to the Trainer a no-op. Leaving the
    # sequences at their natural length lets the collator pad each batch only to
    # its own longest member.
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Run the tokenizer over both splits, one batch at a time
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Expose only the model inputs and the label, returned as PyTorch tensors
_MODEL_COLUMNS = ("input_ids", "attention_mask", "label")
train_dataset.set_format(type="torch", columns=list(_MODEL_COLUMNS))
test_dataset.set_format(type="torch", columns=list(_MODEL_COLUMNS))

# Collator that the Trainer uses to assemble (and pad) each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Pretrained BERT encoder with a two-class classification head on top
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)


In [None]:
# Training arguments (Optimized for Speed)
# Mixed precision is only usable when a CUDA device is present; requesting it
# unconditionally makes the run fail on a CPU-only runtime (e.g. a Colab session
# without a GPU attached), so follow what the runtime actually offers.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=4,
    per_device_train_batch_size=16,  # Reduce batch size if OOM
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./bert_logs",
    logging_steps=10,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    fp16=use_fp16,  # Mixed precision for faster training when a GPU is available
    gradient_checkpointing=True,  # Reduce memory consumption
    report_to="none",  # Disable W&B logging (if not needed)
)

# Wire the model, the two splits, and the batching helpers into a Trainer
_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Trainer(**_trainer_kwargs)

# Run fine-tuning
trainer.train()

# Score the held-out split and keep the highest-scoring class for every row
predictions = trainer.predict(test_dataset)
test_logits = torch.tensor(predictions.predictions)
predicted_labels = test_logits.argmax(dim=1).numpy()

# Per-class precision / recall / F1 against the true test labels
report = classification_report(
    test_labels,
    predicted_labels,
    target_names=["Non-Cosmetic", "Cosmetic"],
)
print(report)

# Save the fine-tuned model and tokenizer.
# A single constant is used for saving and reloading: previously the model was
# written to "fine_tuned_bert" but read back from "fine_tuned_bert___", a directory
# this notebook never creates, so the reload failed on a fresh run.
FINE_TUNED_DIR = "fine_tuned_bert"
model.save_pretrained(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)
print("Fine-tuned BERT model and tokenizer saved successfully!")

# Load model for inference (round-trip check that the saved artifacts are usable)
model = BertForSequenceClassification.from_pretrained(FINE_TUNED_DIR)
tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_DIR)

In [5]:
from transformers import BertForSequenceClassification, AutoTokenizer
import torch
import pytesseract
from PIL import Image, ImageFile

import os

# Load the fine-tuned model and tokenizer.
# The directory can be overridden with the FINE_TUNED_BERT_DIR environment variable
# so the notebook also runs on machines other than the author's; the original
# location stays the default.
model_path = os.environ.get(
    "FINE_TUNED_BERT_DIR",
    r"C:\Users\LENOVO\Desktop\fine_tuned_bert___",
)
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Softmax probability a class must exceed to be assigned by classify_ingredient;
# anything below it for both classes is reported as "Uncertain".
THRESHOLD = 0.90

# Keywords that indicate ingredient listing starts (matched as lowercase substrings)
KEYWORDS = {"ingredient", "ingredients", "content", "component", "composition"}

def classify_ingredient(ingredient, threshold=None):
    """Classify a piece of text as Cosmetic, Non-Cosmetic or Uncertain.

    Args:
        ingredient: Text to classify (callers in this notebook pass a whole
            OCR line).
        threshold: Probability a class must exceed to be assigned. When
            omitted, the module-level ``THRESHOLD`` is used.

    Returns:
        A ``(label, probs)`` tuple. ``label`` is "Cosmetic", "Non-Cosmetic" or
        "Uncertain"; ``probs`` is a NumPy array of softmax probabilities
        ordered (Non-Cosmetic, Cosmetic).
    """
    if threshold is None:
        threshold = THRESHOLD

    # A single sequence needs no padding: padding it out to 128 tokens only
    # makes the forward pass process masked filler positions.
    inputs = tokenizer(ingredient, truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():  # Inference only, no gradients needed
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]

    # Assign a label only when one class is confident enough
    if probs[1] > threshold:
        return "Cosmetic", probs
    if probs[0] > threshold:
        return "Non-Cosmetic", probs
    return "Uncertain", probs

def classify_ingredients_from_image(image_path):
    """OCR an image and list the ingredients of lines classified as Cosmetic.

    Lines are ignored until one containing a word from ``KEYWORDS`` is seen;
    that header line itself is not classified. Every later non-empty line that
    contains a comma is passed to ``classify_ingredient``. If the line is
    labelled "Cosmetic" it is split on commas and each non-empty fragment is
    printed and collected.

    Note that the category and probabilities attached to each fragment are
    those of the whole OCR line, not of the individual fragment.

    Args:
        image_path: Path of the image to read.

    Returns:
        A list of ``(ingredient, category, probabilities)`` tuples; empty when
        nothing matched.
    """
    # Process-wide Pillow switch: accept truncated image files instead of raising.
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    # Grayscale copy for OCR. The context manager releases the file handle that
    # Image.open keeps on the source file.
    with Image.open(image_path) as source_image:
        image = source_image.convert("L")

    # --psm 6: treat the image as a single uniform block of text
    extracted_text = pytesseract.image_to_string(image, config="--psm 6")

    found_keyword = False  # Flips once the ingredient header line has been seen
    filtered_results = []

    for line in extracted_text.split("\n"):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # A header line switches classification on for everything below it
        if any(keyword in line.lower() for keyword in KEYWORDS):
            print(f"Keyword Line Detected: {line} (Automatically Classified as Cosmetic)\n")
            found_keyword = True
            continue

        # Only comma-separated lines below the header are ever reported, so do
        # not spend a model call on anything else.
        if not found_keyword or "," not in line:
            continue

        category, probabilities = classify_ingredient(line)
        if category != "Cosmetic":
            continue

        for fragment in line.split(","):
            ingredient = fragment.strip()
            if not ingredient:
                continue  # Trailing or doubled commas yield empty fragments

            filtered_results.append((ingredient, category, probabilities))

            # Display results
            print(f"Ingredient: {ingredient}")
            print(f"Predicted Category: {category}")
            print(f"Probabilities (Non-Cosmetic, Cosmetic): {probabilities}\n")

    if not filtered_results:
        print("❌ No cosmetic ingredients with ',' found.")

    return filtered_results


In [8]:
# Example usage: run the OCR + classification pipeline on a single image.
# NOTE(review): absolute, machine-specific path - point it at a local image before running.
image_path = r"C:\Users\LENOVO\Desktop\DSGP_\New folder\Untitled26.jpg"
# Left as the cell's last expression so the notebook also displays the returned list.
classify_ingredients_from_image(image_path)

Keyword Line Detected: INGREDIENTS- HAIR COLOUR 1 , (Automatically Classified as Cosmetic)

Ingredient: Aqua
Predicted Category: Cosmetic
Probabilities (Non-Cosmetic, Cosmetic): [5.7383673e-04 9.9942625e-01]

Ingredient: Hydrolyzed Keratin
Predicted Category: Cosmetic
Probabilities (Non-Cosmetic, Cosmetic): [5.7383673e-04 9.9942625e-01]

Ingredient: Cetyl Alcohol
Predicted Category: Cosmetic
Probabilities (Non-Cosmetic, Cosmetic): [5.7383673e-04 9.9942625e-01]

Ingredient: | °
Predicted Category: Cosmetic
Probabilities (Non-Cosmetic, Cosmetic): [5.7383673e-04 9.9942625e-01]

Ingredient: Glycol
Predicted Category: Cosmetic
Probabilities (Non-Cosmetic, Cosmetic): [0.00126571 0.9987343 ]

Ingredient: Dimethicone
Predicted Category: Cosmetic
Probabilities (Non-Cosmetic, Cosmetic): [0.00126571 0.9987343 ]

Ingredient: Ammonium 5
Predicted Category: Cosmetic
Probabilities (Non-Cosmetic, Cosmetic): [0.00126571 0.9987343 ]

Ingredient: m-aminophenol
Predicted Category: Cosmetic
Probabilities (

[('Aqua', 'Cosmetic', array([5.7383673e-04, 9.9942625e-01], dtype=float32)),
 ('Hydrolyzed Keratin',
  'Cosmetic',
  array([5.7383673e-04, 9.9942625e-01], dtype=float32)),
 ('Cetyl Alcohol',
  'Cosmetic',
  array([5.7383673e-04, 9.9942625e-01], dtype=float32)),
 ('| °', 'Cosmetic', array([5.7383673e-04, 9.9942625e-01], dtype=float32)),
 ('Glycol', 'Cosmetic', array([0.00126571, 0.9987343 ], dtype=float32)),
 ('Dimethicone', 'Cosmetic', array([0.00126571, 0.9987343 ], dtype=float32)),
 ('Ammonium 5', 'Cosmetic', array([0.00126571, 0.9987343 ], dtype=float32)),
 ('m-aminophenol',
  'Cosmetic',
  array([5.037346e-04, 9.994962e-01], dtype=float32)),
 ('Ascorbic acid',
  'Cosmetic',
  array([5.037346e-04, 9.994962e-01], dtype=float32)),
 ('Disodium}. §',
  'Cosmetic',
  array([5.037346e-04, 9.994962e-01], dtype=float32)),
 ('EDTA', 'Cosmetic', array([5.102015e-04, 9.994898e-01], dtype=float32)),
 ('2', 'Cosmetic', array([5.102015e-04, 9.994898e-01], dtype=float32)),
 ('4-Diaminophenoxyethan