In [None]:
# @title 1A. Install Required Libraries
!pip install -q ultralytics roboflow pandas numpy rapidfuzz sentence-transformers google-generativeai Pillow unidecode
print("✅ All libraries installed successfully.")
print("✅ YOLOv8 and Roboflow libraries installed.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.0/1.1 MB[0m [31m32.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# @title 2. Import Libraries and Modules
import os
import re
import pandas as pd
import numpy as np
from unidecode import unidecode
from google.colab import files, userdata, drive # <-- Added 'drive'

# Gemini and Image Processing
from PIL import Image
import google.generativeai as genai

# Matching and Embeddings
from rapidfuzz import process, fuzz
from sentence_transformers import SentenceTransformer, util

# Logo Detection
from ultralytics import YOLO # <-- ADD THIS LINE

import torch

print("✅ All modules imported.")

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
✅ All modules imported.


In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive (cached knowledge base, logo-detector weights).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# @title 3. Configure API Key and Mount Drive
# This fetches the key you stored in Colab's "Secrets" manager and passes it to
# the google-generativeai client. Failures are reported, not raised, so the rest
# of the notebook can still be defined.
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    if not GEMINI_API_KEY:
        # NOTE(review): configure() is not expected to validate the key, so an
        # empty secret would otherwise only fail at the first API call.
        raise ValueError("the 'GEMINI_API_KEY' secret is empty")
    genai.configure(api_key=GEMINI_API_KEY)
    print("✅ Gemini API configured successfully.")
except Exception as e:
    print("❌ Could not configure Gemini API. Please ensure you have set the 'GEMINI_API_KEY' in the '🔑' (Secrets) tab on the left.")
    # Show the underlying cause instead of discarding it.
    print(f"   Details: {type(e).__name__}: {e}")


✅ Gemini API configured successfully.


In [None]:
# @title 5. Define NEW Knowledge Base (KB) Preparation
# This new function is simplified for your phrase-based CSV.

# Matches one parenthesised group "(...)" that contains no ")" inside it.
PARENS_RE = re.compile(r"\([^)]*\)")
# Matches one square-bracketed group "[...]" that contains no "]" inside it.
BRACKETS_RE = re.compile(r"\[[^\]]*\]")
# Matches any run of one or more whitespace characters.
EXTRA_SPACES_RE = re.compile(r"\s+")

def normalize_status(x: str) -> str:
    """Map a free-form label onto 'halal', 'haram', 'mushbooh' or 'unknown'.

    The label is stripped and lower-cased before lookup. Non-string input and
    labels that match no known synonym both yield 'unknown'.
    """
    if not isinstance(x, str):
        return "unknown"

    synonyms_by_status = {
        "halal": ("halal", "h", "permitted", "allowed", "lawful"),
        "haram": ("haram", "forbidden", "prohibited", "not halal", "non-halal", "not permissible"),
        "mushbooh": ("mushbooh", "doubtful", "unknown", "uncertain", "vary", "depends"),
    }
    label = x.strip().lower()
    for canonical, synonyms in synonyms_by_status.items():
        if label in synonyms:
            return canonical
    return "unknown"

def normalize_text_for_matching(s: str) -> str:
    """Canonicalise text so KB phrases and OCR output are compared on equal terms.

    Steps, in order: transliterate with ``unidecode`` and lower-case, blank out
    "(...)" and "[...]" groups, turn "*" and the listed punctuation into
    spaces, spell "&" as " and ", then collapse whitespace runs and trim.
    Non-string input yields an empty string.
    """
    if not isinstance(s, str):
        return ""

    text = unidecode(s).lower()
    for group_re in (PARENS_RE, BRACKETS_RE):
        text = group_re.sub(" ", text)
    text = re.sub(r"[®™.,:;!?]", " ", text.replace("*", " "))
    text = text.replace("&", " and ")
    collapsed = EXTRA_SPACES_RE.sub(" ", text)
    return collapsed.strip()

def prepare_phrase_kb(csv_path: str):
    """Build the phrase Knowledge Base DataFrame from a CSV file.

    The text column is the one named 'text' (case-insensitive, surrounding
    spaces ignored), falling back to the first column. The label column is the
    one named 'label', falling back to the second column.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A DataFrame with columns 'original_text' (raw text), 'status' (label
        passed through ``normalize_status``) and 'norm_text' (text passed
        through ``normalize_text_for_matching``). Rows whose normalized text is
        missing or empty are removed, and the index is renumbered 0..n-1 so
        that row labels agree with row positions.
    """
    df = pd.read_csv(csv_path)
    # Dynamically find 'text' and 'label' columns
    cols = {c.lower().strip(): c for c in df.columns}
    text_col = cols.get('text') or list(df.columns)[0]
    label_col = cols.get('label') or list(df.columns)[1]

    kb_df = pd.DataFrame({
        'original_text': df[text_col],
        'status': df[label_col].apply(normalize_status),
        # Normalized version used for semantic matching
        'norm_text': df[text_col].apply(normalize_text_for_matching),
    })

    # Keep only rows that still have text after normalization. No in-place
    # mutation; the index is reset because downstream code pairs KB rows with
    # embedding rows by position.
    has_text = kb_df['norm_text'].notna() & (kb_df['norm_text'] != '')
    return kb_df[has_text].reset_index(drop=True)

print("✅ NEW phrase-based KB preparation functions defined.")

✅ NEW phrase-based KB preparation functions defined.


In [None]:
# @title 6. Build/Load Knowledge Base and Embeddings

# --- 1. Define Save Paths ---
# We'll save the processed DataFrame and the computed Tensors
KB_SAVE_DIR = "/content/drive/MyDrive/HalalKB"
DF_PATH = os.path.join(KB_SAVE_DIR, "kb_dataframe.pkl")
EMB_PATH = os.path.join(KB_SAVE_DIR, "kb_embeddings.pt")

# --- 2. Load the Embedding Model (Always needed for queries later) ---
print("\nLoading sentence-transformer model...")
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL_NAME)
print("✅ Model loaded.")

# --- 3. Check if pre-computed files exist ---
if os.path.exists(DF_PATH) and os.path.exists(EMB_PATH):
    print(f"\nFound pre-computed files in {KB_SAVE_DIR}.")
    print("Loading KB DataFrame and Embeddings from Google Drive...")

    # Load the files directly from your Drive
    # NOTE(security): unpickling can execute arbitrary code; only load a
    # kb_dataframe.pkl that this notebook wrote itself.
    kb_df = pd.read_pickle(DF_PATH)
    # map_location places the cached tensor on the embedder's device, so a cache
    # written on a GPU runtime still loads on a CPU runtime (and vice versa) and
    # query embeddings end up on the same device as the KB embeddings.
    KB_PHRASE_EMB = torch.load(EMB_PATH, map_location=embedder.device)

    # Rows are paired with embeddings by position, so the two caches must agree.
    if len(kb_df) != KB_PHRASE_EMB.shape[0]:
        raise ValueError(
            f"Cached KB has {len(kb_df)} rows but the cached embeddings have "
            f"{KB_PHRASE_EMB.shape[0]}. Delete the files in {KB_SAVE_DIR} and re-run this cell."
        )

    print(f"✅ KB and Embeddings loaded. Total phrases: {len(kb_df)}")

else:
    print(f"\nNo pre-computed files found. Building from scratch...")

    # CSV_PATH must be set by an earlier cell; fail with an actionable message
    # instead of a bare NameError.
    if 'CSV_PATH' not in globals():
        raise NameError("CSV_PATH is not defined. Set CSV_PATH to the knowledge-base CSV file before running this cell.")

    # --- 4a. Run Original KB Preparation ---
    print("Building Phrase-based Knowledge Base from CSV...")
    kb_df = prepare_phrase_kb(CSV_PATH) # This function is from Cell 5
    print(f"✅ KB built successfully. Total phrases: {len(kb_df)}")
    print("KB Head:")
    print(kb_df.head())

    # Get the list of normalized phrases to be embedded
    KB_PHRASES = kb_df['norm_text'].tolist()

    # --- 4b. Run Original Embedding Computation ---
    print("\nComputing embeddings for all KB phrases... (This may take a moment)")
    KB_PHRASE_EMB = embedder.encode(KB_PHRASES, convert_to_tensor=True, normalize_embeddings=True)
    print("✅ Phrase embeddings computed.")

    # --- 4c. Save the new files to Google Drive ---
    print(f"\nSaving computed files to {KB_SAVE_DIR} for next time...")
    os.makedirs(KB_SAVE_DIR, exist_ok=True)

    # Save the DataFrame as a pickle file and the tensor with torch.save
    kb_df.to_pickle(DF_PATH)
    torch.save(KB_PHRASE_EMB, EMB_PATH)

    print(f"✅ Saved kb_dataframe.pkl and kb_embeddings.pt.")

print("\n✅ Knowledge Base and Embeddings are ready.")


Loading sentence-transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded.

Found pre-computed files in /content/drive/MyDrive/HalalKB.
Loading KB DataFrame and Embeddings from Google Drive...
✅ KB and Embeddings loaded. Total phrases: 39787

✅ Knowledge Base and Embeddings are ready.


In [None]:
# @title 7. Define OCR and Parsing Functions (Using Gemini)
# This replaces the old easyocr and split_ingredients functions.

def extract_text_from_image(image_path: str) -> str:
    """Uses the Gemini Vision model to perform OCR on a local image file.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text of the model response with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        Exception: Any error raised while calling the Gemini API or reading the
            response text is printed and then re-raised unchanged.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"The file '{image_path}' was not found.")
    print(f"👁️ Processing image with Gemini: {image_path}")
    # The context manager closes the underlying file handle once the request is
    # finished, whether it succeeds or fails.
    with Image.open(image_path) as img:
        model = genai.GenerativeModel("models/gemini-2.5-flash") # or "models/gemini-1.5-flash-latest"
        prompt = "Extract all text from this image exactly as it appears. Preserve line breaks and original formatting."
        try:
            response = model.generate_content([prompt, img])
            return response.text.strip()
        except Exception as e:
            print(f"An error occurred while calling the Gemini API: {e}")
            raise

def find_ingredients_block(full_text: str) -> str | None:
    """Uses regex to find the complete ingredients section in a block of text."""
    pattern = re.compile(r"ingredients.*?\.", re.IGNORECASE | re.DOTALL)
    match = pattern.search(full_text)
    return match.group(0).strip() if match else None

def parse_ingredients_list(ingredients_block: str) -> list[str]:
    """Cleans and parses a raw ingredients text block into a clean list of individual ingredients.

    Line breaks are flattened to spaces, the leading "Ingredients...:" or
    "Ingredients (...)" header is removed (including a colon that follows a
    parenthesised header), one trailing "." is dropped, and the remainder is
    split on commas that are not nested inside "()", "[]" or "{}".

    Args:
        ingredients_block: Raw ingredients text; may be empty or None.

    Returns:
        The non-empty, whitespace-trimmed ingredient strings in original order.
        An empty list for empty/None input.
    """
    if not ingredients_block: return []
    flat_text = ingredients_block.replace('\n', ' ')
    # `\)\s*:?` also swallows the colon of headers such as
    # "Ingredients (may contain): ..." so it does not leak into the first item.
    cleaned_text = re.sub(r'^ingredients.*?(?::|\)\s*:?)\s*', '', flat_text, flags=re.IGNORECASE | re.DOTALL)
    cleaned_text = cleaned_text.strip().removesuffix('.')

    ingredients = []
    depth = 0        # current bracket nesting level
    item_start = 0   # index where the current ingredient begins
    for pos, char in enumerate(cleaned_text):
        if char in '([{':
            depth += 1
        elif char in ')]}':
            # Clamp at zero so a stray closing bracket cannot disable splitting.
            depth = max(0, depth - 1)
        elif char == ',' and depth == 0:
            ingredients.append(cleaned_text[item_start:pos].strip())
            item_start = pos + 1
    ingredients.append(cleaned_text[item_start:].strip())
    return [ing for ing in ingredients if ing]

print("✅ Gemini-based OCR and parsing functions defined.")

✅ Gemini-based OCR and parsing functions defined.


In [None]:
# @title 8. Define NEW Classification Pipeline (Whole-List Matching)

def classify_product_by_semantic_match(ingredient_list: list, semantic_threshold=0.70):
    """
    Joins the ingredient list into a single string and finds the best semantic match
    in the entire Knowledge Base.

    Args:
        ingredient_list: Parsed ingredient strings for one product.
        semantic_threshold: Minimum cosine similarity for the best KB match to
            be trusted; below it the verdict is "Doubtful".

    Returns:
        For an empty list: a dict with 'final_verdict' ("Unknown") and 'reason'.
        Otherwise a dict with 'final_verdict', 'best_match_score' (Python
        float), 'matched_kb_text', 'matched_kb_status' and 'reason'.
    """
    if not ingredient_list:
        print("⚠️ Ingredient list is empty. Cannot classify.")
        return {"final_verdict": "Unknown", "reason": "No ingredients found."}

    # 1. Join the list of ingredients into a single string, just like the KB
    query_string = " ".join(ingredient_list)

    # 2. Normalize this query string in the exact same way as the KB
    normalized_query = normalize_text_for_matching(query_string)
    print(f"\n--- Matching against Normalized Query ---")
    print(normalized_query)
    print("-" * 35)

    # 3. Compute the embedding for the query string
    query_emb = embedder.encode([normalized_query], convert_to_tensor=True, normalize_embeddings=True)
    # The KB embeddings may have been loaded onto a different device than the
    # one the embedder encodes on; align them before comparing.
    query_emb = query_emb.to(KB_PHRASE_EMB.device)

    # 4. Find the best match in the KB using cosine similarity
    cos_scores = util.cos_sim(query_emb, KB_PHRASE_EMB)[0]
    scores_np = cos_scores.cpu().numpy()
    best_match_index = int(np.argmax(scores_np))
    # Plain float so the returned dict holds no numpy scalar.
    best_match_score = float(scores_np[best_match_index])

    # 5. Get the results from the KB DataFrame (positional lookup matches the embedding row)
    matched_row = kb_df.iloc[best_match_index]
    matched_text = matched_row['original_text']
    matched_status = matched_row['status']

    # 6. Determine the final verdict
    print("\n--- Semantic Match Result ---")
    print(f"Best Match Score: {best_match_score:.4f}")
    print(f"Best Match from KB: '{matched_text}'")
    print(f"Status of Best Match: {matched_status.upper()}")

    if best_match_score >= semantic_threshold:
        final_verdict = matched_status
        reason = f"High similarity ({best_match_score:.2f}) to a known '{matched_status}' product."
    else:
        final_verdict = "Doubtful"
        reason = f"Low similarity score ({best_match_score:.2f}). Closest match was '{matched_status}', but confidence is below threshold."

    print(f"\n✅ Final Verdict: {final_verdict.upper()}")

    return {
        "final_verdict": final_verdict,
        "best_match_score": best_match_score,
        "matched_kb_text": matched_text,
        "matched_kb_status": matched_status,
        "reason": reason
    }


def analyze_image_for_halal_status(image_path):
    """Run OCR, ingredient extraction, parsing and classification for one image.

    Returns the classification dict produced by
    ``classify_product_by_semantic_match``, or ``None`` when no ingredients
    block is found, nothing can be parsed from it, or any step raises (the
    error is printed, not propagated).
    """
    try:
        ocr_text = extract_text_from_image(image_path)
        print("\n--- Full Text Extracted by Gemini ---\n", ocr_text)

        block = find_ingredients_block(ocr_text)
        if not block:
            print("\n❌ ERROR: Could not find an 'Ingredients...' block in the image text.")
            return None
        print("\n--- Found Ingredients Block ---\n", block)

        parsed_items = parse_ingredients_list(block)
        if not parsed_items:
            print("\n❌ ERROR: Failed to parse any ingredients from the block.")
            return None

        print("\n--- Parsed Ingredient List ---")
        position = 0
        for entry in parsed_items:
            position += 1
            print(f"{position}. {entry}")

        # Whole-list semantic match against the knowledge base.
        verdict = classify_product_by_semantic_match(parsed_items)
        return verdict
    except Exception as e:
        print(f"\n❌ An unexpected error occurred during analysis: {e}")
        return None

print("✅ NEW whole-list classification pipeline defined.")

✅ NEW whole-list classification pipeline defined.


In [None]:
# @title 8.5. Load Trained Halal Logo Detector
from ultralytics import YOLO
import os

# ⬇️ IMPORTANT: point this at the 'best.pt' weights file written at the end of
# the training step.
SAVED_MODEL_PATH = "/content/drive/MyDrive/HalalLogoDetector/train_run_1/weights/best.pt"

# `logo_model` is None when the weights file is missing.
if not os.path.exists(SAVED_MODEL_PATH):
    print(f"❌ ERROR: Model file not found at '{SAVED_MODEL_PATH}'")
    print("Please train the model (Part 1) or check the file path.")
    logo_model = None
else:
    print(f"Loading logo detector model from: {SAVED_MODEL_PATH}")
    logo_model = YOLO(SAVED_MODEL_PATH)
    print("✅ Halal logo detector model loaded successfully.")

Loading logo detector model from: /content/drive/MyDrive/HalalLogoDetector/train_run_1/weights/best.pt
✅ Halal logo detector model loaded successfully.


In [None]:
# @title 9. ▶️ Run Full Analysis (Logo Check First)

# --- This is the new logo-checking function ---
def check_for_halal_logo(image_path, model, confidence_threshold=0.5):
    """
    Run the detector on one image and report whether any detection clears the
    confidence threshold.

    Returns False straight away when ``model`` is None. Otherwise the model is
    called on ``image_path``, the boxes of its first result are scanned in
    order, and True is returned at the first box whose confidence is strictly
    greater than ``confidence_threshold``; False if none qualifies.
    """
    if model is None:
        print("⚠️ Logo model is not loaded. Skipping logo check.")
        return False

    print(f"🔎 Scanning for Halal logo in: {image_path}")
    # verbose=False is passed through to the model call.
    detections = model(image_path, verbose=False)[0].boxes

    for detection in detections:
        if detection.conf[0] > confidence_threshold:
            print(f"✅ FOUND HALAL LOGO with {detection.conf[0]:.2f} confidence!")
            return True

    # Nothing cleared the threshold.
    print("⚠️ No certified Halal logo found.")
    return False

# --- This is the main execution block ---
def _print_final_summary(verdict, reason):
    """Print the closing verdict banner shared by every analysis outcome."""
    print("\n\n---  Final Analysis Summary ---")
    print(f"Final Verdict: {verdict}")
    print(f"Reason: {reason}")
    print("--- --- ---")


print("Please upload the food label image you want to analyze.")
uploaded_image = files.upload()
if not uploaded_image:
    # A cancelled upload leaves nothing to index into; fail with a clear
    # message instead of an opaque IndexError.
    raise ValueError("No image was uploaded. Re-run this cell and choose a food label image.")
# Only the first uploaded file is analysed.
image_to_analyze_path = next(iter(uploaded_image))

print("\n" + "="*50)
print(f"🚀 Starting Full Analysis for: {image_to_analyze_path}")
print("="*50)

# ==========================================================
# STEP 1: CHECK FOR HALAL LOGO
# ==========================================================
logo_found = check_for_halal_logo(
    image_to_analyze_path,
    logo_model,
    confidence_threshold=0.5 # You can adjust this (0.0 to 1.0)
)

if logo_found:
    _print_final_summary(
        "HALAL",
        "A certified Halal logo was detected on the packaging.",
    )
else:
    # ==========================================================
    # STEP 2: NO LOGO FOUND, PROCEED WITH TEXT ANALYSIS
    # ==========================================================
    print("\nProceeding with ingredient text analysis...")

    analysis_result = analyze_image_for_halal_status(
        image_to_analyze_path
    )

    if analysis_result:
        _print_final_summary(
            analysis_result['final_verdict'].upper(),
            analysis_result['reason'],
        )
    else:
        _print_final_summary(
            "UNKNOWN",
            "Logo check failed and text analysis could not be completed.",
        )

Please upload the food label image you want to analyze.


Saving png-transparent-junk-food-martin-s-potato-chips-flavor-seasoning-box-barbecue-food-sweetness.png to png-transparent-junk-food-martin-s-potato-chips-flavor-seasoning-box-barbecue-food-sweetness.png

🚀 Starting Full Analysis for: png-transparent-junk-food-martin-s-potato-chips-flavor-seasoning-box-barbecue-food-sweetness.png
🔎 Scanning for Halal logo in: png-transparent-junk-food-martin-s-potato-chips-flavor-seasoning-box-barbecue-food-sweetness.png
⚠️ No certified Halal logo found.

Proceeding with ingredient text analysis...
👁️ Processing image with Gemini: png-transparent-junk-food-martin-s-potato-chips-flavor-seasoning-box-barbecue-food-sweetness.png

--- Full Text Extracted by Gemini ---
 MARTIN'S
0
Trans Fat
Guaranteed
Fresh!

Nutrition Facts
Serving Size 1 oz. (28g/About 10 chips)
Servings Per Container about 10
Amount Per Serving
Calories 150 Calories from Fat 80
                                              % Daily Value*
Total Fat 9g                                      