In [None]:
# prompt: mount someone else drive
# Colab-generated cell: mounts Google Drive into the runtime at /content/drive
# so files stored there can be read by the cells below.

from google.colab import drive
# force_remount=True is passed so the mount is redone even if one already
# exists (presumably to switch accounts -- confirm against the Colab docs).
drive.mount('/content/drive', force_remount=True)

In [None]:
!pip install keras_cv paddleocr metaphone rapidfuzz paddlepaddle-gpu albumentations python-dateutil google-generativeai

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import layers
from keras_cv import layers as layers_cv
from io import BytesIO
from PIL import Image
import numpy as np
import base64
from paddleocr import PaddleOCR
import traceback
import time
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import cv2
from metaphone import doublemetaphone
from rapidfuzz import process, fuzz
import spacy
import re
from datetime import datetime
from dateutil import parser
import google.generativeai as genai

# Set the device: use the GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Segment model: a custom YOLOv5 checkpoint fetched through torch.hub and
# used further down to produce item bounding boxes.
segment_model = torch.hub.load('ultralytics/yolov5', 'custom', path='full_50_2_640.pt').to(device)

# Login to Huggingface
# NOTE(review): YOUR_HUGGINGFACE_TOKEN is a placeholder name; it must be
# defined (or replaced) before this cell runs, otherwise it raises NameError.
login(token=YOUR_HUGGINGFACE_TOKEN)

# System instruction for the Gemini matcher. The no-match sentinel must be
# exactly "No Result found": predict_product_name's prompt asks for that string
# and the OCR loop compares against it to decide whether to fall back to
# fuzzy_match. (It previously asked for 'input string', contradicting both.)
instruction = (
    "You are an expert in fuzzy string matching. Given a list of product names, your task is to identify "
    "the product name that most closely matches a specified input string. Perform a strict fuzzy match, "
    "ensuring to output only one product name from the list. If no clear match is found, return 'No Result found'. "
    "Your output should be a single line with no extra text, explanations, or formatting."
)

# Gemma Model: half-precision instruction-tuned checkpoint used by predict_answer.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-1.1-2b-it")
gemma = AutoModelForCausalLM.from_pretrained(
    "google/gemma-1.1-2b-it",
    torch_dtype=torch.float16,
    revision="float16"
).to(device)

# Load the pre-trained SpaCy model (its DATE entities are used for expiry extraction).
nlp = spacy.load('en_core_web_sm')

# Setup Google Generative AI
# NOTE(review): YOUR_API_KEY is a placeholder name, same caveat as the token above.
genai.configure(api_key=YOUR_API_KEY)
# `model` is the Gemini text client used by predict_product_name; it is not a
# Keras model and has no .predict().
model = genai.GenerativeModel('gemini-1.5-flash', system_instruction=instruction)

# Load the EfficientNetB2 model for freshness vs. rotten prediction
# Backbone: ImageNet-initialised EfficientNetB2 without its classification
# head, taking 260x260 3-channel inputs.
freshness_base_model = keras.applications.EfficientNetB2(
    include_top=False,
    weights="imagenet",
    input_shape=(260, 260, 3)
)
# Custom head: flatten the backbone output, two ReLU dense layers, then a
# 2-way softmax.
x = layers.Flatten()(freshness_base_model.output)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dense(512, activation='relu')(x)
predictions = layers.Dense(2, activation='softmax')(x)
freshness_model = keras.Model(inputs=freshness_base_model.input, outputs=predictions)

# # Load the weights for the freshness vs. rotten model
# freshness_model.load_weights("fresh_vs_rotten_v1a.weights.h5")
# NOTE(review): with the load_weights call above commented out, the dense head
# keeps its initial weights -- confirm this is intended before trusting scores.

# # Load the product vs. rotten model
# model = keras.models.load_model('fresh_vs_rotten_product.keras')
# NOTE(review): re-enabling the line above would rebind `model`, which is also
# the name given to the Gemini client created earlier in this file.

# Initialize PaddleOCR
# English recognition with the angle classifier enabled, requested on GPU.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True, rec_algorithm="SVTR_LCNet", ocr_version='PP-OCRv4', use_space_char=True)

# Load the product list CSV
df = pd.read_csv("product_list.csv")

# Lower-cased catalogue names. Rows with a missing 'Product Name' are dropped
# first: they would otherwise come through as float NaN and break the string
# handling below (str methods, doublemetaphone, fuzzy matching).
product_list = df['Product Name'].dropna().astype(str).str.lower().to_list()

# Matching vocabulary: the catalogue (already lower-case) plus a few
# hand-added names. Built as a new list so product_list itself is unchanged.
word_list = product_list + ['amul malai paneer', 'borges durum wheat pasta', 'wheat pasta', 'fantastic bathroom']

# Precompute Metaphone codes for the word list (primary code only)
metaphone_codes = {word: doublemetaphone(word)[0] for word in word_list}

# Convert word list to set for exact matching
word_set = set(word_list)

def fuzzy_match(product_name):
    """Return the closest entry of ``word_list`` to *product_name*.

    The candidate is scored with rapidfuzz's ``WRatio``; it is accepted only
    when the score is strictly above 80, otherwise the sentinel string
    ``"No Result found"`` is returned.
    """
    candidate, similarity, _position = process.extractOne(
        product_name, word_list, scorer=fuzz.WRatio
    )
    return candidate if similarity > 80 else "No Result found"

def predict_product_name(ocr_text):
    """Ask the Gemini client which ``word_list`` entry best matches OCR text.

    Args:
        ocr_text: Text recognised on a product crop.

    Returns:
        The model's single-line answer with surrounding whitespace removed,
        or ``"No Result found"`` when the input is blank or the response
        carries no usable text. Callers treat that sentinel as the cue to
        fall back to ``fuzzy_match``.
    """
    # Nothing to match: skip the API round-trip entirely.
    if not ocr_text or not ocr_text.strip():
        return "No Result found"

    # If the product name is more than 5 words, truncate it
    words = ocr_text.split()
    truncated_name = ' '.join(words[:5]) if len(words) > 5 else ocr_text

    # Use Generative AI to match the product name
    query = f"""
I have a list of product names: {word_list}. I want to find the product name from this list that most closely matches the following input: "{truncated_name}".

Your task is to perform a strict fuzzy match, ensuring the output is only one product name from the list. If no clear match is found, return "No Result found".

Output only the closest product name in a single line, with no extra text, explanations, or formatting. Do not include anything other than the product name itself or "No Result found" if there is no suitable match.
"""

    response = model.generate_content(query)
    try:
        answer = response.text
    except ValueError:
        # The .text accessor raises ValueError when the response has no text
        # part (for example a blocked or empty candidate). Report "no match"
        # so the caller's fuzzy fallback still runs.
        return "No Result found"
    return answer.strip()

def calculate_overlap_percentage(xa1, xa2, ya1, ya2, xb1, xb2, yb1, yb2):
    """Return how much of rectangle B is covered by rectangle A, in percent.

    Each rectangle is given as ``(x1, x2, y1, y2)``. The result is the area of
    the intersection divided by the area of B, times 100. If B has zero area
    the function returns ``0``.
    """
    area_b = (xb2 - xb1) * (yb2 - yb1)
    if area_b == 0:
        return 0

    # Edges of the intersection rectangle (only meaningful if they don't cross).
    left, right = max(xa1, xb1), min(xa2, xb2)
    top, bottom = max(ya1, yb1), min(ya2, yb2)

    intersects = left < right and top < bottom
    shared_area = (right - left) * (bottom - top) if intersects else 0
    return (shared_area / area_b) * 100

def predict_answer(ocr_text, question):
    """Ask the Gemma model for a product name given OCR text and a question.

    Returns a fixed message when *ocr_text* is the empty string. Otherwise the
    prompt is generated greedily (``do_sample=False``), the last line of the
    decoded output is taken as the answer, and answers longer than five words
    are reduced to their first five digit-free words.
    """
    if ocr_text == "":
        return "No text detected from the image."

    encoded = tokenizer(
        f"{question} Text: {ocr_text}. Please respond with ONLY the product name.",
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        generated = gemma.generate(
            input_ids=encoded['input_ids'],
            max_length=500,
            num_return_sequences=1,
            do_sample=False,
        ).to(device)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    last_line = decoded.strip().split("\n")[-1]
    tokens = last_line.split()

    # NOTE(review): answers of five words or fewer are returned as-is, so any
    # digit-bearing words in them are kept -- confirm that is intended.
    if len(tokens) <= 5:
        return last_line.strip()

    # Longer answers: drop words containing a digit, then keep the first five.
    digit_free = [tok for tok in tokens if not re.search(r'\d', tok)]
    return ' '.join(digit_free[:5]).strip()

# Step 1: Read the image file
image_path = 'imag1.jpg'
image2 = cv2.imread(image_path)
if image2 is None:
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with a clear message instead of further down.
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Run object detection
# OpenCV decodes to BGR, while the YOLOv5 hub wrapper expects RGB for array
# inputs, so the channel axis is reversed for inference only. image2 itself
# stays BGR for the cv2-based cropping and saving below.
results = segment_model(image2[..., ::-1])
predictions = results.pred[0]

# Extract bounding box coordinates
# Each detection row is (x1, y1, x2, y2, confidence, class); boxes are stored
# as (x1, x2, y1, y2), the argument order calculate_overlap_percentage takes.
coordinates = [
    (int(x1), int(x2), int(y1), int(y2))
    for x1, y1, x2, y2, _conf, _cls in predictions.tolist()
]

correct = []
# Filter overlapping boxes
# A box is dropped when more than 80% of its own area lies inside some other
# remaining box. Removal is sequential: after a pop the same index is
# re-examined, so of two near-identical boxes the later one survives.
i = 0
while i < len(coordinates):
    box = coordinates[i]
    is_nested = any(
        calculate_overlap_percentage(*other, *box) > 80
        for j, other in enumerate(coordinates)
        if j != i
    )
    if is_nested:
        coordinates.pop(i)
    else:
        i += 1

product_image_paths = []
grocery_image_paths = []

# The product-vs-grocery classifier is not loaded in this notebook (its
# load_model call is commented out), and the global `model` is the Gemini text
# client, which has no .predict(). Calling it here raised AttributeError, so the
# classifier is now an explicit, optional hook.
# TODO: set this to keras.models.load_model('fresh_vs_rotten_product.keras')
# to enable real routing.
product_classifier = None

# Save cropped images and predict freshness
for i, (x1, x2, y1, y2) in enumerate(coordinates):
    cropped_image = image2[y1:y2, x1:x2]
    if cropped_image.size == 0:
        # Degenerate box (zero width or height after int truncation):
        # there is nothing to encode, so skip it.
        continue

    crop_path = f'image_{i}.jpg'
    cv2.imwrite(crop_path, cropped_image)

    preds = None
    if product_classifier is not None:
        # Load and preprocess the cropped image for prediction
        img = keras.utils.load_img(crop_path, target_size=(260, 260))
        img_array = np.array([keras.utils.img_to_array(img)])
        preds = product_classifier.predict(img_array)

    # Routing is hard-coded: every crop is treated as a packaged product (0).
    # NOTE(review): derive this from preds once the classifier is restored.
    first_determination = 0
    print(f"Predictions for image {i}: {preds}, Determination: {first_determination}")

    if first_determination == 0:
        product_image_paths.append(crop_path)
    else:
        grocery_image_paths.append(crop_path)
# Initialize a dictionary to hold products and their expiry dates with counts
# Shape: {product_name: {'count': int, 'expiry_date': 'DD-MM-YYYY' or NO_DATE}}
products_dict = {}

NO_MATCH = "No Result found"     # sentinel returned by both matchers
NO_DATE = "No valid date found"  # placeholder stored when no date was parsed
EXPIRY_FORMAT = '%d-%m-%Y'       # format used for stored expiry dates

# Compiled once instead of on every image.
# NOTE(review): numeric dates are parsed with dateutil's default (month-first
# when ambiguous); pass dayfirst=True if labels are day-first -- confirm.
DATE_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(?:exp(?:iry)?[: ]?)\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',  # Match dates after "exp" or "expiry"
        r'(?:due[: ]?)\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',          # Match dates after "due"
        r'(?:before\s?)\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',         # Match dates after "before"
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',                     # 25/12/2024 or 25-04-2024
        r'\b(\d{4}[.-]\d{1,2}[.-]\d{1,2})\b',                       # 2024.12.25
        r'\b(\d{1,2}[.-]\d{1,2}[.-]\d{4})\b',                       # 12.25.2024 or 25-04-2024
        r'\b(\w{3} \d{1,2}, \d{4})\b',                              # Dec 25, 2024
        r'\b(\d{1,2} \w{3,9} \d{4})\b',                             # 25 December 2024
    )
]


def _ocr_text(ocr_result):
    """Join every recognised line of a PaddleOCR result into one string."""
    pieces = []
    # The result (or an individual page) is empty/None when nothing was found.
    for page in ocr_result or []:
        if page:
            pieces.extend(line[1][0] for line in page)  # line[1][0] is the text
    return " ".join(pieces)


def _extract_date_strings(text):
    """Collect candidate date strings via the regex patterns and spaCy DATE entities."""
    found = []
    for pattern in DATE_PATTERNS:
        found.extend(pattern.findall(text))
    found.extend(ent.text for ent in nlp(text).ents if ent.label_ == 'DATE')
    # De-duplicate while keeping first-seen order (stable output between runs).
    return list(dict.fromkeys(found))


def _latest_date(date_strings):
    """Parse the candidates and return the latest datetime, or None if none parse."""
    parsed = []
    for date_str in date_strings:
        try:
            # Drop any timezone so naive and aware results stay comparable.
            parsed.append(parser.parse(date_str).replace(tzinfo=None))
        except (ValueError, OverflowError):
            print(f"Could not parse date: {date_str}")
    return max(parsed, default=None)


def _record_product(product_name, max_date):
    """Fold one detection (name plus optional expiry date) into products_dict."""
    formatted = max_date.strftime(EXPIRY_FORMAT) if max_date else None
    entry = products_dict.get(product_name) if product_name != NO_MATCH else None

    if entry is not None:
        entry['count'] += 1
        if max_date:
            # Read the stored date back with the exact format it was written
            # in; a generic parser could swap day and month for e.g. 05-03-2024.
            try:
                existing = datetime.strptime(entry['expiry_date'], EXPIRY_FORMAT)
            except ValueError:
                existing = None  # entry still holds the NO_DATE placeholder
            if existing is None or max_date > existing:
                entry['expiry_date'] = formatted
        return

    # Treat it as the same product if expiry dates match
    if formatted is not None:
        for info in products_dict.values():
            if info['expiry_date'] == formatted:
                info['count'] += 1
                return

    # Unmatched crops never get their own entry: NO_MATCH is not a product.
    if product_name != NO_MATCH:
        products_dict[product_name] = {
            'count': 1,
            'expiry_date': formatted or NO_DATE,
        }


# Perform OCR on cropped images
for path in product_image_paths:
    text = _ocr_text(ocr.ocr(path, cls=True))

    # Reset per image so an empty OCR result cannot reuse the previous name.
    product_name = NO_MATCH
    if text:
        print(text)
        # Now pass the extracted text to predict the product name
        product_name = predict_product_name(text.strip().lower())
        print(f"Predicted Product Name: {product_name}")
        # If the model found no match, fall back to the local fuzzy matcher
        if product_name == NO_MATCH:
            product_name = fuzzy_match(text.strip().lower())
            print(f"Fuzzy Matched Product Name: {product_name}")

    # Extract dates from the OCR text and keep the latest one as the expiry
    unique_dates = _extract_date_strings(text)
    print(f"{path}:{unique_dates}")
    max_date = _latest_date(unique_dates)

    # Update the products_dict with product name and expiry date
    _record_product(product_name, max_date)


# Display the results
for product, info in products_dict.items():
    print(f"Product: {product}, Count: {info['count']}, Expiry Date: {info['expiry_date']}")

# Score each grocery crop with the freshness classifier. The global `model` is
# the Gemini text client and has no .predict(), so freshness_model (the
# EfficientNetB2-based 2-way softmax network) is used here.
for path in grocery_image_paths:
    img = keras.utils.load_img(path, target_size=(260, 260))
    img_arr = np.array([keras.utils.img_to_array(img)])
    preds = freshness_model.predict(img_arr)
    print(preds)
    print(np.argmax(preds))
    # Second softmax output as a percentage.
    # NOTE(review): presumably index 1 is the "fresh" class -- confirm
    # against the training label order.
    freshness_index = preds[0][1] * 100
    print(f"Freshness Index : {freshness_index}")
