In [75]:
import easyocr
import fitz  # PyMuPDF
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt
from groq import Groq
import os
from dotenv import load_dotenv


from langchain_groq import ChatGroq
from langchain_community.embeddings import OllamaEmbeddings, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders.csv_loader import CSVLoader


from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_ollama.llms import OllamaLLM
from typing import Optional

In [73]:
# Define the paths for the directories
directories = [
    "../outputs/extracted_images",
    "../outputs/extracted_text",
    "../outputs/extracted_products"
]

# Create the directories if they do not exist
for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")
    else:
        print(f"Directory already exists: {directory}")

Directory already exists: ../outputs/extracted_images
Directory already exists: ../outputs/extracted_text
Directory already exists: ../outputs/extracted_products


In [74]:

load_dotenv()

## load the Groq API key
groq_api_key=os.environ['GROQ_API_KEY']
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")


In [None]:
def convert_pdf_to_images(pdf_path, output_folder):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Iterate through each page
    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document.load_page(page_num)
        
        # Get the page's pixmap (image)
        pix = page.get_pixmap()
        
        # Convert the pixmap to a PIL Image
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        
        # Save the image
        image_path = f"{output_folder}/page_{page_num + 1}.png"
        image.save(image_path)
        print(f"Saved page {page_num + 1} as {image_path}")

    print("All pages converted to images.")

In [None]:
def overlay_extracted_text_on_image(image_path, extract_info):

    # Load the image using OpenCV
    image = cv2.imread(img_path)

    # Iterate through the extracted information
    for el in extract_info:
        # el[0] contains the bounding box coordinates
        # el[1] contains the text
        # el[2] contains the confidence score
        bbox = el[0]
        text = el[1]
        confidence = el[2]
        
        # Convert the coordinates from list of tuples to integer
        bbox = [(int(x), int(y)) for (x, y) in bbox]
        
        # Draw the bounding box on the image
        cv2.polylines(image, [np.array(bbox)], isClosed=True, color=(0, 255, 0), thickness=2)
        
        # Draw the text on the image
        cv2.putText(image, f"{text} ({confidence:.2f})", (bbox[0][0], bbox[0][1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    # Convert image from BGR to RGB for display with matplotlib
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Display the image with bounding boxes and text using matplotlib
    plt.figure(figsize=(30, 15))  # Adjust size as needed
    plt.imshow(image_rgb)
    plt.axis('off')  # Hide axes
    plt.show()

In [None]:
def extract_text_from_image(image_path, extracted_text_path, lang = "fr"):
    reader = easyocr.Reader([lang])
    extract_info = reader.readtext(image_path)

    extracted_text = ""
    for el in extract_info:
        extracted_text += el[1] + "\n"

    # Save the text to a file
    with open(extracted_text_path, 'w') as file:
        file.write(extracted_text)

## Extract pages from flyer and run OCR on each page

In [None]:
pdf_path = '../data/flyer.pdf'
extracted_images_folder = '../outputs/extracted_images'
convert_pdf_to_images(pdf_path, extracted_images_folder)

In [None]:
for page_num in range(26):
    image_path = f"../outputs/extracted_images/page_{page_num + 1}.png"
    extracted_text_path = f"../outputs/extracted_text/page_e_{page_num + 1}.txt"
    extract_text_from_image(image_path, extracted_text_path, 'en')

## Extract product using local model or runing on Groq

In [78]:
flyer_content = []
for page_num in range(26):
    file_path = f"../outputs/extracted_text/page_e_{page_num + 1}.txt"
    with open(file_path, 'r') as file:
        flyer_content.append(file.read())

In [79]:
len(flyer_content)


26

In [87]:
model_local = OllamaLLM(model="llama3",
                   temperature=0.0)

model_groq=ChatGroq(groq_api_key=groq_api_key,
             model_name="llama-3.1-70b-versatile",
             temperature=0.0)


# Define your desired data structure.
class ExtractedProducts(BaseModel):
    food_products: list[str] = Field(description="List of all the food products in English.")
    other_products: Optional[list[str]] = Field(description="List of any products other than food products in English.")
    reasonings: Optional[str] = Field(description="Reasoning for the categorization.", default=None)


# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=ExtractedProducts)


template = """Your task is to analyze the text and extract food items and other products mentioned in English.
Translate any non-English items to English. Only include items that can be used for cooking or consumption in the 'food_products' list. Exclude any non-food items like cleaning products, clothes, or any other items that cannot be used for cooking.

**Food products** are items that are edible or can be used in preparing meals. Examples include:
- Fruits and vegetables (e.g., 'Apple', 'Carrot')
- Dairy products (e.g., 'Milk', 'Cheese')
- Meat and poultry (e.g., 'Chicken', 'Beef')
- Grains and cereals (e.g., 'Rice', 'Oats')

**Non-food products** include items that are not used for cooking or consumption. Examples include:
- Cleaning products (e.g., 'Detergent', 'Cleaner')
- Clothing (e.g., 'Shirt', 'Jacket')
- Miscellaneous non-food items (e.g., 'Packaging', 'Discount Coupons')

Provide your response in the following format:

{format_instructions}

Text to analyze:
{query}
"""


prompt = PromptTemplate(
    template=template,
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

prompt_and_model = prompt | model_groq

for i in range(0,len(flyer_content)):
    print(f"processing page_{i + 1}")
    extracted_products_path = f"../outputs/extracted_products/page_{i + 1}.txt"
    output = prompt_and_model.invoke({"query": flyer_content[i]})
    extracted_product_p = parser.invoke(output)
    print(extracted_product_p)

    # Save the text to a file
    with open(extracted_products_path, 'w') as file:
        file.write(", ".join(extracted_product_p.food_products))


processing page_12
food_products=['Shortbread', 'Tart', 'Fruits', 'Strawberries'] other_products=[] reasonings="The food products were identified based on their English translations and their categorization as edible items. 'Boulangerie' is a bakery, but it is not a food product itself. 'Tarte sablee' is a type of tart, and 'fruits frais' translates to 'fresh fruits'. 'Fraises' is the French word for 'strawberries'. 'Shortbread' and 'tart' are direct English translations of the French text."


In [88]:
extracted_product = []
for page_num in range(26):
    file_path = f"../outputs/extracted_products/page_{page_num + 1}.txt"
    with open(file_path, 'r') as file:
        extracted_product.append(file.read())
        
for i in range(len(extracted_product)):
  print("*"*20, "page ", i+1, "*"*20)
  print(extracted_product[i])


******************** page  1 ********************
Ketchup, Tomato Ketchup, Gelato, Ice Cream, Pork Back Ribs, Veal, Pork, Natural Spring Water, Pepsi, Coca-Cola, Hamburger, Hot Dog, Sausages, Bagels, Muffins, Iceberg Lettuce, Broccoli, Blueberries, Corona Beer, Sapporo Beer
******************** page  2 ********************
BACON, SAUCISSES FRAICHES, TOMATES ROUGES DE SERRE, GREENHOUSE RED TOMATOES, WHITE EGGS, PORK & BEEF, YOGOURT, SKYR, CEREALES, JUS DE FRUITS, FRUIT JUICE, NECTAR DE FRUITS, PIZZA, PAIN TRANCHE, REPS, GOURMET STEAMERS, PURE BREAKFAST, FROZEN THIN CRUST PIZZA, SLICED BREAD, MUFFIN, CORN, CARRES DE RIZ, TARTINADE, NUTELLA, FRUIT-O-LONG, FRUITSATIONS, RICE, TORTILLAS, BOISSON, ALCOOLISEE, BUD LIGHT CHELADA, WHITE CLAW, ALCOHOLIC MALT, BEVERAGE, CLAMATO
******************** page  3 ********************
Beer, Watermelon, Cherries, Ice Cream, Frozen Yogurt, Gnocchi, Pasta Sauce, Pesto, Muffins, Skillet, Red Cherries, Melon, Vanille, Cerises Rouges, Bud Light, Molson Ultra, 

## Recommand recepies using Groq

In [3]:


file_path = ( "../data/recipes/raw_unique_recipes.csv")

loader = CSVLoader(file_path=file_path)
recipes_data = loader.load()



In [4]:
len(recipes_data)

231637

In [6]:
for record in recipes_data[:200]:
    print()
    print("*"*20)
    print(record)


********************
page_content='name: arriba   baked winter squash mexican style
id: 137739
minutes: 55
contributor_id: 47892
submitted: 2005-09-16
tags: ['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']
nutrition: [51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]
n_steps: 11
steps: ['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be car

In [10]:
#embeddings=OllamaEmbeddings(model="llama3")
embeddings=OpenAIEmbeddings()
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200) 
final_documents=text_splitter.split_documents(recipes_data[:200])
vectors=FAISS.from_documents(final_documents,embeddings) 


In [106]:
llm=ChatGroq(groq_api_key=groq_api_key,
             model_name="llama-3.1-70b-versatile",
             temperature=0.0)


prompt=ChatPromptTemplate.from_template(
"""
You job is to find recipes on the provided context only.
Please find recipes from the provided context that use the given ingredients. 
If any ingredients are missing from the list, include them in the response. 
The recipes must be selected from the context only.


<context>
{context}
<context>
Questions:{input}

"""
)





prompt = ChatPromptTemplate.from_template(
"""
Your job is to find recipes from the provided context using the given ingredients.

1. **Handle the Ingredients**:
   - If there are many ingredients, prioritize the most relevant ones.
   - Filter out any non-food items before processing.

2. **Find Recipes**:
   - Search for recipes that use the provided ingredients.
   - If a recipe needs additional ingredients, include those as well.

3. **Provide Details**:
   - For each recipe, include any additional ingredients needed, the total calories, and the preparation time.
   - Ensure that all recipes are selected from the context only.

<context>
{context}
<context>
Questions: {input}
"""
)





In [107]:
cleaned_products = """ 
ground beef, zereshk, rice, mango, potato, goat, cranberry, Nectarines, Avocados, beans
"""

cleaned_products = extracted_product[8] + extracted_product[20]


In [91]:
cleaned_products

'Nectarines, Peaches, Apricots, Plums, Avocados, Pears, Pitahaya, Rambutans, Cherries, Ground Cherries, Passion FruitCREAM-STYLE CORN, SPAGHETTINI, VEGETABLES, PASTA, MAYONNAISE, COMPOTE DE FRUITS, COMPOTE, CHICKEN NOODLE, RICE PUDDING, SOUP, SOUR CREAM, COTTAGE CHEESE, REFRIGERATED PUDDING, JUS, BOISSON, COCKTAIL AUX FRUITS, THE GLACE, FRUIT, LASAGNA, POULET, MOZZARELLA, TACOS, FROZEN MEAL, CHEESE, MEAL KIT, SALSA, MIXED LEGUMES, NOUILLES, BABYBEL, CHICKEN'

In [108]:
user_message = f"""
I would like to find 3 different recipes using the following ingredients: {cleaned_products}. 
Please extract recipes directly from the provided context and include any additional ingredients required. 
Also, provide the total calories and preparation time for each recipe. If any of the ingredients are not used in the recipes, list them separately.
"""

user_message = f"""
Here is a list of ingredients: {cleaned_products}. 

1. **Prioritize**: Focus on the most relevant ingredients if the list is too long.
2. **Find Recipes**: Look for recipes in the context that use these ingredients.
3. **Provide Recipe Details**: Include any additional ingredients needed, the total calories, and the preparation time for each recipe.

If there are non-food items or less relevant ingredients, please ignore or filter them out.
"""



document_chain=create_stuff_documents_chain(llm,prompt)
retriever=vectors.as_retriever()
retrieval_chain=create_retrieval_chain(retriever,document_chain)
response=retrieval_chain.invoke({'input':user_message})

In [109]:
for key, value in response.items():
    print()
    print("*"*20)
    print(key)
   # print(value)


********************
input

********************
context

********************
answer


In [110]:
response["context"]

[Document(metadata={'source': '../data/recipes/raw_unique_recipes.csv', 'row': 132}, page_content="ingredients: ['stewing beef', 'stewing pork', 'white onion', 'bell peppers', 'habanero pepper', 'garlic', 'beans', 'chunky salsa', 'tomato paste', 'beef broth', 'tortilla chips', 'chicken bouillon cube', 'beef bouillon cube', 'sazon goya', 'cinnamon', 'mexican chili powder', 'cumin', 'ground coriander', 'black pepper', 'salt', 'light brown sugar', 'dark chocolate chips']\nn_ingredients: 22"),
 Document(metadata={'source': '../data/recipes/raw_unique_recipes.csv', 'row': 69}, page_content="description: this is from\ningredients: ['carrots', 'butter', 'onion', 'sliced mushrooms', 'zucchini', 'celery', 'green pepper', 'cloves', 'tomatoes', 'chili powder', 'ground cumin', 'oregano', 'cayenne pepper', 'flour', 'water', 'monterey jack cheese', 'salt', 'all-purpose flour', 'baking powder', 'vegetable shortening', 'soy sauce', 'egg']\nn_ingredients: 22"),
 Document(metadata={'source': '../data/re

In [111]:
for answer in response["answer"].split("\n\n"):
    print(answer)

Based on the provided ingredients, I will prioritize the most relevant ones and filter out any non-food items. Here are the top ingredients:
* Fruits: Nectarines, Peaches, Apricots, Plums, Avocados, Pears, Pitahaya, Rambutans, Cherries, Ground Cherries, Passion Fruit
* Other relevant ingredients: Mayonnaise, Sour Cream, Cottage Cheese, Salsa, Cheese
After searching the context, I found a few recipes that use these ingredients. Here are the results:
**Recipe 1: Fruit Compote**
* Ingredients: Nectarines, Peaches, Apricots, Plums, Pears, Cherries
* Additional ingredients: Sugar, Water
* Total calories: approximately 150-200 per serving
* Preparation time: 15-20 minutes
This recipe is a simple fruit compote that can be served as a dessert or a topping for yogurt or oatmeal.
**Recipe 2: Avocado and Sour Cream Salad**
* Ingredients: Avocados, Sour Cream, Mayonnaise
* Additional ingredients: Salt, Pepper, Lemon juice
* Total calories: approximately 300-400 per serving
* Preparation time: 10-1

In [34]:
for doc_source in response["context"]:
    print(doc_source)

page_content='ingredients: ['stewing beef', 'stewing pork', 'white onion', 'bell peppers', 'habanero pepper', 'garlic', 'beans', 'chunky salsa', 'tomato paste', 'beef broth', 'tortilla chips', 'chicken bouillon cube', 'beef bouillon cube', 'sazon goya', 'cinnamon', 'mexican chili powder', 'cumin', 'ground coriander', 'black pepper', 'salt', 'light brown sugar', 'dark chocolate chips']
n_ingredients: 22' metadata={'source': '../data/recipes/raw_unique_recipes.csv', 'row': 132}
page_content='ingredients: ['salad greens', 'green onions', 'salt', 'pepper', 'salad seasoning', 'carrots', 'fresh corn kernels', 'green peppers', 'beets', 'white mushrooms', 'gingerroot', 'rice wine vinegar', 'honey', 'garlic clove', 'soy sauce', 'safflower oil']
n_ingredients: 16' metadata={'source': '../data/recipes/raw_unique_recipes.csv', 'row': 184}
page_content='description: this is from
ingredients: ['carrots', 'butter', 'onion', 'sliced mushrooms', 'zucchini', 'celery', 'green pepper', 'cloves', 'tomatoes