In [3]:
 !pip install sentence-transformers transformers datasets scikit-learn

Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df = pd.read_csv('/content/McDonaldsMenuNutritionV2.csv')  # Adjust the path to your dataset

# Check the dataset structure
print(df.head())

# Combine the item name and nutrition information to create a context
df['context'] = df['Item'] + ": " + df['Calories'].astype(str) + " calories, " + \
                df['Total Fat (g)'].astype(str) + "g total fat, " + \
                df['Protein (g)'].astype(str) + "g protein."

# Load the Sentence Transformer model for embeddings
retrieval_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the combined context of the items
embeddings = retrieval_model.encode(df['context'].tolist())

# Load the T5 model and tokenizer for answer generation
generator_model = T5ForConditionalGeneration.from_pretrained('t5-base')
generator_tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Retrieve the most relevant document
def retrieve_document(query):
    query_embedding = retrieval_model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)
    top_index = similarities[0].argmax()  # Get the index of the most similar document
    return df['context'].iloc[top_index]  # Return the full context of the best match

# Generate an answer using the T5 model
def generate_answer(query, retrieved_doc):
    input_text = f"question: {query} context: {retrieved_doc}"
    input_ids = generator_tokenizer.encode(input_text, return_tensors='pt', truncation=True)

    with torch.no_grad():  # Save memory during inference
        output_ids = generator_model.generate(input_ids, max_length=50)

    return generator_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Main function to answer a query
def answer_query(query):
    retrieved_doc = retrieve_document(query)
    answer = generate_answer(query, retrieved_doc)
    return answer

# Interactive loop for user input
def interactive_rag_system():
    print("Welcome to the Interactive RAG System!")
    while True:
        user_input = input("Enter your query (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        else:
            try:
                answer = answer_query(user_input)
                print(f"Answer: {answer}")
            except Exception as e:
                print(f"An error occurred: {e}")

# Run the Interactive RAG System
if __name__ == "__main__":
    interactive_rag_system()


                            Item  Calories  Calories from Fat  Total Fat (g)  \
0                      Hamburger       250               80.0            9.0   
1                   Cheeseburger       300              110.0           12.0   
2            Double Cheeseburger       440              210.0           23.0   
3                       McDouble       390              170.0           19.0   
4   Quarter Pounder® with Cheese       510              230.0           26.0   

  Saturated Fat (g)  Trans Fat (g)  Cholesterol (mg)  Sodium (mg)  Carbs (g)  \
0               3.5            0.5                25          520         31   
1                 6            0.5                40          750         33   
2                11            1.5                80         1150         34   
3                 8            1.0                65          920         33   
4                12            1.5                90         1190         40   

   Fiber (g)  Sugars (g)  Protein (g) 

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Welcome to the Interactive RAG System!
Enter your query (or type 'exit' to quit): How much protein does a Quarter Pounder with Cheese have?


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Answer: 29g
Enter your query (or type 'exit' to quit): Tell me about the nutrition facts for a McDouble.
Answer: 390 calories, 19.0g total fat, 22g protein
Enter your query (or type 'exit' to quit): exit
