In [1]:
!pip install rdflib



In [2]:
!pip install sentence-transformers faiss-cpu transformers

Note: you may need to restart the kernel to use updated packages.


In [37]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import rdflib
from rdflib import Namespace
from rdflib.namespace import RDF, RDFS
import json

In [4]:
# issues in the knowledge graph as of now
"""
1. Missing ingredient details 
2. Missing flavor molecule details
"""

'\n1. Missing ingredient details \n2. Missing flavor molecule details\n'

In [5]:
DBO = Namespace("https://cosylab.iiitd.edu.in/foodontology/")
SCHEMA = Namespace("http://schema.org/")

g = rdflib.Graph()
g.parse("updated_foodontology_with_instances.rdf")

def get_label_for_uri(uri, graph):
    """
    Finds a human-readable rdfs:label for a URI. If none exists, it
    falls back to cleaning the URI string itself.
    """
    if not isinstance(uri, rdflib.term.URIRef):
        return str(uri)
    label = graph.value(subject=uri, predicate=RDFS.label)
    if label:
        return str(label)
    if '#' in str(uri):
        return str(uri).split('#')[-1]
    return str(uri).split('/')[-1].replace('_', ' ')

recipe_entities = {s for s, p, o in g if p == RDF.type and o == DBO.Recipe}

ingredient_entities = {s for s, p, o in g if p == DBO.ingredientID}

print(f"Found {len(recipe_entities)} recipes and {len(ingredient_entities)} ingredients.")

recipe_documents = []
ingredient_documents = []
ingredient_ids = []

for entity in recipe_entities:
    document_parts = []
    entity_label = get_label_for_uri(entity, g)

    for s, p, o in g.triples((entity, None, None)):
        subj_text = entity_label
        pred_text = get_label_for_uri(p, g)
        obj_text = get_label_for_uri(o, g)
        document_parts.append(f"{subj_text} {pred_text} {obj_text}.")

        if p == SCHEMA.recipeInstructions and isinstance(o, rdflib.term.URIRef):
            description_text = g.value(subject=o, predicate=RDFS.comment)
            if description_text:
                document_parts.append(f"{subj_text} hasFullDescription \"{str(description_text)}\".")

    recipe_documents.append(" ".join(document_parts))

for entity in ingredient_entities:
    document_parts = []
    entity_label = get_label_for_uri(entity, g)

    ing_id = g.value(subject=entity, predicate=DBO.ingredientID)
    if ing_id:
        ingredient_ids.append(str(ing_id))

    for s, p, o in g.triples((entity, None, None)):
        subj_text = entity_label
        pred_text = get_label_for_uri(p, g)
        obj_text = get_label_for_uri(o, g)
        document_parts.append(f"{subj_text} {pred_text} {obj_text}.")
    
    ingredient_documents.append(" ".join(document_parts))

if recipe_documents:
    print("\n--- Generated Recipe Document Sample ---")
    print(f"- {recipe_documents[0]}\n")

if ingredient_documents:
    print("\n--- Generated Ingredient Document Sample ---")
    print(f"- {ingredient_documents[0]}\n")
    if len(ingredient_documents) > 1:
        print(f"- {ingredient_documents[1]}\n")

if not recipe_documents and not ingredient_documents:
    print("No recipe or ingredient documents were generated. Please check your namespaces.")

Found 118083 recipes and 466 ingredients.

--- Generated Recipe Document Sample ---
- Ratatouille Stuffed Red Bell Peppers type Recipe. Ratatouille Stuffed Red Bell Peppers recipeID 132655. Ratatouille Stuffed Red Bell Peppers label Ratatouille Stuffed Red Bell Peppers. Ratatouille Stuffed Red Bell Peppers recipeInstructions Description 132655. Ratatouille Stuffed Red Bell Peppers hasFullDescription "['Preheat the oven to 400Â°f. Halve 4 of the peppers lengthways, de seed and place on a large baking tray. Cook the rice in boiling water according to pack instructions then drain. Meanwhile, fry the onion in 1 tbsp oil for 5 minutes finely dice the remaining pepper and add to the onions with the zucchini and cook for 2 3 minutes stir in the tomatoes, season well and cook for a further 5 minutes. Stir the rice into the vegetable mixture and spoon into the halved peppers. Top with the breadcrumbs and bake for 15 20 mins until browned. Serve with a fresh green salad.']". Ratatouille Stuffed 

In [13]:
all_documents = recipe_documents + ingredient_documents

metadata = (
    [{'type': 'recipe', 'source_index': i} for i in range(len(recipe_documents))] +
    [{'type': 'ingredient', 'source_index': i} for i in range(len(ingredient_documents))]
)

print(f"Total documents to be indexed: {len(all_documents)}")

Total documents to be indexed: 118549


In [21]:
model = SentenceTransformer('all-MiniLM-L6-v2') 
batch_size = 1024
all_embeddings = []
print(f"\nGenerating embeddings in batches of {batch_size}...")

for i in range(0, len(all_documents), batch_size):
    batch = all_documents[i : i + batch_size]
    batch_embeddings = model.encode(batch, show_progress_bar=False)
    all_embeddings.extend(batch_embeddings)
    print(f"  Processed {i + len(batch)} / {len(all_documents)} documents")

doc_embeddings = np.array(all_embeddings)
print(f"\nEmbeddings generated successfully. Shape: {doc_embeddings.shape}")


Generating embeddings in batches of 1024...
  Processed 1024 / 118549 documents
  Processed 2048 / 118549 documents
  Processed 3072 / 118549 documents
  Processed 4096 / 118549 documents
  Processed 5120 / 118549 documents
  Processed 6144 / 118549 documents
  Processed 7168 / 118549 documents
  Processed 8192 / 118549 documents
  Processed 9216 / 118549 documents
  Processed 10240 / 118549 documents
  Processed 11264 / 118549 documents
  Processed 12288 / 118549 documents
  Processed 13312 / 118549 documents
  Processed 14336 / 118549 documents
  Processed 15360 / 118549 documents
  Processed 16384 / 118549 documents
  Processed 17408 / 118549 documents
  Processed 18432 / 118549 documents
  Processed 19456 / 118549 documents
  Processed 20480 / 118549 documents
  Processed 21504 / 118549 documents
  Processed 22528 / 118549 documents
  Processed 23552 / 118549 documents
  Processed 24576 / 118549 documents
  Processed 25600 / 118549 documents
  Processed 26624 / 118549 documents
  

In [23]:
d = doc_embeddings.shape[1] 
index = faiss.IndexFlatL2(d)
index.add(np.array(doc_embeddings, dtype=np.float32))
print(f"\nFAISS index created with {index.ntotal} vectors.")


FAISS index created with 118549 vectors.


In [39]:
# saving the vector database
faiss.write_index(index, "faiss_index.index")
with open("all_documents.json", "w") as f:
    json.dump(all_documents, f)
print("FAISS index and documents have been saved successfully.")

FAISS index and documents have been saved successfully.


In [31]:
llm = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

def answer_query(query, k=3):
    """
    Retrieves the most relevant documents and uses them to answer a query.
    """
    # 1. RETRIEVE
    # Embed the user's query
    query_embedding = model.encode([query])
    
    # Search the FAISS index for the top-k most similar document embeddings
    distances, indices = index.search(np.array(query_embedding, dtype=np.float32), k)
    
    # Get the corresponding text documents
    retrieved_docs_text = [all_documents[i] for i in indices[0]]
    
    print(f"\n--- Query: '{query}' ---")
    print(f"Retrieved {len(retrieved_docs_text)} documents.")
    for doc in retrieved_docs_text:
        print(doc);
        print("\n")

    # 2. AUGMENT
    # Combine the retrieved documents into a single context string
    context = " ".join(retrieved_docs_text)
    
    # 3. GENERATE
    # Pass the context and question to the LLM
    result = llm(question=query, context=context)
    
    return result['answer']

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [41]:
complex_query = "What is the country of origin for Cafe Rio-Like Pork?"
answer = answer_query(complex_query)

print("\n--- Final Answer ---")
print(answer)


--- Query: 'What is the country of origin for Cafe Rio-Like Pork?' ---
Retrieved 3 documents.
Cafe Rio Pork type Recipe. Cafe Rio Pork recipeID 81706. Cafe Rio Pork label Cafe Rio Pork. Cafe Rio Pork recipeInstructions Description 81706. Cafe Rio Pork hasFullDescription "['Roast in crock pot for 5 6 hours. Shred meat. Cook one more hour. Serve with cafe rio beans, rice, and tomatillo dressing.']". Cafe Rio Pork schema containedIn latin american. Cafe Rio Pork schema containedIn mexican. Cafe Rio Pork schema countryOfOrigin mexican.


Cafe Rio-Like Pork (Copycat) type Recipe. Cafe Rio-Like Pork (Copycat) recipeID 81600. Cafe Rio-Like Pork (Copycat) label Cafe Rio-Like Pork (Copycat). Cafe Rio-Like Pork (Copycat) recipeInstructions Description 81600. Cafe Rio-Like Pork (Copycat) hasFullDescription "['Spray crock pot. Add pork roast. Mix remaining ingredients and pour over top. Cook in crock pot on low for 10 hours. Can add more beef broth if needed. Shred with fork.']". Cafe Rio-Like Po