In [28]:
!pip install -q langchain langchain-openai openai chromadb gradio python-dotenv tiktoken langchain-community


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
!pip install --upgrade openai
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(base_url="https://openrouter.ai/api/v1"
                       ,api_key=openai_api_key)
print("Successfully connected to OpenRouter")
print(openai_api_key[:5])




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Successfully connected to OpenRouter
sk-or


In [30]:
# Let's import Langchain components
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQAWithSourcesChain

In [31]:
DATA_FILE_PATH = "eleven_madison_park_data.txt"
print(f"Data file path set to: {DATA_FILE_PATH}")

Data file path set to: eleven_madison_park_data.txt


In [32]:
# Let's load Eleven Madison Park Restaurant data, which has been scraped from their website
# The data is saved in "eleven_madison_park_data.txt", Langchain's TextLoader makes this easy to read
print(f"Attempting to load data from: {DATA_FILE_PATH}")

# Initialize the TextLoader with the file path and specify UTF-8 encoding
# Encoding helps handle various characters correctly
loader = TextLoader(DATA_FILE_PATH, encoding = "utf-8")

# Load the document(s) using TextLoader from LangChain, which loads the entire file as one Document object
raw_documents = loader.load()
print(f"Successfully loaded {len(raw_documents)} document(s).")


Attempting to load data from: eleven_madison_park_data.txt
Successfully loaded 1 document(s).


In [33]:
# Let's split the document into chunks
print("\nSplitting the loaded document into smaller chunks...")

# Let's initialize the splitter, which tries to split the document on common separators like paragraphs (\n\n),
# sentences (.), and spaces (' ').
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,  # Aim for chunks of about 1000 characters
                                               chunk_overlap = 150,)  # Each chunk overlaps with the previous by 150 characters

# Split the raw document(s) into smaller Document objects (chunks)
documents = text_splitter.split_documents(raw_documents)

# Check if splitting produced any documents
if not documents:
    raise ValueError("Error: Splitting resulted in zero documents. Check the input file and splitter settings.")
print(f"Document split into {len(documents)} chunks.")



Splitting the loaded document into smaller chunks...
Document split into 38 chunks.


In [34]:
documents

[Document(metadata={'source': 'eleven_madison_park_data.txt'}, page_content='Source: https://www.elevenmadisonpark.com/\nTitle: Eleven Madison Park\nContent:\nBook on Resy\n---END OF SOURCE---'),
 Document(metadata={'source': 'eleven_madison_park_data.txt'}, page_content='Source: https://www.elevenmadisonpark.com/careers\nTitle: Careers — Eleven Madison Park\nContent:'),
 Document(metadata={'source': 'eleven_madison_park_data.txt'}, page_content="Join Our Team Eleven Madison Park ▾ All Businesses Eleven Madison Park Clemente Bar Daniel Humm Hospitality Filter Categories Culinary Pastry Wine & Beverage Dining Room Office & Admin Other Job Types Full Time Part Time Compensation Salary Hourly Apply filters OPEN OPPORTUNITIES Staff Accountant - Part Time Eleven Madison Park Part Time • Hourly ($20 - $25) Host/Reservationist Eleven Madison Park Full Time • Hourly ($24) Sous Chef Eleven Madison Park Full Time • Salary ($72K - $75K) Pastry Cook Eleven Madison Park Full Time • Hourly ($18 - $2

In [35]:
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from openai import OpenAI
from langchain_chroma import Chroma

# Let's initialize our embeddings model. Note that we will use OpenAI's embedding model

print("Initializing free HuggingFace Embeddings model...")

class HuggingFaceEmbeddings(Embeddings):
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        return self.model.encode([text])[0].tolist()


# ✅ Free model load kar (fast + small)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vector_store = Chroma.from_documents(documents = documents, embedding = embeddings)

# Verify the number of items in the store
vector_count = vector_store._collection.count()
print(f"ChromaDB vector store created with {vector_count} items.")

if vector_count == 0:
    raise ValueError("Vector store creation resulted in 0 items. Check previous steps.")

Initializing free HuggingFace Embeddings model...
ChromaDB vector store created with 42 items.


In [41]:
# Let's retrieve the first chunk of stored data from the vector store
stored_data = vector_store._collection.get(include=["embeddings", "documents"], limit = 1)

# Display the results
print("First chunk text:\n", stored_data['documents'][0])
print("\nEmbedding vector:\n", stored_data['embeddings'][0])
print(f"\nFull embedding has {len(stored_data['embeddings'][0])} dimensions.")

First chunk text:
 Apple makes iPhones.

Embedding vector:
 [-2.77918037e-02  1.61824822e-02  5.63803725e-02  3.45895439e-03
 -1.85915790e-02 -5.31174429e-02  3.41965929e-02  7.62359565e-03
  7.41090998e-02  3.00534647e-02  6.53498247e-02  5.24698868e-02
 -8.22963845e-03  1.70048736e-02  1.13172010e-02 -1.97180975e-02
 -1.18543757e-02 -4.17266274e-04 -2.95869317e-02 -8.95853527e-03
  1.80197619e-02  1.28979888e-02  3.54621634e-02  3.24524119e-02
  8.27700421e-02  8.10092911e-02 -2.66854130e-02 -7.71054998e-02
  8.90258607e-03 -2.24060547e-02 -2.45940406e-02 -1.19534396e-02
  5.95315881e-02  6.48715794e-02 -2.99979094e-02 -5.67107536e-02
  4.72821631e-02  2.82537453e-02  4.30075601e-02 -5.20041659e-02
 -2.97093578e-02 -9.54102911e-03  3.47380787e-02  9.33602080e-02
 -1.89554449e-02  9.89151001e-03  5.01697361e-02  2.48636724e-03
  4.24956111e-03  1.75619032e-02  2.22722888e-02  3.42339277e-04
 -6.64007738e-02 -9.63471271e-03 -5.25200274e-04  2.70081256e-02
 -3.66409086e-02  7.32263364e-

In [42]:
# Let's perform a similarity search in our vector store
print("\n--- Testing Similarity Search in Vector Store ---")
test_query = "What different menus are offered?"
print(f"Searching for documents similar to: '{test_query}'")


# Perform a similarity search. 'k=2' retrieves the top 2 most similar chunks
try:
    similar_docs = vector_store.similarity_search(test_query, k = 2)
    print(f"\nFound {len(similar_docs)} similar documents:")

    # Display snippets of the retrieved documents and their sources
    for i, doc in enumerate(similar_docs):
        print(f"\n--- Document {i+1} ---")
        # Displaying the first 700 chars for brevity
        content_snippet = doc.page_content[:700].strip() + "..."
        source = doc.metadata.get("source", "Unknown Source")  # Get source from metadata
        print(f"Content Snippet: {content_snippet}")
        print(f"Source: {source}")

except Exception as e:
    print(f"An error occurred during similarity search: {e}")




--- Testing Similarity Search in Vector Store ---
Searching for documents similar to: 'What different menus are offered?'

Found 2 similar documents:

--- Document 1 ---
Content Snippet: FAQs We are located at 11 Madison Avenue, on the northeast corner of East 24th and Madison Avenue, directly across the street from Madison Square Park. We offer three menus, all 100% plant-based: Full Tasting Menu : An eight- to nine-course experience priced at $365 per guest. This menu typically lasts about two to three hours and features a mix of plated and communal dishes. 5-Course Menu : Priced at $285 per guest, this menu highlights selections from the Full Tasting Menu and lasts approximately two hours. Bar Tasting Menu : Available in our lounge for $225 per guest, this menu includes four to five courses and is designed to last around two hours. Note : These durations are estimates bas...
Source: eleven_madison_park_data.txt

--- Document 2 ---
Content Snippet: and ingredients. He and his team n