## HomeMatch App

In [1]:
from pathlib import Path
import json, os, time
from typing import List
import json, time
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.schema.output_parser import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory

In [2]:
import os
from dotenv import load_dotenv
load_dotenv() #load all the env variables

os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [4]:
# --- 1. model --------------------------------------------------------------
llm_groq = ChatGroq(
    model_name="gemma2-9b-it",  # or another Groq model
    temperature=0.8,
    max_tokens=512   # plenty for one JSON listing
)

llm_groq.invoke("Hello")

AIMessage(content='Hello! 👋  How can I help you today? 😊\n', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 11, 'total_tokens': 26, 'completion_time': 0.027272727, 'prompt_time': 0.0019656, 'queue_time': 0.16043023499999998, 'total_time': 0.029238327}, 'model_name': 'gemma2-9b-it', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None}, id='run--5c61af66-5a75-4579-818d-93c465c3b3b4-0', usage_metadata={'input_tokens': 11, 'output_tokens': 15, 'total_tokens': 26})

In [6]:
#Load the listings
# 2. -------------- Load listings.json -------------------------------------
LISTING_FILE = "./listings.json"

with open(LISTING_FILE, "r") as fp:
    raw_listings = json.load(fp) 

raw_listings[0]

{'neighborhood': 'Oakwood Estates',
 'price': 650000,
 'bedrooms': 4,
 'bathrooms': 3,
 'house_size': '2500 sqft',
 'description': "Welcome to your dream home! This beautifully renovated 4 bedroom, 3 bathroom residence offers modern elegance and spacious living. Enjoy the gourmet kitchen, perfect for entertaining, and relax in the tranquil master suite. The large backyard is an oasis, ideal for summer gatherings. Don't miss this opportunity to own a slice of paradise.",
 'neighborhood_description': 'Oakwood Estates is a peaceful and family-friendly community with tree-lined streets, top-rated schools, and convenient access to parks, shopping, and dining.  Enjoy a sense of community and a tranquil lifestyle in this beautiful neighborhood.'}

In [7]:
#Loading an embedding model from huggingface
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cpu"}  # or "cuda" if GPU is available
)
embedding = embedding_model.embed_query("I am looking for a wonderful house in a cozy neighboorhood")
len(embedding)

  embedding_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


768

In [None]:
# -----------------------------------------
# 1.  Transform raw_listings → Documents
# -----------------------------------------
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
import os, json

docs: list[Document] = []

for item in raw_listings:
    # Combine the prose fields into the text that will be embedded
    text_block = "\n\n".join(
        [
            item.get("description", "").strip(),
            item.get("neighborhood_description", "").strip(),
        ]
    ).strip()

    # Everything else becomes structured metadata for filtering/ranking later
    metadata = {k: v for k, v in item.items() if k not in ("description", "neighborhood_description")}

    docs.append(Document(page_content=text_block, metadata=metadata))

print(f"✅ Converted {len(docs)} listings to LangChain Documents")

# -----------------------------------------
# 2.  Create (or reopen) the Chroma store
# -----------------------------------------
PERSIST_DIR = "db/listings_chroma"
os.makedirs(PERSIST_DIR, exist_ok=True)

vector_store = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,      
    collection_name="listings",
    persist_directory=PERSIST_DIR,
)

vector_store.persist()             # writes the DB files to disk
print(f"💾 Stored & persisted to {PERSIST_DIR}")

✅ Converted 50 listings to LangChain Documents
💾 Stored & persisted to db/listings_chroma


  vector_store.persist()             # writes the DB files to disk


In [9]:
# -------------------------------------------------
# Create a retriever from the existing Chroma store
# -------------------------------------------------
retriever = vector_store.as_retriever(
    search_kwargs={
        "k": 4          # number of listings to return on each query (tweak as you like)
        # you can also add a metadata filter later, e.g.:
        # "filter": {"bedrooms": 3}
    }
)

retriever.invoke("a nice house with 3 bedrooms, 3000 sqft in a cozy neighboorhood that does not cost more than 500000")

[Document(metadata={'house_size': '2500 sqft', 'price': 525000, 'neighborhood': 'Parkview Estates', 'bathrooms': 3, 'bedrooms': 4}, page_content='This charming 4-bedroom, 3-bathroom home offers a spacious open floor plan perfect for entertaining.  The gourmet kitchen boasts stainless steel appliances and granite countertops. Enjoy relaxing evenings in the cozy living room with a fireplace. The master suite features a walk-in closet and private bath.  The backyard is an oasis with a deck and plenty of space for gardening.\n\nParkview Estates is a picturesque neighborhood known for its mature trees, winding streets, and friendly atmosphere. Residents enjoy easy access to parks, walking trails, and top-rated schools. This desirable location is also close to shopping, dining, and entertainment.'),
 Document(metadata={'price': 425000, 'bedrooms': 3, 'bathrooms': 2, 'house_size': '1800 sqft', 'neighborhood': 'Pleasant Valley'}, page_content='This charming 3-bedroom, 2-bathroom home offers a 

In [10]:
from pydantic import BaseModel, Field
from typing import Optional, List
from langchain_core.runnables import RunnableLambda

# Defining a pydantic output parser for our query cleaning LLM
class BuyerPreferences(BaseModel):
    bedrooms: Optional[int] = Field(None, description="Number of bedrooms")
    bathrooms: Optional[int] = Field(None, description="Number of bathrooms")
    house_size: Optional[str] = Field(None, description="Desired house size (e.g. '2000 sqft')")
    amenities: Optional[List[str]] = Field(None, description="Desired amenities (e.g. backyard, solar panels)")
    transportation: Optional[List[str]] = Field(None, description="Transportation preferences (e.g. bike paths, public transit)")
    neighborhood_traits: Optional[List[str]] = Field(None, description="Neighborhood qualities (e.g. quiet, walkable)")
    price_range: Optional[str] = Field(None, description="Approximate price range or budget")
    lifestyle: Optional[str] = Field(None, description="Lifestyle fit, e.g. remote work, family-friendly")
    query: str = Field(..., description="One concise summary sentence combining the above preferences, the summary should be optimize for similarity search in avector database")

# Define the parser
#cleaning_parser = PydanticOutputParser(pydantic_object=BuyerPreferences)

#wrap the function in a runnableLambda
extract_query = RunnableLambda(lambda prefs: {"input" : prefs.query})

# Bind the parser to the LLM
cleaning_llm = llm_groq.with_structured_output(BuyerPreferences)

In [27]:
#Loading an embedding model from huggingface
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cpu"}  # or "cuda" if GPU is available
)
embedding = embedding_model.embed_query("I am looking for a wonderful house in a cozy neighboorhood")
len(embedding)

768

In [None]:
#Load the listings
# 2. -------------- Load listings.json -------------------------------------
LISTING_FILE = "./listings.json"

with open(LISTING_FILE, "r") as fp:
    raw_listings = json.load(fp) 

In [14]:
from pydantic import BaseModel, Field
from typing import Optional, List
from langchain_core.runnables import RunnableLambda

# Defining a pydantic output parser for our query cleaning LLM
class BuyerPreferences(BaseModel):
    bedrooms: Optional[int] = Field(None, description="Number of bedrooms")
    bathrooms: Optional[int] = Field(None, description="Number of bathrooms")
    house_size: Optional[str] = Field(None, description="Desired house size (e.g. '2000 sqft')")
    amenities: Optional[List[str]] = Field(None, description="Desired amenities (e.g. backyard, solar panels)")
    transportation: Optional[List[str]] = Field(None, description="Transportation preferences (e.g. bike paths, public transit)")
    neighborhood_traits: Optional[List[str]] = Field(None, description="Neighborhood qualities (e.g. quiet, walkable)")
    price_range: Optional[str] = Field(None, description="Approximate price range or budget")
    lifestyle: Optional[str] = Field(None, description="Lifestyle fit, e.g. remote work, family-friendly")
    query: str = Field(..., description="One concise summary sentence combining the above preferences, the summary should be optimize for similarity search in avector database")

# Define the parser
#cleaning_parser = PydanticOutputParser(pydantic_object=BuyerPreferences)

#wrap the function in a runnableLambda
extract_query = RunnableLambda(lambda prefs: {"input" : prefs.query})

# Bind the parser to the LLM
cleaning_llm = llm_groq.with_structured_output(BuyerPreferences)

cleaning_llm.invoke("a nice house with 3 bedrooms, 3000 sqft in a cozy neighboorhood that does not cost more than 500000")


BuyerPreferences(bedrooms=3, bathrooms=3, house_size='3000 sqft', amenities=[], transportation=[], neighborhood_traits=['cozy'], price_range='500000', lifestyle=None, query='Looking for a cozy 3 bedroom, 3000 sqft house in a neighborhood that does not cost more than 500,000.')

In [12]:
#Defining the system Prompt for the cleaning LLM
system_prompt = """
You are a helpful assistant for a real estate matching app.

Your task is to extract the buyer’s home preferences from natural language
and return them as a structured JSON object.

Return ONLY a valid JSON object matching the following fields:

- bedrooms: integer (optional)
- bathrooms: integer (optional)
- house_size: string (e.g., "2000 sqft")
- amenities: array of strings (e.g., ["backyard", "solar panels"])
- transportation: array of strings (e.g., ["bike paths", “public transit”])
- neighborhood_traits: array of strings (e.g., ["quiet", "family-friendly"])
- price_range: string (e.g., "under $500,000")
- lifestyle: string (e.g., "remote work")
- summary: one clear sentence (< 40 words) summarizing all preferences

If the user doesn’t mention a field, set it to null or an empty list (for arrays).
"""

query_cleaner_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt.strip()),
    ("human", "{raw_query}")
])
query_cleaner_prompt.invoke({"raw_query":"a nice house with 3 bedrooms, 3000 sqft in a cozy neighboorhood that does not cost more than 500000"})

ChatPromptValue(messages=[SystemMessage(content='You are a helpful assistant for a real estate matching app.\n\nYour task is to extract the buyer’s home preferences from natural language\nand return them as a structured JSON object.\n\nReturn ONLY a valid JSON object matching the following fields:\n\n- bedrooms: integer (optional)\n- bathrooms: integer (optional)\n- house_size: string (e.g., "2000 sqft")\n- amenities: array of strings (e.g., ["backyard", "solar panels"])\n- transportation: array of strings (e.g., ["bike paths", “public transit”])\n- neighborhood_traits: array of strings (e.g., ["quiet", "family-friendly"])\n- price_range: string (e.g., "under $500,000")\n- lifestyle: string (e.g., "remote work")\n- summary: one clear sentence (< 40 words) summarizing all preferences\n\nIf the user doesn’t mention a field, set it to null or an empty list (for arrays).', additional_kwargs={}, response_metadata={}), HumanMessage(content='a nice house with 3 bedrooms, 3000 sqft in a cozy n

In [15]:
#defining the user query cleaning chain
query_cleaning_chain = query_cleaner_prompt | cleaning_llm | extract_query

#testing the chain
query_cleaning_chain.invoke(
    {"raw_query": "I'd like a modern 3-bedroom around 2000 sqft, solar panels, "
                  "backyard, quiet neighborhood, near public transit. Budget about $600k."}
)

{'input': 'modern 3-bedroom house around 2000 sqft with solar panels and a backyard in a quiet neighborhood near public transit for around $600k'}

In [16]:
#defining the rag prompt template
rag_prompt = ChatPromptTemplate.from_messages([
    ("system",
     """You are HomeMatch, an expert real-estate assistant helping buyers find ideal homes based on their preferences.

You will be given:
- A structured summary of the buyer's preferences (in natural language)
- A set of real estate listings (retrieved for semantic similarity)

Your task:
- Recommend the top 3 listings that best align with the buyer's needs
- Highlight the matching features in your explanation (e.g., size, amenities, location)
- Be concise, persuasive, and grounded in the listings provided

Only use information found in the listings. Do not invent properties or add extra features.

"Listings:\n{context}",
"""
),
    ("human", "{input}"),
])

In [17]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# 1️⃣  combine the retrieved docs + prompt + model
combine_docs_chain = create_stuff_documents_chain(
    llm=llm_groq,
    prompt=rag_prompt          
)

# 2️⃣  wire the retriever and the doc-combining chain together
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain
)

full_chain = (query_cleaning_chain | rag_chain)

In [18]:
#invocation
results = full_chain.invoke({"raw_query": "I'd like a modern 3-bedroom around 2000 sqft, solar panels, "
                  "backyard, quiet neighborhood, near public transit. Budget about $600k."})
results

{'input': 'looking for a modern 3-bedroom home around 2000 sqft with solar panels and a backyard in a quiet neighborhood near public transit, around $600k',
 'context': [Document(metadata={'bedrooms': 3, 'house_size': '1800 sqft', 'bathrooms': 2, 'price': 425000, 'neighborhood': 'Pleasant Valley'}, page_content='This charming 3-bedroom, 2-bathroom home offers a cozy and inviting atmosphere. Natural light floods through the spacious living room, perfect for relaxing evenings. The updated kitchen boasts modern appliances and ample counter space, ideal for culinary enthusiasts.  The backyard is an oasis of tranquility, perfect for outdoor entertaining. This home is conveniently located near parks, schools, and shopping, making it an ideal choice for families.\n\nNestled in the heart of the city, Pleasant Valley offers a blend of suburban charm and urban convenience.  Residents enjoy tree-lined streets, well-maintained parks, and a vibrant community atmosphere. With easy access to downtown

In [19]:
#nice display of the results
from IPython.display import display, Markdown

def render_results(results):
    display(Markdown("### 🏡 Top Matching Listings"))
    
    for i, doc in enumerate(results["context"], start=1):
        meta = doc.metadata
        card = f"""
**Listing {i}**
- 📍 Neighborhood: `{meta.get('neighborhood', 'N/A')}`
- 🛏 Bedrooms: `{meta.get('bedrooms', 'N/A')}`
- 🛁 Bathrooms: `{meta.get('bathrooms', 'N/A')}`
- 📐 Size: `{meta.get('house_size', 'N/A')}`
- ☀️ Price: `${meta.get('price', 'N/A'):,}`

---
"""
        display(Markdown(card))

    display(Markdown("### 🤖 AI Summary"))
    display(Markdown(f"> {results['answer']}"))

render_results(results)

### 🏡 Top Matching Listings


**Listing 1**
- 📍 Neighborhood: `Pleasant Valley`
- 🛏 Bedrooms: `3`
- 🛁 Bathrooms: `2`
- 📐 Size: `1800 sqft`
- ☀️ Price: `$425,000`

---



**Listing 2**
- 📍 Neighborhood: `Westchester`
- 🛏 Bedrooms: `4`
- 🛁 Bathrooms: `3`
- 📐 Size: `2500 sqft`
- ☀️ Price: `$550,000`

---



**Listing 3**
- 📍 Neighborhood: `Westside`
- 🛏 Bedrooms: `4`
- 🛁 Bathrooms: `3`
- 📐 Size: `2500 sqft`
- ☀️ Price: `$550,000`

---



**Listing 4**
- 📍 Neighborhood: `Westwood`
- 🛏 Bedrooms: `4`
- 🛁 Bathrooms: `3`
- 📐 Size: `2500 sqft`
- ☀️ Price: `$850,000`

---


### 🤖 AI Summary

> Based on your preferences, here are the top 3 listings that best align with your needs:

1. **Charming 3-bedroom, 2-bathroom home:** This home checks many boxes! It has 3 bedrooms and boasts a spacious living room, perfect for a modern feel. While the listing doesn't explicitly mention solar panels or the square footage, the "cozy and inviting atmosphere" suggests a comfortable living space, and the backyard is described as an "oasis of tranquility" – perfect for relaxation. Its convenient location near parks, schools, and shopping also aligns with your desire for a quiet neighborhood with access to amenities. 

2. **This charming 4-bedroom, 3-bathroom home:** Although it has 4 bedrooms, this home's spacious open floor plan and modern kitchen with stainless steel appliances could easily accommodate your needs. The listing highlights a "private backyard oasis," fulfilling your desire for outdoor space. Its location in a quiet cul-de-sac further emphasizes the peaceful neighborhood you're looking for. 

3. **This charming 4-bedroom, 3-bathroom home in Westside:** This home offers a spacious open floor plan, a modern kitchen with stainless steel appliances, and a large backyard perfect for entertaining.  The listing also mentions the convenience of nearby parks, schools, and shopping, aligning with your preferences for a family-friendly neighborhood with access to amenities.



It's important to note that the listings don't explicitly mention solar panels or the exact square footage.  
