## HomeMatch App

In [53]:
from pathlib import Path
import json, os, time
from typing import List
import json, time
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.schema.output_parser import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory


In [2]:
import os
from dotenv import load_dotenv
load_dotenv() #load all the env variables

os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [17]:
# --- 1. model --------------------------------------------------------------
llm_groq = ChatGroq(
    model_name="gemma2-9b-it",  # or another Groq model
    temperature=0.8,
    max_tokens=512   # plenty for one JSON listing
)

llm_groq.invoke("Hello")

AIMessage(content='Hello! How can I help you today? 😊\n', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 13, 'prompt_tokens': 11, 'total_tokens': 24, 'completion_time': 0.023636364, 'prompt_time': 0.002073959, 'queue_time': 0.16197118800000002, 'total_time': 0.025710323}, 'model_name': 'gemma2-9b-it', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None}, id='run--a51854b8-6c96-4c7e-a7c9-6d8dd952d4ae-0', usage_metadata={'input_tokens': 11, 'output_tokens': 13, 'total_tokens': 24})

In [7]:
#Load the listings
# 2. -------------- Load listings.json -------------------------------------
LISTING_FILE = "./listings/listings.json"

with open(LISTING_FILE, "r") as fp:
    raw_listings = json.load(fp) 

raw_listings[0]

{'neighborhood': 'Fairview',
 'price': 425000,
 'bedrooms': 4,
 'bathrooms': 2,
 'house_size': '2400 sqft',
 'description': 'Nestled in the heart of Fairview, this charming craftsman-style home boasts a warm and inviting atmosphere. The open-concept living area features soaring ceilings, large windows, and a cozy gas fireplace. The sleek and modern kitchen is equipped with high-end appliances and ample storage. The expansive master suite includes a spacious walk-in closet and an en-suite bathroom with separate shower and tub. The fully fenced backyard is perfect for outdoor entertaining and features a covered patio, lush greenery, and a sparkling saltwater pool.',
 'neighborhood_description': 'Fairview is a highly sought-after neighborhood known for its vibrant community, excellent schools, and convenient access to downtown. Residents enjoy a short walk to the local park, playground, and community center. The neighborhood is also close to several popular restaurants, shops, and enterta

In [8]:
#Loading an embedding model from huggingface
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cpu"}  # or "cuda" if GPU is available
)
embedding = embedding_model.embed_query("I am looking for a wonderful house in a cozy neighboorhood")
len(embedding)

768

In [43]:
# -----------------------------------------
# 1.  Transform raw_listings → Documents
# -----------------------------------------
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
import os, json

docs: list[Document] = []

for item in raw_listings:
    # Combine the prose fields into the text that will be embedded
    text_block = "\n\n".join(
        [
            item.get("description", "").strip(),
            item.get("neighborhood_description", "").strip(),
        ]
    ).strip()

    # Everything else becomes structured metadata for filtering/ranking later
    metadata = {k: v for k, v in item.items() if k not in ("description", "neighborhood_description")}

    docs.append(Document(page_content=text_block, metadata=metadata))

print(f"✅ Converted {len(docs)} listings to LangChain Documents")

# -----------------------------------------
# 2.  Create (or reopen) the Chroma store
# -----------------------------------------
PERSIST_DIR = "db/listings_chroma"
os.makedirs(PERSIST_DIR, exist_ok=True)

vector_store = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,      # <-- you said this already exists
    collection_name="listings",
    persist_directory=PERSIST_DIR,
)

vector_store.persist()             # writes the DB files to disk
print(f"💾 Stored & persisted to {PERSIST_DIR}")


✅ Converted 48 listings to LangChain Documents
💾 Stored & persisted to db/listings_chroma


  vector_store.persist()             # writes the DB files to disk


In [46]:
# -------------------------------------------------
# Create a retriever from the existing Chroma store
# -------------------------------------------------
retriever = vector_store.as_retriever(
    search_kwargs={
        "k": 4          # number of listings to return on each query (tweak as you like)
        # you can also add a metadata filter later, e.g.:
        # "filter": {"bedrooms": 3}
    }
)

retriever.invoke("a nice house with 3 bedrooms, 3000 sqft in a cozy neighboorhood that does not cost more than 500000")

[Document(metadata={'price': 425000, 'bathrooms': 3, 'house_size': '2700 sqft', 'neighborhood': 'Riverview', 'bedrooms': 4}, page_content="Welcome to this stunning 4-bedroom, 3-bathroom home located in the desirable Riverview neighborhood. As you step inside, you'll be greeted by the warm and inviting open floor plan, perfect for entertaining guests. The spacious living room features a gas fireplace and large windows that let in an abundance of natural light. The kitchen is equipped with high-end appliances and ample counter space for food preparation. The master bedroom boasts a vaulted ceiling, walk-in closet, and en-suite bathroom with separate shower and tub. The backyard is a tranquil oasis, complete with a private patio and mature trees.\n\nRiverview is a charming neighborhood with a mix of established homes and new developments. Residents enjoy easy access to the nearby riverfront, where they can stroll along the trails or enjoy a picnic. The neighborhood is also within walking 

In [89]:
from pydantic import BaseModel, Field
from typing import Optional, List
from langchain_core.runnables import RunnableLambda

# Defining a pydantic output parser for our query cleaning LLM
class BuyerPreferences(BaseModel):
    bedrooms: Optional[int] = Field(None, description="Number of bedrooms")
    bathrooms: Optional[int] = Field(None, description="Number of bathrooms")
    house_size: Optional[str] = Field(None, description="Desired house size (e.g. '2000 sqft')")
    amenities: Optional[List[str]] = Field(None, description="Desired amenities (e.g. backyard, solar panels)")
    transportation: Optional[List[str]] = Field(None, description="Transportation preferences (e.g. bike paths, public transit)")
    neighborhood_traits: Optional[List[str]] = Field(None, description="Neighborhood qualities (e.g. quiet, walkable)")
    price_range: Optional[str] = Field(None, description="Approximate price range or budget")
    lifestyle: Optional[str] = Field(None, description="Lifestyle fit, e.g. remote work, family-friendly")
    query: str = Field(..., description="One concise summary sentence combining the above preferences, the summary should be optimize for similarity search in avector database")

# Define the parser
cleaning_parser = PydanticOutputParser(pydantic_object=BuyerPreferences)

#wrap the function in a runnableLambda
extract_query = RunnableLambda(lambda prefs: {"input" : prefs.query})

# Bind the parser to the LLM
cleaning_llm = llm_groq.with_structured_output(BuyerPreferences)


In [90]:
#Defining the system Prompt for the cleaning LLM
system_prompt = """
You are a helpful assistant for a real estate matching app.

Your task is to extract the buyer’s home preferences from natural language
and return them as a structured JSON object.

Return ONLY a valid JSON object matching the following fields:

- bedrooms: integer (optional)
- bathrooms: integer (optional)
- house_size: string (e.g., "2000 sqft")
- amenities: array of strings (e.g., ["backyard", "solar panels"])
- transportation: array of strings (e.g., ["bike paths", “public transit”])
- neighborhood_traits: array of strings (e.g., ["quiet", "family-friendly"])
- price_range: string (e.g., "under $500,000")
- lifestyle: string (e.g., "remote work")
- summary: one clear sentence (< 40 words) summarizing all preferences

If the user doesn’t mention a field, set it to null or an empty list (for arrays).
"""

query_cleaner_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt.strip()),
    ("human", "{raw_query}")
])
query_cleaner_prompt.invoke({"raw_query":"a nice house with 3 bedrooms, 3000 sqft in a cozy neighboorhood that does not cost more than 500000"})

ChatPromptValue(messages=[SystemMessage(content='You are a helpful assistant for a real estate matching app.\n\nYour task is to extract the buyer’s home preferences from natural language\nand return them as a structured JSON object.\n\nReturn ONLY a valid JSON object matching the following fields:\n\n- bedrooms: integer (optional)\n- bathrooms: integer (optional)\n- house_size: string (e.g., "2000 sqft")\n- amenities: array of strings (e.g., ["backyard", "solar panels"])\n- transportation: array of strings (e.g., ["bike paths", “public transit”])\n- neighborhood_traits: array of strings (e.g., ["quiet", "family-friendly"])\n- price_range: string (e.g., "under $500,000")\n- lifestyle: string (e.g., "remote work")\n- summary: one clear sentence (< 40 words) summarizing all preferences\n\nIf the user doesn’t mention a field, set it to null or an empty list (for arrays).', additional_kwargs={}, response_metadata={}), HumanMessage(content='a nice house with 3 bedrooms, 3000 sqft in a cozy n

In [91]:
#defining the user query cleaning chain
query_cleaning_chain = query_cleaner_prompt | cleaning_llm | extract_query

#testing the chain
query_cleaning_chain.invoke(
    {"raw_query": "I'd like a modern 3-bedroom around 2000 sqft, solar panels, "
                  "backyard, quiet neighborhood, near public transit. Budget about $600k."}
)

{'input': 'Looking for a modern 3-bedroom house around 2000 sqft with solar panels, a backyard, and located in a quiet neighborhood near public transit. Budget is around $600k.'}

In [92]:
#defining the rag prompt template
from langchain_core.prompts import MessagesPlaceholder

rag_prompt = ChatPromptTemplate.from_messages([
    ("system",
     """You are HomeMatch, an expert real-estate assistant helping buyers find ideal homes based on their preferences.

You will be given:
- A structured summary of the buyer's preferences (in natural language)
- A set of real estate listings (retrieved for semantic similarity)

Your task:
- Recommend the top 3 listings that best align with the buyer's needs
- Highlight the matching features in your explanation (e.g., size, amenities, location)
- Be concise, persuasive, and grounded in the listings provided

Only use information found in the listings. Do not invent properties or add extra features.

"Listings:\n{context}",
"""
),
    ("human", "{input}"),
])


In [None]:
# ✨ NEW imports
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.runnables import RunnableLambda

# 1️⃣  combine the retrieved docs + prompt + model
combine_docs_chain = create_stuff_documents_chain(
    llm=llm_groq,
    prompt=rag_prompt          
)

# 2️⃣  wire the retriever and the doc-combining chain together
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain
)

full_chain = (query_cleaning_chain | rag_chain)


In [97]:
#invocation
results = full_chain.invoke({"raw_query": "I'd like a modern 3-bedroom around 2000 sqft, solar panels, "
                  "backyard, quiet neighborhood, near public transit. Budget about $600k."})
results

{'input': 'A modern 3-bedroom house around 2000 sqft with solar panels, a backyard, and located near public transit in a quiet neighborhood with a budget of about $600k',
 'context': [Document(metadata={'bedrooms': 4, 'price': 1250000, 'bathrooms': 3, 'house_size': '3500 sqft', 'neighborhood': 'Downtown Luxury'}, page_content="Welcome to this stunning, modern townhome located in the heart of Downtown Luxury. With its sleek, industrial-chic design, this property exudes sophistication and elegance. The open-concept living area is perfect for entertaining, with floor-to-ceiling windows offering breathtaking views of the city skyline. The gourmet kitchen features high-end appliances and ample counter space for food preparation. The master suite boasts a spa-like en-suite bathroom and a spacious walk-in closet. Two additional bedrooms and a bonus room offer ample space for relaxation and play. This property is a must-see for anyone looking to live in the epicenter of the city's vibrant cult

In [98]:
#nice display of the results
from IPython.display import display, Markdown

def render_results(results):
    display(Markdown("### 🏡 Top Matching Listings"))
    
    for i, doc in enumerate(results["context"], start=1):
        meta = doc.metadata
        card = f"""
**Listing {i}**
- 📍 Neighborhood: `{meta.get('neighborhood', 'N/A')}`
- 🛏 Bedrooms: `{meta.get('bedrooms', 'N/A')}`
- 🛁 Bathrooms: `{meta.get('bathrooms', 'N/A')}`
- 📐 Size: `{meta.get('house_size', 'N/A')}`
- ☀️ Price: `${meta.get('price', 'N/A'):,}`

---
"""
        display(Markdown(card))

    display(Markdown("### 🤖 AI Summary"))
    display(Markdown(f"> {results['answer']}"))

render_results(results)

### 🏡 Top Matching Listings


**Listing 1**
- 📍 Neighborhood: `Downtown Luxury`
- 🛏 Bedrooms: `4`
- 🛁 Bathrooms: `3`
- 📐 Size: `3500 sqft`
- ☀️ Price: `$1,250,000`

---



**Listing 2**
- 📍 Neighborhood: `Riverview`
- 🛏 Bedrooms: `4`
- 🛁 Bathrooms: `3`
- 📐 Size: `2700 sqft`
- ☀️ Price: `$425,000`

---



**Listing 3**
- 📍 Neighborhood: `Oakwood Hills`
- 🛏 Bedrooms: `4`
- 🛁 Bathrooms: `2`
- 📐 Size: `2500 sqft`
- ☀️ Price: `$425,000`

---



**Listing 4**
- 📍 Neighborhood: `Fairview`
- 🛏 Bedrooms: `4`
- 🛁 Bathrooms: `2`
- 📐 Size: `2400 sqft`
- ☀️ Price: `$425,000`

---


### 🤖 AI Summary

> Based on your preferences for a modern 3-bedroom house around 2000 sqft with solar panels, a backyard, and convenient public transit in a quiet neighborhood,  here are the top 3 recommendations:

1. **Riverview Home:** This 4-bedroom, 3-bathroom home in the desirable Riverview neighborhood comes closest to matching your criteria. While it's slightly larger than 2000 sqft, it offers a spacious living room, gas fireplace, large windows for natural light, high-end appliances, a master suite with a vaulted ceiling, walk-in closet, and en-suite bathroom, and a tranquil backyard with a private patio and mature trees. Riverview is described as a charming neighborhood with a mix of established homes and new developments, offering a balance of community and convenience. 

2. **Oakwood Hills Home:**  This ranch-style home in the sought-after Oakwood Hills neighborhood could be a great fit. It features a spacious open floor plan with a large living room, dining area, and modern kitchen, as well as a master suite with a vaulted ceiling, walk-in closet, and en-suite bathroom.  While the listing doesn't mention solar panels, it does highlight a community pool and playground, suggesting a family-friendly and active neighborhood.

3. **Fairview Craftsman-Style Home:** This charming craftsman home in Fairview boasts a warm and inviting atmosphere with a spacious master suite, walk-in closet, and en-suite bathroom. Although the listing doesn't specify the square footage, the open-concept living area, soaring ceilings, and cozy gas fireplace suggest a comfortable and modern design.  Fairview is known for its vibrant community, excellent schools, and convenient access to downtown. The fully fenced backyard with a covered patio, lush greenery, and a sparkling saltwater pool could be a major selling point.



Remember, these are just suggestions based on the provided information.  

