In [1]:
import re
import json
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import os
import time

# --- Configuration ---
# Checkpoint name handed to SentenceTransformer (the embedding/retrieval model).
MODEL_NAME = 'all-MiniLM-L6-v2'
# Checkpoint name handed to CrossEncoder (the reranking model).
RERANKER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
# Cleaned property records (JSON) read by the indexer and the search engine.
# Per the original note, this data is cleaned from your 'data (1).json' file.
CLEANED_DATA_PATH = 'cleaned_properties.json' 
# FAISS index file: written by index_properties, read back by SearchEngine.
INDEX_PATH = 'property_index_real.faiss'
# JSON file mapping FAISS row position -> property_id, saved with the index.
INDEX_MAP_PATH = 'index_to_prop_id_real.json'

# --- Module 1: Semantic Parser ---

# Shared price grammar: optional qualifier, a decimal amount, and a unit.
# Long unit spellings come before their prefixes and the trailing \b forces the
# whole unit word to match, so "2 crore" is consumed entirely (not just "2 cr")
# and "2 luxury" is not mistaken for "2 l". The amount must contain a digit,
# so a stray "." can never reach float().
_PRICE_PATTERN = re.compile(
    r'(?:\b(under|below|over|above|around)\s*)?'
    r'(\d+(?:\.\d+)?)\s*'
    r'(lakhs?|crores?|cr|l)\b',
    re.IGNORECASE,
)


def parse_price(query):
    """Parse price information from the query (e.g., under 60L, above 1.5 cr).

    Args:
        query: Free-text search query.

    Returns:
        ``{}`` when no price expression is found; otherwise a dict with a
        single key: ``'min'`` for "over"/"above", ``'max'`` for every other
        case ("under", "below", "around", or no qualifier). The value is the
        amount in rupees as an int.
    """
    match = _PRICE_PATTERN.search(query)
    if not match:
        return {}

    qualifier, amount_str, unit = match.groups()
    # Units are either lakh-like ("l", "lakh", "lakhs") or crore-like.
    multiplier = 10000000 if unit.lower().startswith('c') else 100000
    # round() instead of int(): 2.3 * 100000 is 229999.99999999997 in binary
    # floating point and truncation would silently lose a rupee.
    amount = round(float(amount_str) * multiplier)

    if qualifier and qualifier.lower() in ('over', 'above'):
        return {'min': amount}
    return {'max': amount}


def parse_query(query):
    """Parse a natural language query into a structured JSON-like dict.

    Recognised hard filters (bhk count, property type, rent/sale status and
    price) are extracted and removed from the text; what remains, minus a
    small stopword list, becomes the free-text semantic query.

    Args:
        query: Free-text search query.

    Returns:
        ``{"filters": {...}, "semantic_query": "..."}`` where ``filters`` may
        contain ``bhk`` (int), ``property_type`` (str), ``status``
        (``"rent"`` or ``"sale"``) and ``price`` (see ``parse_price``).
    """
    original_query = query
    filters = {}

    # Bedroom count; the optional plural and \b keep "3 bedrooms" from leaving
    # a dangling "s" behind in the semantic text.
    bhk_match = re.search(r'(\d+)\s*(?:bhk|bedrooms?|beds?)\b', query, re.IGNORECASE)
    if bhk_match:
        filters["bhk"] = int(bhk_match.group(1))
        query = query[:bhk_match.start()] + query[bhk_match.end():]

    # First property type (in list order) that appears wins; every mention of
    # that type is stripped from the text.
    for p_type in ("apartment", "flat", "villa", "house", "plot"):
        query_without_type, hits = re.subn(
            r'\b' + p_type + r's?\b', '', query, flags=re.IGNORECASE
        )
        if hits:
            filters["property_type"] = p_type
            query = query_without_type
            break

    # Rent takes precedence over sale when both are mentioned.
    for status, status_pattern in (("rent", r'\b(rent|rental)\b'), ("sale", r'\b(sale|buy)\b')):
        query_without_status, hits = re.subn(status_pattern, '', query, flags=re.IGNORECASE)
        if hits:
            filters["status"] = status
            query = query_without_status
            break

    # The price is parsed from the untouched query, then its text is removed
    # from the working copy using the same grammar so nothing is left over.
    price_filter = parse_price(original_query)
    if price_filter:
        filters["price"] = price_filter
        price_match = _PRICE_PATTERN.search(query)
        if price_match:
            query = query[:price_match.start()] + query[price_match.end():]

    semantic_part = ' '.join(query.split())
    stopwords = ["show me", "find", "a", "an", "the", "with", "in", "for", "is", "near", "me"]
    for word in stopwords:
        semantic_part = re.sub(r'\b' + word + r'\b', '', semantic_part, flags=re.IGNORECASE)

    return {
        "filters": filters,
        "semantic_query": ' '.join(semantic_part.split()).strip(),
    }

# --- Module 2: Retrieval ---

def index_properties(data_path, model):
    """Create and persist a FAISS index for semantic search over the properties.

    Each property is embedded as "<title>. <description>", the vectors are
    L2-normalised and stored in an inner-product index, which makes the
    search score a cosine similarity. The index is written to INDEX_PATH and
    the row-position -> property_id map to INDEX_MAP_PATH.

    Args:
        data_path: Path to a JSON file holding a list of property dicts; each
            must have a 'property_id' key ('title'/'description' are optional).
        model: Object with an ``encode(texts, convert_to_tensor=...,
            show_progress_bar=...)`` method returning one vector per text.

    Returns:
        Tuple ``(index, properties, index_to_prop_id)``. The map uses string
        keys so the in-memory version is identical to what a later
        ``json.load`` of INDEX_MAP_PATH yields.

    Raises:
        ValueError: If the data file contains no properties.
        KeyError: If a property has no 'property_id'.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        properties = json.load(f)
    if not properties:
        # Without at least one vector there is no embedding dimension to
        # size the index with; fail with a message that says so.
        raise ValueError(f"No properties found in {data_path!r}; cannot build an index.")

    texts_to_embed = []
    index_to_prop_id = {}
    for i, prop in enumerate(properties):
        title = prop.get('title') or ''
        description = prop.get('description') or ''
        texts_to_embed.append(f"{title}. {description}")
        # JSON object keys are always strings, so use strings from the start.
        index_to_prop_id[str(i)] = prop['property_id']

    embeddings = model.encode(texts_to_embed, convert_to_tensor=False, show_progress_bar=True)
    # FAISS operates on C-contiguous float32 matrices.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, INDEX_PATH)
    with open(INDEX_MAP_PATH, 'w', encoding='utf-8') as f:
        json.dump(index_to_prop_id, f)
    return index, properties, index_to_prop_id

def filter_properties(properties, filters):
    """Keep only the properties that satisfy every hard filter.

    Supported filter keys: 'status', 'bhk' and 'property_type' (exact match
    on the same-named property field) and 'price' (a dict with optional
    'max' / 'min' bounds, both inclusive). Properties whose price is missing
    or falsy never pass a price bound. When nothing needs checking the input
    list itself is returned.
    """
    if not filters:
        return properties

    checks = []

    # Exact-match fields, tested in the same order as they are listed here.
    for field in ('status', 'bhk', 'property_type'):
        if field in filters:
            checks.append(
                lambda prop, field=field, wanted=filters[field]: prop.get(field) == wanted
            )

    if 'price' in filters:
        bounds = filters['price']
        if 'max' in bounds:
            ceiling = bounds['max']
            checks.append(lambda prop: bool(prop.get('price')) and prop.get('price') <= ceiling)
        if 'min' in bounds:
            floor = bounds['min']
            checks.append(lambda prop: bool(prop.get('price')) and prop.get('price') >= floor)

    if not checks:
        return properties
    return [prop for prop in properties if all(check(prop) for check in checks)]

def semantic_search(query_text, model, index, candidate_ids, index_to_prop_id, top_k=25):
    """Rank the candidate properties by similarity to the query text.

    Args:
        query_text: Free-text part of the query. When empty, no ranking is
            done and the first ``top_k`` candidates are returned as-is.
        model: Object with an ``encode(list_of_texts)`` method.
        index: FAISS index exposing ``ntotal`` and ``search``.
        candidate_ids: Collection of property ids that passed the hard
            filters; only these may appear in the result.
        index_to_prop_id: Map from index row position to property id. Keys
            may be strings (as loaded from JSON) or ints.
        top_k: Maximum number of ids to return.

    Returns:
        List of property ids, best match first, at most ``top_k`` long.
    """
    if not query_text or not candidate_ids:
        return list(candidate_ids)[:top_k]

    # FAISS operates on C-contiguous float32 matrices.
    query_embedding = np.ascontiguousarray(model.encode([query_text]), dtype=np.float32)
    faiss.normalize_L2(query_embedding)

    # Search the whole index: the hard filters are applied afterwards, so a
    # smaller k could drop every surviving candidate.
    _, indices = index.search(query_embedding, k=index.ntotal)

    ranked_prop_ids = []
    for raw_position in indices[0]:
        position = int(raw_position)
        if position < 0:
            # FAISS pads the result with -1 when fewer than k rows exist.
            continue
        # A map loaded from JSON has string keys; one built in memory may
        # have int keys. Accept either.
        prop_id = index_to_prop_id.get(str(position))
        if prop_id is None:
            prop_id = index_to_prop_id.get(position)
        if prop_id is not None and prop_id in candidate_ids:
            ranked_prop_ids.append(prop_id)
    return ranked_prop_ids[:top_k]

def retrieve(parsed_query, model, index, all_properties, index_to_prop_id):
    """Run the retrieval stage: hard filters first, then semantic ranking.

    The parsed query's 'filters' narrow the property list; the ids of the
    survivors are then ranked against its 'semantic_query'. Returns the
    ranked list of property ids produced by the semantic search.
    """
    survivors = filter_properties(all_properties, parsed_query.get('filters'))
    allowed_ids = {prop['property_id'] for prop in survivors}
    return semantic_search(
        parsed_query.get('semantic_query'),
        model,
        index,
        allowed_ids,
        index_to_prop_id,
    )

# --- Module 3: Reranker ---

def rerank_properties(original_query, retrieved_ids, all_properties, model):
    """Re-order retrieved properties using a cross-encoder style scorer.

    Args:
        original_query: The user's query exactly as typed.
        retrieved_ids: Property ids from the retrieval stage.
        all_properties: List of property dicts, each with a 'property_id'.
        model: Object with a ``predict(pairs, show_progress_bar=...)`` method
            returning one relevance score per ``[query, text]`` pair.

    Returns:
        The retrieved ids that exist in ``all_properties``, sorted by score,
        highest first. Ids with no matching property are dropped.
    """
    if not retrieved_ids:
        return []

    properties_map = {p['property_id']: p for p in all_properties}

    # Track the id next to each pair: an id without a property produces no
    # pair, so zipping scores against retrieved_ids would shift every later
    # score onto the wrong id.
    scored_ids = []
    pairs = []
    for prop_id in retrieved_ids:
        prop = properties_map.get(prop_id)
        if not prop:
            continue
        title = prop.get('title') or ''
        description = prop.get('description') or ''
        scored_ids.append(prop_id)
        pairs.append([original_query, f"{title}. {description}"])

    if not pairs:
        return []

    scores = model.predict(pairs, show_progress_bar=True)
    ranked = sorted(zip(scored_ids, scores), key=lambda pair: pair[1], reverse=True)
    return [prop_id for prop_id, _ in ranked]

# --- Main Search Pipeline ---

class SearchEngine:
    """Encapsulates the full pipeline: parse -> retrieve -> rerank -> display.

    Attributes:
        retrieval_model: SentenceTransformer used to embed queries.
        reranker_model: CrossEncoder used to rescore retrieved results.
        properties: Loaded property dicts; empty when loading failed.
        index: Loaded FAISS index, or None when loading failed.
        index_to_prop_id: Row-position -> property id map; empty when
            loading failed.
    """

    def __init__(self):
        """Load both models and, if present, the persisted index and data."""
        print("Initializing search engine...")
        self.retrieval_model = SentenceTransformer(MODEL_NAME)
        self.reranker_model = CrossEncoder(RERANKER_MODEL_NAME)

        # Define every attribute up front so a failed load leaves the object
        # in a consistent "not ready" state instead of half-initialised.
        self.properties = []
        self.index = None
        self.index_to_prop_id = {}

        # The id map is needed as much as the index and the data file.
        required_paths = (INDEX_PATH, INDEX_MAP_PATH, CLEANED_DATA_PATH)
        if not all(os.path.exists(path) for path in required_paths):
            print("ERROR: No index or data file found. Please run the data transformation script first.")
            return

        print("Loading existing index and data...")
        self.index = faiss.read_index(INDEX_PATH)
        with open(CLEANED_DATA_PATH, 'r', encoding='utf-8') as f:
            self.properties = json.load(f)
        with open(INDEX_MAP_PATH, 'r') as f:
            self.index_to_prop_id = json.load(f)
        print("Search engine ready.")

    def search(self, query):
        """Run an end-to-end search for the query and print the top 5 hits.

        Args:
            query: Free-text search query.

        Returns:
            List of matching property dicts, best first, or None when the
            engine is not initialised or nothing matched.
        """
        if not self.properties:
            print("Search engine not initialized. Please run the data cleaner and restart.")
            return

        print(f"\n\n{'='*50}\n--- Executing Search for: '{query}' ---\n{'='*50}")

        # 1. Parse Query
        parsed_query = parse_query(query)
        print(f"\n1. Parsed Query:\n{json.dumps(parsed_query, indent=2)}")

        # 2. Retrieve Properties
        retrieved_ids = retrieve(parsed_query, self.retrieval_model, self.index, self.properties, self.index_to_prop_id)
        print(f"\n2. Retrieved IDs (before reranking): {len(retrieved_ids)} candidates found.")

        # 3. Rerank Properties
        final_ids = rerank_properties(query, retrieved_ids, self.properties, self.reranker_model)
        print(f"\n3. Reranked IDs: {len(final_ids)} candidates after reranking.")

        # 4. Fetch and display final results
        print("\n--- Top 5 Search Results ---")
        if not final_ids:
            print("No matching properties found.")
            return

        # One dict lookup per ranked id keeps the reranked order without
        # repeated list scans.
        properties_by_id = {p['property_id']: p for p in self.properties}
        final_results = [properties_by_id[pid] for pid in final_ids if pid in properties_by_id]

        for rank, prop in enumerate(final_results[:5], start=1):
            # Scraped records may carry None or omit fields; never let a
            # display detail abort the search.
            price = prop.get('price', 0)
            price_text = f"\u20b9{price:,}" if isinstance(price, (int, float)) else "N/A"
            location = prop.get('location') or {}
            print(f"\n{rank}. {prop.get('title') or 'Untitled'}")
            print(f"   Price: {price_text}")
            print(f"   Location: {location.get('locality', 'N/A')}, {location.get('city', 'N/A')}")
            print(f"   URL: {prop.get('source_url', 'N/A')}")

        return final_results

if __name__ == '__main__':
    # --- IMPORTANT ---
    # How to run:
    # 1. Run 'data_cleaner_script_v1.py' first so 'cleaned_properties.json' exists.
    # 2. Run this script; the first run builds the index.
    # 3. Later runs load the saved index and go straight to searching.

    if os.path.exists(CLEANED_DATA_PATH):
        index_missing = not os.path.exists(INDEX_PATH)
        if index_missing:
            print("Creating index from scraped data for the first time... (This may take a while)")
            retrieval_model = SentenceTransformer(MODEL_NAME)
            index_properties(CLEANED_DATA_PATH, retrieval_model)
            print("Index created successfully.")

        engine = SearchEngine()
        if engine.properties:
            # Sample queries exercising filters, pure semantics, and a mix.
            demo_queries = (
                "3 bhk flat for sale in Gurgaon under 2 crore",
                "show me a luxury apartment in a good society",
                "a house near a metro station",
            )
            for demo_query in demo_queries:
                engine.search(demo_query)
    else:
        print("ERROR: 'cleaned_properties.json' not found. Please run the data transformation script first.")


RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [2]:
pip install tf-keras


Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 13.4 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.19.0
Note: you may need to restart the kernel to use updated packages.
