In [1]:
import json
import re
import time

def clean_floor_size(size_str):
    """
    Pull the first whole number out of a floor-size string.

    Commas are removed before searching, so "1,200 sqft" yields 1200 and
    "2350 FTK " yields 2350. Only the first run of digits is used.

    Returns None when size_str is not a string or contains no digits.
    """
    if isinstance(size_str, str):
        without_commas = size_str.replace(',', '')
        first_run = re.search(r'\d+', without_commas)
        if first_run is not None:
            return int(first_run.group())
    return None

def _as_dict(value):
    """Return value if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _to_int(value, default=None):
    """
    Convert a digit-only string or a non-negative whole number to int.

    Anything else (None, booleans, negative numbers, fractional floats,
    non-numeric text) yields default.
    """
    if isinstance(value, bool):
        return default
    if isinstance(value, int):
        return value if value >= 0 else default
    if isinstance(value, float):
        # is_integer() is False for NaN and infinities, so they fall through.
        return int(value) if value.is_integer() and value >= 0 else default
    if isinstance(value, str):
        text = value.strip()
        if text.isdigit():
            try:
                return int(text)
            except ValueError:
                # str.isdigit() accepts characters (e.g. superscript digits)
                # that int() cannot parse.
                return default
    return default


def _to_float(value):
    """Convert value to float; return None if it is falsy or not numeric."""
    if not value:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def transform_data(input_path='data (1).json', output_path='cleaned_properties.json'):
    """
    Reads the scraped data, transforms it to match our search engine's schema,
    and saves it to a new file.

    Args:
        input_path: Path of the scraped JSON file. It must contain a JSON
            object whose "property" key holds a list of property records.
        output_path: Path the cleaned list is written to as JSON.

    Returns:
        None. Progress and errors are reported with print(). If the input
        file is missing or is not a JSON object, nothing is written.

    Records without a title or without a usable price are dropped. Malformed
    individual fields (wrong type, null, non-numeric text) fall back to their
    defaults instead of aborting the whole run.
    """
    print(f"Reading data from {input_path}...")

    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file {input_path} was not found. Please make sure it's in the same directory.")
        return

    if not isinstance(raw_data, dict):
        print(f"Error: Expected a JSON object with a 'property' list in {input_path}.")
        return

    # A present-but-null (or otherwise non-list) "property" is treated as empty.
    properties_raw = raw_data.get("property")
    if not isinstance(properties_raw, list):
        properties_raw = []
    cleaned_properties = []

    print(f"Found {len(properties_raw)} properties. Transforming...")

    # The trailing "Z" denotes UTC, so format from gmtime() rather than local
    # time. Computed once so every record of this run shares one timestamp.
    scraped_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    for prop in properties_raw:
        if not isinstance(prop, dict):
            continue

        # .get(key, {}) would still return None for an explicit null value.
        flat_details = _as_dict(prop.get('flatDetails'))
        location_details = _as_dict(prop.get('location'))

        # NOTE(review): 'furnshingstatus' is the key as spelled in this code;
        # presumably it mirrors the scraped data - confirm against the source.
        furnishing = flat_details.get('furnshingstatus', 'Unfurnished')
        if not isinstance(furnishing, str):
            furnishing = 'Unfurnished'

        # Create a new dictionary in the format our engine expects
        cleaned_prop = {
            "property_id": prop.get('id'),
            "source_url": prop.get('url'),
            "title": prop.get('name'),
            "description": prop.get('description'),
            "property_type": "apartment",  # Defaulting, can be refined later
            "status": "sale",  # All data seems to be for sale
            "location": {
                "city": location_details.get('cityName'),
                "locality": location_details.get('addressLocality'),
                "sub_locality": None,
                "latitude": _to_float(location_details.get('latitude')),
                "longitude": _to_float(location_details.get('longitude')),
            },
            "bhk": _to_int(flat_details.get('bedroom'), 0),
            "bathrooms": _to_int(flat_details.get('bathroom'), 0),
            "area_sqft": clean_floor_size(flat_details.get('floorSize')),
            "furnishing": furnishing.lower(),
            "facing": None,  # This data was not in the source file
            "price": _to_int(prop.get('price')),
            "rent": None,
            "maintenance_monthly": None,
            "amenities": [],
            "is_pet_friendly": None,
            "floor_number": _to_int(flat_details.get('floorno')),
            "total_floors": None,
            "image_urls": [],
            "description_embedding": None,  # This will be generated later
            "scraped_at": scraped_at,
            "posted_on": None,
        }

        # Only add properties that have essential information
        if cleaned_prop['title'] and cleaned_prop['price']:
            cleaned_properties.append(cleaned_prop)

    print(f"Successfully cleaned {len(cleaned_properties)} properties.")

    # Save the cleaned data to a new file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_properties, f, indent=4)

    print(f"Cleaned data saved to {output_path}")

# Run the transformation with its default input and output paths when this
# file is executed as a script rather than imported.
if __name__ == '__main__':
    transform_data()


Reading data from data (1).json...
Found 57423 properties. Transforming...
Successfully cleaned 57247 properties.
Cleaned data saved to cleaned_properties.json
