In [None]:
### NOTEBOOK 4: 04_property_classification.ipynb ###
#
# GOAL: To load the 'bandung_housing_FINAL.csv' master file
#       and create the definitive 'property_type' column.
#       This will use the 3-Stage Hybrid Classification logic.
#
# INPUT: bandung_housing_FINAL.csv
# OUTPUT: bandung_housing_CLASSIFIED.csv
#

import pandas as pd
import numpy as np
import json
import sys
from pathlib import Path
from tqdm import tqdm

# Register tqdm's progress-aware methods (e.g. progress_apply) on pandas objects
tqdm.pandas()

# Display settings for exploration: up to 200 rows, every column, and floats
# rendered with thousands separators and two decimal places
pd.options.display.max_rows = 200
pd.options.display.max_columns = None
pd.options.display.float_format = '{:,.2f}'.format

print("--- 04_property_classification.ipynb ---")

In [None]:
# ---
# ## Step 1: Load the Master Dataset
# ---

print("\nStep 1: Loading the master 'bandung_housing_FINAL.csv' file...")

# Directory layout, resolved relative to the notebook's working directory
PROJECT_ROOT = Path(r"..")
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Input: the master file produced by Notebook 3
MASTER_FILE_PATH = PROCESSED_DIR.joinpath("bandung_housing_FINAL.csv")

# Output: the classified failsafe file intended for Notebook 5
CLASSIFIED_FILE_PATH = PROCESSED_DIR.joinpath("bandung_housing_CLASSIFIED.csv")

# Read 'id', 'zipcode' and 'geo_confidence' as strings; the original note
# says pinning these dtypes avoids warnings at read time
dtypes = dict(id='str', zipcode='str', geo_confidence='str')

try:
    df_master = pd.read_csv(MASTER_FILE_PATH, dtype=dtypes)
    print(
        f"Successfully loaded {MASTER_FILE_PATH}",
        f"Total listings loaded: {len(df_master):,}",
        sep="\n",
    )
except FileNotFoundError:
    # The upstream notebooks have not produced the input file yet
    print(
        f"❌ ERROR: File not found at {MASTER_FILE_PATH}",
        "Please make sure Notebooks 1, 2, and 3 have been run successfully.",
        sep="\n",
    )
    sys.exit(1)  # Stop the script
except Exception as err:
    # Any other read failure: report it and stop
    print(f"❌ ERROR: Could not load file. {err}")
    sys.exit(1)  # Stop the script

In [None]:
# Inspect the loaded frame: column names, dtypes and non-null counts
df_master.info()

In [None]:
# ---
# ## Step 2: 3-Stage Hybrid Classification (CORRECTED)
# ---
# Stage 1 reads 'Tipe Properti' from the 'specs' JSON, Stage 2 falls back to
# keyword matching on the listing's text fields, and Stage 3 re-labels
# 'Rumah' rows whose building-size, bedroom or bathroom value is null.

print("\nStep 2: Starting 3-Stage 'property_type' classification...")

# ================================================================
# STAGE 1 & 2 FUNCTION: "Smart" JSON parsing with "Dumb" Fallback
# ================================================================

def get_initial_type_CORRECTED(row):
    """
    Assign an initial property type to one listing (Stages 1 and 2).

    Stage 1 parses the 'specs' JSON and maps its 'Tipe Properti' value to a
    label. If 'specs' is missing, is not valid JSON, or carries no recognised
    type, Stage 2 searches the lower-cased text of the 'id', 'description',
    'master_address' and 'specs' fields for keywords, in priority order.

    Parameters
    ----------
    row : pandas.Series
        One listing. Only string-valued fields among those named above are
        searched; absent or non-string fields (e.g. NaN) are ignored.

    Returns
    -------
    str
        One of "Rumah", "Tanah", "Apartemen", "Ruko", "Villa" or "Lainnya".
    """

    # --- STAGE 1: "Smart" JSON Parsing ---
    try:
        spec_data = json.loads(row['specs'])
    except (KeyError, TypeError, ValueError):
        # No 'specs' field, a non-string value such as NaN, or malformed JSON
        # (json.JSONDecodeError is a ValueError). Only these expected failures
        # are absorbed; a bare `except:` would also swallow KeyboardInterrupt
        # and hide genuine bugs.
        spec_data = None

    tipe = spec_data.get('Tipe Properti') if isinstance(spec_data, dict) else None
    if isinstance(tipe, str):
        tipe = tipe.lower()
        # Checked in order; the first keyword found in the declared type wins.
        stage1_rules = (
            ('rumah', "Rumah"),
            ('tanah', "Tanah"),
            ('apartemen', "Apartemen"),
            ('ruko', "Ruko"),
            ('villa', "Villa"),
            ('kantor', "Ruko"),
            ('gudang', "Ruko"),
        )
        for keyword, label in stage1_rules:
            if keyword in tipe:
                return label
    # An unrecognised or missing declared type falls through to Stage 2.

    # --- STAGE 2: "Dumb" Keyword Fallback ---
    search_string = " ".join(
        row[col].lower()
        for col in ('id', 'description', 'master_address', 'specs')
        if col in row.index and isinstance(row[col], str)
    )

    # Priority order matters: the first rule with any matching keyword wins.
    # The bare word 'tanah' is deliberately NOT a keyword (it was found to
    # over-match); only specific selling phrases and 'kavling' mark land.
    # 'kavling' also covers the phrase 'jual kavling'.
    stage2_rules = (
        (('rumah hitung tanah', 'dijual tanah', 'tanah dijual', 'kavling'), "Tanah"),
        (('apartemen', 'apartment', 'apartement'), "Apartemen"),
        (('ruko', 'rukan', 'kantor', 'office', 'gudang', 'warehouse'), "Ruko"),
        (('villa',), "Villa"),
        (('rumah', 'house', 'hunian', 'cluster', 'residence'), "Rumah"),
    )
    for keywords, label in stage2_rules:
        if any(keyword in search_string for keyword in keywords):
            return label

    return "Lainnya"

# --- Stage 1 + Stage 2: label every listing, one row at a time ---
print(
    "Running Stage 1 (JSON) and Stage 2 (Keyword) classification...",
    "This may take a minute...",
    sep="\n",
)
# axis=1 hands each row to the classifier; progress_apply is the
# tqdm-instrumented variant of apply, so a progress bar is shown
df_master['property_type'] = df_master.progress_apply(
    get_initial_type_CORRECTED,
    axis=1,
)
print("Stage 1 & 2 complete.")

In [None]:
# ================================================================
# STAGE 3: "Symptom" Polish (The Final Fix)
# ================================================================
print("\nRunning Stage 3 ('Symptom') polishing...")


def _reclassify_rumah_where_null(df, column, new_type):
    """Relabel, in place, every 'Rumah' row of `df` whose `column` is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame carrying a 'property_type' column; it is modified in place.
    column : str
        Column whose null values mark a row as mislabelled.
    new_type : str
        Label written to 'property_type' for the matching rows.

    Returns
    -------
    int
        Number of rows that were relabelled.
    """
    # Build the mask once and reuse it for both the update and the count
    mask = (df['property_type'] == 'Rumah') & df[column].isnull()
    df.loc[mask, 'property_type'] = new_type
    return int(mask.sum())


# The rules run in order and each one only sees rows still labelled 'Rumah',
# so a row relabelled by an earlier rule is never touched by a later one.

# Symptom 1: No building size = 'Tanah'
symptom_1_count = _reclassify_rumah_where_null(df_master, 'building_size_sqm', 'Tanah')
print(f" - Stage 3: Re-classified {symptom_1_count:,} 'Rumah' listings with null 'building_size' as 'Tanah'.")

# Symptom 2: No bedrooms = 'Ruko' / 'Kantor' (assumed commercial)
symptom_2_count = _reclassify_rumah_where_null(df_master, 'bedrooms', 'Ruko')
print(f" - Stage 3: Re-classified {symptom_2_count:,} 'Rumah' listings with null 'bedrooms' as 'Ruko'.")

# Symptom 3: No bathrooms = 'Tanah' (assumed land)
symptom_3_count = _reclassify_rumah_where_null(df_master, 'bathrooms', 'Tanah')
print(f" - Stage 3: Re-classified {symptom_3_count:,} 'Rumah' listings with null 'bathrooms' as 'Tanah'.")

print("\n3-Stage Classification complete.")

In [None]:
# ---
# ## Step 3: Exploration (CRITICAL CHECK)
# ---

# Banner followed by the per-type listing counts, which act as the sanity
# check on the classification
print(
    "\n" + "="*40,
    "Step 3: Final Classification Results",
    "="*40,
    "This is the final breakdown of all property types in the dataset:",
    sep="\n",
)
print(df_master['property_type'].value_counts())


# ---
# ## Step 4: Save Classified Failsafe File
# ---

print("\nStep 4: Saving new failsafe file...")
try:
    # Persist the frame, now carrying 'property_type', without the index column
    df_master.to_csv(CLASSIFIED_FILE_PATH, index=False)
    print(
        f"\n✅✅✅ 04_property_classification.ipynb COMPLETE! ✅✅✅",
        f"New failsafe file saved to:",
        CLASSIFIED_FILE_PATH,
        f"Total listings saved: {len(df_master):,}",
        sep="\n",
    )

except Exception as err:
    # Report the failure; the cell itself still runs to completion
    print(f"❌ ERROR: Could not save the file. {err}")