In [None]:
### NOTEBOOK 5: 05_outlier_removal.ipynb ###
#
# GOAL: To filter the classified dataset for 'Rumah' listings
#       and then systematically remove geographic, logical,
#       and statistical outliers.
#
# INPUT: bandung_housing_CLASSIFIED.csv
# OUTPUT: bandung_housing_MODEL_READY.csv
#

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Pandas display configuration for exploration: show up to 200 rows, never
# truncate columns, and print floats with thousands separators and 2 decimals.
for option_name, option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', None),
    ('display.float_format', '{:,.2f}'.format),
):
    pd.set_option(option_name, option_value)

print("--- 05_outlier_removal.ipynb ---")

In [None]:
# ---
# ## Step 1: Load and Filter for 'Rumah'
# ---

print("Step 1: Loading 'bandung_housing_CLASSIFIED.csv'...")

# Path definitions (relative to the notebook's working directory)
PROJECT_ROOT = Path(r"..")
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# --- INPUT FILE (From Notebook 4) ---
CLASSIFIED_FILE_PATH = PROCESSED_DIR / "bandung_housing_CLASSIFIED.csv"

# --- OUTPUT FILE (The final product) ---
MODEL_READY_PATH = PROCESSED_DIR / "bandung_housing_MODEL_READY.csv"

# Force string dtype for identifier-like columns so pandas does not try to
# infer a numeric or mixed type for them.
dtypes = {
    'id': 'str',
    'zipcode': 'str',
    'geo_confidence': 'str'
}

try:
    df_classified = pd.read_csv(CLASSIFIED_FILE_PATH, dtype=dtypes)
except FileNotFoundError:
    print(f"❌ ERROR: File not found at {CLASSIFIED_FILE_PATH}")
    print("Please make sure Notebook 4 ran successfully.")
    # Re-raise: everything below needs df_classified, so continuing would only
    # produce a confusing NameError a few lines later.
    raise
except Exception as e:
    print(f"❌ ERROR: Could not load file. {e}")
    raise

print(f"Successfully loaded {len(df_classified):,} total listings.")


# --- Filter for 'Rumah' ---
print("Filtering for 'property_type' == 'Rumah'...")

# .copy() so later column assignments do not touch df_classified
df_platform_a = df_classified[df_classified['property_type'] == 'Rumah'].copy()

print(f"Created 'df_platform_a' with {len(df_platform_a):,} listings.")
print("--- Step 1 Complete ---")

In [None]:
# Column dtypes and non-null counts of the 'Rumah' subset.
df_platform_a.info()

132 listings (8073 - 7941) are missing their latitude and longitude. They represent a very small fraction of our 'Rumah' data: (132 / 8073) * 100 = 1.6%

In [None]:
# ---
# Exploration (Find Missing Coordinates)
# ---

print("Inspecting listings with missing coordinates...")

# Rows whose latitude is null
latitude_is_missing = df_platform_a['latitude'].isna()
missing_coords_df = df_platform_a.loc[latitude_is_missing]

print(f"\nFound {len(missing_coords_df)} listings with missing coordinates.")

# Show a handful of identifying columns to understand what these listings are
preview_columns = ['id', 'source', 'price', 'master_address', 'bedrooms', 'bathrooms']
print("\nSample of listings with missing coordinates:")
print(missing_coords_df[preview_columns].head(10))

# How many of them have no address either?
n_without_address = missing_coords_df['master_address'].isna().sum()
print(f"\nOf these, {n_without_address} listings were ALSO missing a 'master_address'.")

These are likely the first 100 Platform A listings, which had no address or latitude/longitude even in the initial scrape.

In [None]:
# ---
# ## Step 2: Remove listings without coordinates
# ---

print("Memulai Step 2: Menghapus listings tanpa data geospasial...")

rows_before = len(df_platform_a)
print(f"Listings 'Rumah' (sebelum): {rows_before:,}")

# Rebind df_platform_a to a version without coordinate-less rows.
# A listing needs BOTH latitude and longitude to be placed on the map, so a
# row is dropped when either of the two is NaN (previously only latitude was
# checked).
df_platform_a = df_platform_a.dropna(subset=['latitude', 'longitude'])

rows_after = len(df_platform_a)
print(f"Listings 'Rumah' (setelah): {rows_after:,}")
print(f"Total {rows_before - rows_after} listings 'hantu' telah dihapus.")

print("\n--- Step 2 Selesai ---")

In [None]:
# Re-check dtypes and non-null counts after the coordinate-less rows were dropped.
df_platform_a.info()

In [None]:
# Summary statistics, used to spot implausible minimum/maximum values.
df_platform_a.describe()

**Geographic Impossibilities**  
**The Problem:**  
- Latitude (max: 0.29) and longitude (max: 115.19) are incorrect.  
- Bandung’s coordinates are roughly (-6.9, 107.6).  
- A longitude of 115 is in Bali, and a latitude of 0.29 is on the equator.  
- These are *bad geocodes*.  

**Diagnosis:**  
- The dataset (`df_platform_a`) contains listings that are not located in Bandung.  

---

**Logical Impossibilities (Typos)**  
**The Problem:**  
- `land_size_sqm` (min: 1.00)  
- `building_size_sqm` (min: 1.00)  
- `price` (min: 2,490,000.00)  

**Diagnosis:**  
- A 1 sqm house or a house priced at 2.49 million IDR is unrealistic.  
- These values are typos or bad data entries that will skew the model.  

---

**Mis-classified Properties**  
**The Problem:**  
- `bedrooms` (max: 132.00)  
- `bathrooms` (max: 142.00)  
- `price` (max: 950,000,000,000.00)  

**Diagnosis:**  
- These are not typical houses (*Rumah*).  
- They are likely hotels or large *Kos-kosan* (boarding houses) that the 3-stage classification logic failed to filter.  
- The extreme price values also suggest mis-classified properties.  


In [None]:
# Bounding box (in degrees) treated as "reasonably Bandung"
MIN_LATITUDE = -7.3
MAX_LATITUDE = -6.5
MIN_LONGITUDE = 107.0
MAX_LONGITUDE = 107.9

print(f"Batas 'Bandung' (Biru) yang digunakan:")
print(f"  Latitude: {MIN_LATITUDE} sampai {MAX_LATITUDE}")
print(f"  Longitude: {MIN_LONGITUDE} sampai {MAX_LONGITUDE}")

def classify_location(row):
    """
    Label a listing by whether its coordinates fall inside the bounding box.

    `row` must provide 'latitude' and 'longitude'. Returns 'Bandung' when both
    values lie within the inclusive MIN/MAX bounds, otherwise 'Outlier'.
    A NaN coordinate fails every comparison and therefore yields 'Outlier'.
    """
    lat_in_bounds = MIN_LATITUDE <= row['latitude'] <= MAX_LATITUDE
    lon_in_bounds = MIN_LONGITUDE <= row['longitude'] <= MAX_LONGITUDE
    return 'Bandung' if (lat_in_bounds and lon_in_bounds) else 'Outlier'

# 1. Tag every listing as 'Bandung' or 'Outlier' (used for colouring)
df_platform_a['location_type'] = df_platform_a.apply(classify_location, axis=1)

# 2. How many listings fall in each group?
location_counts = df_platform_a['location_type'].value_counts()
print("\n" + location_counts.to_string())

# 3. Scatter of all coordinates, coloured by the tag created above
print("Generating plot...")
fig, ax = plt.subplots(figsize=(12, 10))

sns.scatterplot(
    data=df_platform_a,
    x='longitude',
    y='latitude',
    hue='location_type',                            # colour by group
    palette={'Bandung': 'blue', 'Outlier': 'red'},  # group -> colour
    alpha=0.6,                                      # slight transparency
    s=20,                                           # marker size
    ax=ax,
)

ax.set_title('Geographic Diagnosis Plot (Color-Coded): Bandung vs. Outliers', fontsize=16)
ax.set_xlabel('Longitude', fontsize=12)
ax.set_ylabel('Latitude', fontsize=12)
ax.grid(True)
ax.legend(title='Lokasi')
plt.show()

# 4. 'location_type' is deliberately left in place here: the removal step
#    filters on it and then drops the column.

In [None]:
# ---
# Remove Geographic Outliers
# ---

rows_before = len(df_platform_a)
print(f"Listings 'Rumah' (sebelum filter): {rows_before:,}")

# 1. Index labels of every row that was not tagged 'Bandung'
is_outside_box = df_platform_a['location_type'] != 'Bandung'
outlier_indices = df_platform_a.index[is_outside_box]
print(f"Menemukan {len(outlier_indices)} listings 'Outlier' untuk dihapus.")

# 2 + 3. Drop those rows, then drop the temporary tagging column
df_platform_a = (
    df_platform_a
    .drop(index=outlier_indices)
    .drop(columns=['location_type'])
)

rows_after = len(df_platform_a)
print(f"Listings 'Rumah' (setelah filter): {rows_after:,}")
print(f"Total {rows_before - rows_after} listings 'Outlier' telah dihapus.")

In [None]:
# Summary statistics after the bounding-box filter.
df_platform_a.describe()

In [None]:
# ---
# Diagnose Bedroom Outliers
# ---

# --- 1. Thresholds ---
# A "normal" house is taken to have 1-14 bedrooms.
MIN_BEDROOMS_NORMAL = 1
MAX_BEDROOMS_NORMAL = 15  # 15 or more counts as an outlier

# --- 2. Classification helper ---
def classify_bedrooms(bedrooms):
    """
    Bucket a bedroom count.

    Returns 'Outlier (Low)' below MIN_BEDROOMS_NORMAL, 'Outlier (High)' at or
    above MAX_BEDROOMS_NORMAL, and 'Normal' otherwise. A NaN count fails both
    comparisons and is therefore reported as 'Normal'.
    """
    is_too_low = bedrooms < MIN_BEDROOMS_NORMAL
    is_too_high = bedrooms >= MAX_BEDROOMS_NORMAL
    if is_too_low:
        return 'Outlier (Low)'
    return 'Outlier (High)' if is_too_high else 'Normal'

# Tag every listing with its bedroom bucket (temporary column, dropped at the
# end of this cell).
df_platform_a['bedroom_type'] = df_platform_a['bedrooms'].apply(classify_bedrooms)

print(f"\nClassification counts:\n{df_platform_a['bedroom_type'].value_counts().to_string()}")

# --- 3. Colour-coded bar chart ---
print("Generating plot...")

# Count listings per (bedrooms, bucket). The bars are drawn with ax.bar at the
# TRUE bedroom values: sns.countplot places bars at ordinal category positions
# (0, 1, ..., k-1), so the numeric xlim/xticks below would only line up with
# the data if every integer bedroom count happened to be present.
bedroom_counts = (
    df_platform_a
    .groupby(['bedrooms', 'bedroom_type'])
    .size()
    .reset_index(name='n_listings')
)
bedroom_palette = {'Normal': 'lightgreen', 'Outlier (High)': 'orange', 'Outlier (Low)': 'gray'}

fig, ax = plt.subplots(figsize=(15, 7))
for category, colour in bedroom_palette.items():
    category_counts = bedroom_counts[bedroom_counts['bedroom_type'] == category]
    if category_counts.empty:
        continue  # no bars to draw, and no empty legend entry
    ax.bar(
        category_counts['bedrooms'],
        category_counts['n_listings'],
        color=colour,
        label=category,
    )

# Zoom in on the main distribution; higher counts are listed in the table below
ax.set_xlim(-0.5, 20.5)
ax.set_xticks(range(0, 21))  # one tick per bedroom count 0-20

ax.set_title('Diagnosis: Bedroom Count Outliers (0-20 Range)', fontsize=16)
ax.set_xlabel('Number of Bedrooms', fontsize=12)
ax.set_ylabel('Count of Listings', fontsize=12)
ax.legend(title='Category')
plt.show()

# --- 4. Table of the 'Outlier (High)' listings ---
# The plot covers the 0-20 range; this table lists every high outlier in full.
print("\n" + "="*50)
print(f"Inspecting the 'Outlier (High)' listings (>= {MAX_BEDROOMS_NORMAL} bedrooms):")

outlier_df = df_platform_a[df_platform_a['bedroom_type'] == 'Outlier (High)']
print(outlier_df.sort_values(by='bedrooms', ascending=False)[
    ['id', 'master_address', 'price', 'bedrooms', 'bathrooms', 'building_size_sqm']
])

# Remove the temporary column
df_platform_a = df_platform_a.drop(columns=['bedroom_type'])

In [None]:
# Isolate the 'Outlier (Low)' listings: fewer than MIN_BEDROOMS_NORMAL bedrooms.
#
# This cell used to re-define classify_bedrooms (a verbatim duplicate of the
# definition in the bedroom-diagnosis cell) and to add and then remove a
# temporary 'bedroom_type' column on the main frame. The direct filter below
# selects exactly the rows that function labels 'Outlier (Low)'.
MIN_BEDROOMS_NORMAL = 1
MAX_BEDROOMS_NORMAL = 15

low_outliers_df = (
    df_platform_a[df_platform_a['bedrooms'] < MIN_BEDROOMS_NORMAL]
    # keep the label column on the inspection frame, as before
    .assign(bedroom_type='Outlier (Low)')
)

In [None]:
# Display the low-bedroom outliers for manual review.
low_outliers_df

## List of Anomalies 

**Persistent Geographic Issues**  
Our "Bandung" filter is technically a coordinate filter. As you can see, it still includes areas that are not administratively part of Bandung City. 
Examples:  
- Listing #228: Padalarang District, West Bandung Regency  
- Listing #809: Sariwangi, Parongpong, West Bandung Regency  
- Listing #4194: Cimahi, Cimahi Tengah, Cimahi City  

---

**Clear Misclassifications (Hotels & Land)**  
`bedrooms = 0` should indicate *Land* or *Commercial Property*, but these listings were still classified as *Houses*.  

Examples:  
- Listing #5670: id = dijual-hotel-di-bandung... description starts with "Dijual ho..." → clearly a Hotel  
- Listing #7201: id = hotel-bandung... → clearly a Hotel  
- Listing #4631: id = tanah-mainroad-setiabudi... description starts with "Tanah Mainroad..." → clearly Land  

---

**Commercial Properties (Shophouses/Business Use)**  
Many listings (also with 0 bedrooms) have descriptions that strongly suggest they are commercial properties, not residential houses.  

Examples:  
- Listing #1194: "House Can Be Used For Cafe Business..."  
- Listing #3578: "Residential House With Business Space..."  
- Listing #803: "House On Mainroad..." (usually commercial)  

---

**"0 Bedrooms, 0 Bathrooms" Issue**  
There are 37 listings with both `bedrooms = 0` and `bathrooms = 0`.  

Examples: #689, #809, #1062, #1194, #1335, #3019, #4111, #4631, #4798, #5670, #5757, #5888, #5951, #6290, #6422, #6748, #6982, #6992, #7201.  

Note: Listings with 0 bedrooms **and** 0 bathrooms are almost certainly not residential houses.  


In [None]:
# ---
# Remove Persistent Geographic Outliers (Shapefile Solution)
# ---

import geopandas as gpd
from shapely.geometry import Point

# --- 1. Define Shapefile Path ---
# Built with pathlib from PROJECT_ROOT; the previous backslash-separated
# string only resolved on Windows.
shapefile_path = PROJECT_ROOT / "data" / "raw" / "idn_admbnda_adm4_ID3_bps_20200401.shp"
print(f"Menggunakan shapefile dari: {shapefile_path}")

# --- 2. Load the Shapefile ---
try:
    gdf_adm = gpd.read_file(shapefile_path)
except Exception as e:
    print(f"❌ ERROR: Tidak dapat memuat shapefile. Periksa path Anda.")
    print(f"Error: {e}")
    # Nothing below can run without the boundaries; re-raise with the
    # original traceback intact.
    raise
print(f"Shapefile berhasil dimuat, berisi {len(gdf_adm)} total area administrasi.")

# Remember which columns belong to the listings BEFORE the join adds the
# administrative attributes. Capturing this list after the join (as was done
# previously) made the column selection in step 7 keep every column.
listing_columns = [col for col in df_platform_a.columns if col != 'geometry']

# --- 3. Convert 'df_platform_a' to a GeoDataFrame ---
print("Mengubah 'df_platform_a' (listings) menjadi GeoDataFrame...")
# Point geometry from longitude (x) and latitude (y)
gdf_listings = gpd.GeoDataFrame(
    df_platform_a,
    geometry=gpd.points_from_xy(df_platform_a.longitude, df_platform_a.latitude)
)
# The coordinates are plain lat/lon, so declare them as WGS84
gdf_listings = gdf_listings.set_crs("EPSG:4326")
print(f"Total listings 'Rumah' sebelum spatial join: {len(gdf_listings):,}")

# --- 4. Align Coordinate Systems (CRS) ---
print("Menyamakan Coordinate Systems (CRS)...")
# Both layers must share a CRS for the point-in-polygon test to be meaningful
if gdf_listings.crs != gdf_adm.crs:
    gdf_listings = gdf_listings.to_crs(gdf_adm.crs)
print("CRS aligned.")

# --- 5. Perform the Spatial Join (The Filter) ---
print("Melakukan spatial join (filter) untuk memetakan listings ke area...")
# how="left" keeps every listing; predicate="within" attaches the attributes
# of the polygon each point falls in (e.g. 'ADM2_EN').
listings_with_adm = gpd.sjoin(gdf_listings, gdf_adm, how="left", predicate="within")

# --- 6. Filter to 'Kota Bandung' ---
admin_city_column = 'ADM2_EN'
city_name = 'Kota Bandung'

print(f"Filtering listings yang berada di dalam '{city_name}'...")
df_kota_bandung = listings_with_adm[
    listings_with_adm[admin_city_column] == city_name
].copy()

print(f"Ditemukan {len(df_kota_bandung):,} listings di dalam '{city_name}'.")

# --- 7. Clean up and Finalize ---
# Keep only the listing columns plus 'ADM4_EN'; this discards 'geometry',
# 'index_right' and every other attribute brought in from the shapefile.
# NOTE(review): 'ADM4_EN' is the level-4 administrative name. In this
# boundary dataset level 2 is the city (see the filter above), which would
# make level 4 the kelurahan/village rather than the kecamatan — confirm.
columns_to_drop = ['index_right', 'geometry']
admin_cols_to_keep = ['ADM4_EN']

# dict.fromkeys removes duplicates while preserving order; the membership
# test skips names the join may have renamed because of a column clash.
final_cols = [
    col
    for col in dict.fromkeys(listing_columns + admin_cols_to_keep)
    if col in df_kota_bandung.columns
]

df_platform_a = pd.DataFrame(
    df_kota_bandung.drop(columns=columns_to_drop, errors='ignore')[final_cols]
)

print(f"\nTotal listings 'Rumah' SETELAH shapefile filter: {len(df_platform_a):,}")
print("--- Step Selesai ---")

In [None]:
# Summary statistics after restricting the listings to 'Kota Bandung'.
df_platform_a.describe()

In [None]:
# ---
# Clean up columns & re-diagnose the low outliers
# ---

columns_to_drop = ['date', 'validOn', 'validTo', 'Shape_Leng', 'Shape_Area']

# Besides the shapefile bookkeeping columns above, also drop the admin names
# 'ADM2_EN' (already known to be 'Kota Bandung') and 'ADM3_EN'; 'ADM4_EN' is
# kept. Only names actually present in the frame are dropped, in the order
# listed here.
drop_candidates = columns_to_drop + ['ADM2_EN', 'ADM3_EN']
existing_cols_to_drop = [
    name for name in drop_candidates if name in df_platform_a.columns
]

print(f"Kolom yang akan dibuang: {existing_cols_to_drop}")

df_platform_a = df_platform_a.drop(columns=existing_cols_to_drop)

print("Pembersihan kolom selesai.")
print("Kolom 'ADM4_EN' (Kecamatan) dipertahankan untuk analisis fitur.")


In [None]:
# Show the current low-bedroom outliers (listings inside Kota Bandung only)
print("\nMemulai Tugas 2: Mengisolasi outlier kamar tidur rendah (dari dalam Kota Bandung)...")

# Listings with fewer than MIN_BEDROOMS_NORMAL bedrooms in the filtered frame
MIN_BEDROOMS_NORMAL = 1
has_too_few_bedrooms = df_platform_a['bedrooms'].lt(MIN_BEDROOMS_NORMAL)
low_outliers_df = df_platform_a.loc[has_too_few_bedrooms].copy()

print(f"\nMenampilkan semua {len(low_outliers_df)} listing 'Outlier (Rendah)' di dalam Kota Bandung:")

# Print the full selection (row/column display limits were raised at the top)
review_columns = [
    'id',
    'source',
    'price',
    'master_address',
    'ADM4_EN',  # administrative-area name added by the spatial join
    'bedrooms',
    'bathrooms',
    'land_size_sqm',
    'building_size_sqm',
]
print(low_outliers_df[review_columns])

In [None]:
# Rich display of the same low-bedroom outliers, with every column.
low_outliers_df

**List of Anomalies (Post-Shapefile Filter)**  
This list summarizes the remaining data quality issues *after* filtering for listings inside **Kota Bandung**.  

---

**1. Clear Misclassifications (Hotels & Land)**  
The `property_type = 'Rumah'` classification is not perfect. Some listings are clearly not houses, often indicated by `bedrooms = 0`.  

**Examples:**  
- **Hotels:** Listing #5670 (`id = dijual-hotel-di-bandung...`) and #7201 (`id = hotel-bandung...`) are clearly hotels.  
- **Land:** Listing #4631 (`id = tanah-mainroad-setiabudi...`) is clearly land.  

---

**2. Commercial Properties (Shophouses/Business Use)**  
Many listings (also often with 0 bedrooms) have descriptions that strongly suggest they are for commercial, not residential, use.  

**Examples:**  
- Listing #1194: "House Can Be Used For Cafe Business..."  
- Listing #3578: "Residential House With Business Space..."  
- Listing #803: "House On Mainroad..." (usually commercial)  

---

**3. Logical Impossibilities (Size Typos)**  
The data contains "impossible" minimum values that are typos.  
*(Note: `building_size_sqm > land_size_sqm` is valid and indicates a multi-story home).*  

**The Real Impossibility:**  
- **Typos:** Listing #4563 shows `building_size_sqm = 1.00`, which is impossible.  
- **General:** Any listing with `land_size_sqm` or `building_size_sqm` under a "common sense" threshold (e.g., < 20 sqm) is highly suspect.  

---

**4. "0 Bedrooms AND 0 Bathrooms" Issue**  
A significant number of the `bedrooms = 0` listings also have `bathrooms = 0`.  

**Note:** Listings with 0 bedrooms **and** 0 bathrooms are almost certainly not residential houses and are strong candidates for re-classification or removal.  


In [None]:
# ---
# Clean Shapefile Columns
# ---

print("Membersihkan kolom-kolom shapefile yang tidak perlu...")

# Shapefile attributes that are of no use for modelling: bookkeeping fields,
# the PCODE/REF/ALT columns and the higher-level admin names. 'ADM4_EN' is
# deliberately absent from this list and therefore kept.
columns_to_drop = [
    'date', 'validOn', 'validTo', 'Shape_Leng', 'Shape_Area',
    'ADM4_PCODE', 'ADM4_REF', 'ADM4ALT1EN', 'ADM4ALT2EN',
    'ADM3_PCODE', 'ADM2_PCODE', 'ADM1_EN', 'ADM1_PCODE',
    'ADM0_EN', 'ADM0_PCODE',
]

# Only try to drop what is actually there
existing_cols_to_drop = []
for column_name in columns_to_drop:
    if column_name in df_platform_a.columns:
        existing_cols_to_drop.append(column_name)

if not existing_cols_to_drop:
    print("Kolom sudah dibersihkan pada langkah sebelumnya.")
else:
    print(f"Kolom yang akan dibuang: {existing_cols_to_drop}")
    df_platform_a = df_platform_a.drop(columns=existing_cols_to_drop)
    print("Pembersihan kolom selesai.")

print("Kolom 'ADM4_EN' (Kecamatan) dipertahankan untuk analisis fitur.")

In [None]:
# Diagnose the "0 Bed, 0 Bath" anomaly
# ---
print("Mengisolasi '0 Bedrooms AND 0 Bathrooms'...")

# Listings reporting zero bedrooms AND zero bathrooms
no_bedrooms = df_platform_a['bedrooms'].eq(0)
no_bathrooms = df_platform_a['bathrooms'].eq(0)
zero_bed_zero_bath_df = df_platform_a.loc[no_bedrooms & no_bathrooms].copy()

print(f"\nMenampilkan semua {len(zero_bed_zero_bath_df)} listings '0-Bed-0-Bath':")

# Compact view; the description is included as a hint about the property type
review_columns = [
    'id',
    'source',
    'price',
    'master_address',
    'ADM4_EN',  # administrative-area name kept from the spatial join
    'bedrooms',
    'bathrooms',
    'land_size_sqm',
    'building_size_sqm',
    'description',
]
print(zero_bed_zero_bath_df[review_columns])

The 21 listings with 0 bedrooms **and** 0 bathrooms make up 21 ÷ 6,009 × 100 ≈ **0.35%** of the data.  
For the house price prediction model, a "House" listing must have bedrooms and bathrooms.  
Listings with `bedrooms = 0` **and** `bathrooms = 0` are, by definition, not residential houses.  
They are pure noise and will damage the model.  


In [None]:
# ---
# Drop "0 Bed, 0 Bath" Anomaly
# ---
print("Menghapus 21 listing '0-Bed-0-Bath'...")

rows_before = len(df_platform_a)
print(f"Listings 'Rumah' (sebelum filter): {rows_before:,}")

# 1. Index labels of the listings with zero bedrooms and zero bathrooms
is_zero_bed_zero_bath = df_platform_a['bedrooms'].eq(0) & df_platform_a['bathrooms'].eq(0)
outlier_indices = df_platform_a.index[is_zero_bed_zero_bath]

print(f"Menemukan {len(outlier_indices)} listing '0-Bed-0-Bath' untuk dihapus.")

# 2. Remove them from df_platform_a
df_platform_a = df_platform_a.drop(index=outlier_indices)

rows_after = len(df_platform_a)
print(f"Listings 'Rumah' (setelah filter): {rows_after:,}")
print(f"Total {rows_before - rows_after} listing telah dihapus.")

print("\n--- Step Selesai ---")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
import numpy as np

# --- 1. "Common sense" size thresholds (sqm) ---
MIN_SIZE_THRESHOLD = 20    # anything smaller is flagged as an impossible value
MAX_SIZE_THRESHOLD = 2000  # start of the shaded zone in the diagnostic plot

# --- 2. Label every listing for colour-coding ---
print(f"Analyzing all {len(df_platform_a)} listings for size typos...")
print("Creating 'size_category' for visualization...")

def categorize_size(row):
    """
    Flag a listing whose land OR building size is below MIN_SIZE_THRESHOLD.

    Returns 'Impossible (Typo)' in that case and 'Normal' otherwise. A NaN
    size fails the comparison and so does not trigger the flag by itself.
    """
    is_too_small = (
        row['land_size_sqm'] < MIN_SIZE_THRESHOLD
        or row['building_size_sqm'] < MIN_SIZE_THRESHOLD
    )
    return 'Impossible (Typo)' if is_too_small else 'Normal'

# Applied row by row to the whole frame (temporary column)
df_platform_a['size_category'] = df_platform_a.apply(categorize_size, axis=1)

# The two counts add up to the number of listings analysed
print(df_platform_a['size_category'].value_counts())
print("\nDiagnosing size anomalies (land vs. building)...")

# --- 3. Land vs. building size scatter ---
fig, ax = plt.subplots(figsize=(10, 10))

sns.scatterplot(
    data=df_platform_a,
    x='land_size_sqm',
    y='building_size_sqm',
    hue='size_category',                                     # colour by flag
    palette={'Normal': 'blue', 'Impossible (Typo)': 'red'},  # flag -> colour
    alpha=0.6,
    zorder=2,
    ax=ax,
)

# Axis window: 0-5000 sqm on both axes
x_max_lim = 5000
y_max_lim = 5000
ax.set_xlim(0, x_max_lim)
ax.set_ylim(0, y_max_lim)

# --- Shaded "statistical outlier" zone (drawn behind everything else) ---
highlight_color = 'lightgreen'
highlight_alpha = 0.3
span_style = dict(color=highlight_color, alpha=highlight_alpha, zorder=-2)

# Land > MAX_SIZE_THRESHOLD
ax.axvspan(MAX_SIZE_THRESHOLD, x_max_lim, **span_style)
# Building > MAX_SIZE_THRESHOLD
ax.axhspan(MAX_SIZE_THRESHOLD, y_max_lim, **span_style)

# Dashed reference diagonal y = x, spanning the current axis limits
axis_limits = [*ax.get_xlim(), *ax.get_ylim()]
lims = [min(axis_limits), max(axis_limits)]
ax.plot(lims, lims, 'r--', alpha=0.75, zorder=1, label='y=x (Land = Building)')

ax.set_title('Diagnosis: All Size Outliers (Typos & Statistical)', fontsize=16)
ax.set_xlabel('Land Size (sqm)', fontsize=12)
ax.set_ylabel('Building Size (sqm)', fontsize=12)
ax.legend()
ax.grid(True, zorder=-1)

plt.show()

# --- 4. Inspect the flagged listings (the red dots) ---
print("\n" + "="*50)
print(f"Inspecting 'Impossible (Typo)' listings (< {MIN_SIZE_THRESHOLD} sqm):")

impossible_mins_df = df_platform_a[df_platform_a['size_category'] == 'Impossible (Typo)']

inspection_columns = ['id', 'master_address', 'price', 'land_size_sqm', 'building_size_sqm', 'bedrooms']
print(impossible_mins_df[inspection_columns].sort_values(by='building_size_sqm'))


# --- 5. Remove the temporary column so later steps see the original schema ---
df_platform_a = df_platform_a.drop(columns=['size_category'])
print("\nTemporary 'size_category' column removed.")

**Observation:**  
- The vast majority of our Rumah data is concentrated in the 0–2000 sqm range. The listings in the light-green grid (where `land_size_sqm > 2000` OR `building_size_sqm > 2000`) are the "Statistical Impossibilities" (outliers) we discussed. These are not houses; they are commercial properties, warehouses, or palaces that were misclassified.  
- Some listings are physically nonsensical: they represent data-entry errors, not real properties. We call these **Logical Impossibilities (Typos)**.  
These are listings whose `land_size_sqm` or `building_size_sqm` is at or near zero. (I use < 20 sqm as a "common sense" filter.)  
A 1 sqm, 5 sqm, or even 15 sqm property is not a house. 
Listing #4563 (with `building_size_sqm = 1.00`) is the perfect example.  
Such entries are invalid data points and must be removed.  

In [None]:
# --- 1. Smallest and largest "sensible" size (sqm) ---
MIN_SIZE_THRESHOLD = 20
MAX_SIZE_THRESHOLD = 2000

print(f"--- Step: Filtering for Sensible Sizes ---")
initial_count = len(df_platform_a)
print(f"Listings (sebelum filter): {initial_count}")

# --- 2. Rows to KEEP ---
# Both land and building size must lie in [MIN, MAX], bounds included.
# Rows with a missing size fail the test and are removed as well.
land_ok = df_platform_a['land_size_sqm'].between(MIN_SIZE_THRESHOLD, MAX_SIZE_THRESHOLD)
building_ok = df_platform_a['building_size_sqm'].between(MIN_SIZE_THRESHOLD, MAX_SIZE_THRESHOLD)
condition_keep = land_ok & building_ok

# --- 3. Apply the filter into a new frame ---
df_platform_a_cleaned = df_platform_a.loc[condition_keep].copy()

# --- 4. Report ---
final_count = len(df_platform_a_cleaned)
removed_count = initial_count - final_count

print(f"Listings (setelah filter): {final_count}")
print(f"Total {removed_count} listings (typos & outliers) telah dihapus.")

# --- 5. The cleaned frame becomes the main frame ---
df_platform_a = df_platform_a_cleaned

print("\nDataframe 'df_platform_a' telah diperbarui.")
print("--- Step Selesai ---")

In [None]:
# --- Step: Isolating 0-Bedroom Listings ---
print(f"Analyzing {len(df_platform_a)} listings (after size cleaning)...")

# --- 1. Minimum bedroom count considered normal ---
MIN_BEDROOMS_NORMAL = 1

# --- 2. Listings below that minimum (i.e. 0 bedrooms) ---
below_minimum = df_platform_a['bedrooms'].lt(MIN_BEDROOMS_NORMAL)
low_outliers_df = df_platform_a.loc[below_minimum].copy()

print(f"\nFound {len(low_outliers_df)} '0-Bedroom' (Outlier (Low)) listings.")

# --- 3. Show the key columns of these listings ---
print("Displaying details for these listings:")

detail_columns = [
    'id',
    'source',
    'price',
    'master_address',
    'ADM4_EN',
    'bedrooms',
    'bathrooms',
    'land_size_sqm',
    'building_size_sqm',
]
print(low_outliers_df[detail_columns])

print("\n'low_outliers_df' is ready for inspection.")
print("--- Step Selesai ---")

In [None]:
# Rich display of the remaining 0-bedroom listings, with every column.
low_outliers_df

**What's No Longer an Anomaly**  

**Size Typos: FIXED**  
All the `land_size_sqm` and `building_size_sqm` values in your sample (60, 198, 824, 89, etc.) are valid and fall within our sensible 20–2000 sqm range.  

**0-Bed/0-Bath: FIXED**  
All the listings in your sample have at least one bathroom.  

**Location: FIXED**  
We can see `ADM4_EN` names (administrative level 4, i.e. the kelurahan level) like Mekarjaya, Sukamaju, and Dago, confirming they are all inside Bandung.  


**Remaining Anomalies**  

We are left with listings that have 0 bedrooms but 1, 2, 3, or even 5 bathrooms.  
A residential house with 5 bathrooms and 0 bedrooms is a logical impossibility.  

The most likely explanation is that these are misclassified properties such as:  
- Ruko (Shophouse)  
- Kantor (Office)  
- Restoran (Restaurant)  
- Gudang (Warehouse)  

---

**Evidence**  

**Commercial Use**  
- Index 4370 (6.5 Billion IDR): The description explicitly says *"Rumah/ Ruang Usaha Mainroad Moch Toha"* (House / Commercial Space).  
- Index 803 (34.6 Billion IDR): The price is enormous for a 0-bed, 1-bath property. The description *"Rumah Mainroad Dago"* confirms it is a commercial-value property on a main road.  

**Logical Impossibility (likely commercial or error)**  
- Index 2843 (5 Billion IDR): Has 0 bedrooms and 5 bathrooms.  
- Index 4108 (1.4 Billion IDR): Also has 0 bedrooms and 5 bathrooms.  
- Index 6356 (2.75 Billion IDR): Also has 0 bedrooms and 5 bathrooms.  
- Index 570 (2.6 Billion IDR): Has 0 bedrooms and 4 bathrooms.  

**Likely Data Error**  
- Index 301 (690 Million IDR): Has 0 bedrooms and 3 bathrooms. At this price, it is most likely the agent simply forgot to enter the bedroom count.  

11 out of 5,954 is about $0.18\%$ of the total dataset (less than one-fifth of one percent), so I will remove these listings.


In [None]:
# Removal of 0-Bedroom Listings
MIN_BEDROOMS_NORMAL = 1
initial_count = len(df_platform_a)

print(f"Listings (sebelum filter): {initial_count}")

# Keep only listings with at least MIN_BEDROOMS_NORMAL bedrooms
# (rows with a missing bedroom count fail the test and are removed too)
condition_keep = df_platform_a['bedrooms'].ge(MIN_BEDROOMS_NORMAL)
df_platform_a = df_platform_a.loc[condition_keep].copy()

# Report how many rows the filter removed
final_count = len(df_platform_a)
removed_count = initial_count - final_count

print(f"Listings (setelah filter): {final_count}")
print(f"Total {removed_count} listing '0-Bedroom' telah dihapus.")
print("\nDataframe 'df_platform_a' telah diperbarui.")
print("--- Step Selesai ---")

In [None]:
# Summary statistics after the size and 0-bedroom filters.
df_platform_a.describe()

**Current Anomalies**

**1. Extreme Price Outliers**  
This is the most obvious issue.  

- 75% (Upper Quartile): 4.7 Billion IDR  
- Max: 950 Billion IDR  

A price of 950 Billion is not a house; it is a massive development project, a huge typo (extra zeros), or a commercial property. It will break any statistical model.  

---

**2. Extreme High Bedrooms/Bathrooms**  
These are not standard houses and are likely misclassified.  

- **Bedrooms:** The 75th percentile is 5, but the maximum is 60. A 60-bedroom "house" is a Kost (boarding house) or a small hotel.  
- **Bathrooms:** The 75th percentile is 3, but the maximum is 57. This is also not a residential home.  

---

**3. Zero Bathrooms**  
- Bathrooms: The minimum is 0.00.  
- We previously removed listings with 0 beds **and** 0 baths.  
- Now we have listings with 1+ beds but 0 baths.  

This is a strong indicator of a data entry error and is just as problematic as the 0-bedroom listings.  


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- 1. Define our thresholds ---
MAX_ROOM_THRESHOLD = 20 

# --- 2. Create the new column for coloring ---
print(f"Analyzing all {len(df_platform_a)} listings for room anomalies...")
print("Creating 'room_category' for visualization...")

def categorize_rooms(row, max_rooms=None):
    """Label one listing by how plausible its bedroom/bathroom counts are.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            'bedrooms' and 'bathrooms' entries.
        max_rooms: Largest bedroom or bathroom count still considered normal.
            When omitted, the notebook-level MAX_ROOM_THRESHOLD is used, so
            ``df.apply(categorize_rooms, axis=1)`` keeps working unchanged.

    Returns:
        'Impossible (Typo)' when the listing has fewer than one bathroom,
        'Outlier (High)' when either count exceeds ``max_rooms``,
        otherwise 'Normal'. Missing (NaN) counts fail both comparisons and
        therefore also end up as 'Normal'.
    """
    if max_rooms is None:
        # Fall back to the notebook-wide threshold (looked up at call time).
        max_rooms = MAX_ROOM_THRESHOLD

    # Priority 1: zero bathrooms is reported as a typo even if the other
    # count is also extreme.
    if row['bathrooms'] < 1:
        return 'Impossible (Typo)'
    # Priority 2: too many bedrooms or bathrooms.
    if row['bedrooms'] > max_rooms or row['bathrooms'] > max_rooms:
        return 'Outlier (High)'
    # Priority 3: everything else.
    return 'Normal'

# Apply this logic: one diagnostic label per listing (temporary column,
# dropped again at the end of this cell).
df_platform_a['room_category'] = df_platform_a.apply(categorize_rooms, axis=1)

print(df_platform_a['room_category'].value_counts())

# --- 3. Create Jittered Data for Plotting ---
# Many listings share the same (bedrooms, bathrooms) pair and would be drawn
# on top of each other, so a small Gaussian offset is added to each
# coordinate. A dedicated, seeded generator keeps the figure identical on
# every "Restart & Run All" instead of depending on the global NumPy RNG state.
JITTER_SEED = 42
jitter_rng = np.random.default_rng(JITTER_SEED)
jitter_size = len(df_platform_a)
x_jittered = df_platform_a['bedrooms'] + jitter_rng.normal(0, 0.25, size=jitter_size)
y_jittered = df_platform_a['bathrooms'] + jitter_rng.normal(0, 0.25, size=jitter_size)

# --- 4. Create the Plot ---
fig, ax = plt.subplots(figsize=(10, 10))

# One colour per diagnostic label stored in 'room_category'.
category_colors = {
    'Normal': 'orange',
    'Impossible (Typo)': 'red',
    'Outlier (High)': 'black',
}

sns.scatterplot(
    x=x_jittered,
    y=y_jittered,
    hue=df_platform_a['room_category'],
    palette=category_colors,
    alpha=0.6,
    zorder=2,
    ax=ax,
)

# Set plot limits (same range on both axes)
max_lim = 65
ax.set_xlim(-1, max_lim)
ax.set_ylim(-1, max_lim)

# --- Shade the outlier zone (counts above MAX_ROOM_THRESHOLD) ---
highlight_color = 'lightblue'
highlight_alpha = 0.3

# First the vertical band (bedrooms above the threshold), then the
# horizontal band (bathrooms above the threshold).
for draw_band in (ax.axvspan, ax.axhspan):
    draw_band(
        MAX_ROOM_THRESHOLD,
        max_lim,
        color=highlight_color,
        alpha=highlight_alpha,
        zorder=-2,
    )

# Red dashed reference line where bedrooms equal bathrooms
lims = [0, max_lim]
ax.plot(lims, lims, 'r--', alpha=0.75, zorder=1, label='y=x (Bedrooms = Bathrooms)')

ax.set_title('Diagnosis: Bedrooms vs Bathrooms (Black=High Outlier, Red=Typo)', fontsize=16)
ax.set_xlabel('Bedrooms (Jittered)', fontsize=12)
ax.set_ylabel('Bathrooms (Jittered)', fontsize=12)

ax.legend(loc='upper left')
ax.grid(True, zorder=-1)

plt.show()

# --- 5. Inspect the "Typos" (The Red Dots) ---
print("\n" + "=" * 50)
print("Inspecting 'Impossible (Typo)' listings (0 Bathrooms):")

# Rows that were labelled as typos (drawn in red in the plot)
is_typo = df_platform_a['room_category'].eq('Impossible (Typo)')
impossible_rooms_df = df_platform_a.loc[is_typo]

if impossible_rooms_df.empty:
    print("No 0-bathroom listings found!")
else:
    inspect_columns = [
        'id', 'price', 'bedrooms', 'bathrooms', 'land_size_sqm', 'building_size_sqm'
    ]
    print(impossible_rooms_df[inspect_columns].sort_values(by='bedrooms'))

# Clean up: the label column was only needed for the diagnosis above
df_platform_a = df_platform_a.drop(columns='room_category')
print("\nTemporary 'room_category' column removed.")

**Graph Analysis (The "Map")**  

**Red Dots (0-Bathroom Error):**  
A row of red dots along y=0 confirms listings with valid bedroom counts (2, 3, 4, even 19) but zero bathrooms.  

9 Typos (0 Bathrooms)

**Black Dots (High Outliers):**  
- Properties with more than 20 bedrooms or bathrooms.  
- One extreme outlier has about 60 bedrooms.  
- Another cluster has 20–40 bedrooms.  

These are clearly non-residential (Kost, hotels) and should be removed: 50 High Outliers (Commercial/Kost).  
Together with the 9 typos above, we are targeting exactly 59 listings for removal.

**Big Picture:**  
Most of the data (the orange cloud) looks normal. It clusters in the bottom-left, showing that most houses in Bandung have 1–5 bedrooms and 1–4 bathrooms. This is the expected pattern.  
5943 - 59 = 5884 remaining listings (to be confirmed against the count printed by the filter cell below).


One listing deserves a special mention. It visually sticks out in an unexpected way: it appears to have only 1-2 bedrooms but about 21 bathrooms, which is a very clear anomaly for a residential home. If its bathroom count is indeed 21, it exceeds the 20-room threshold, so it is already classified as "Outlier (High)" and will be removed by the same threshold filter; the next cell verifies this.

In [None]:
# --- Consolidated Fix: Cleaning Room Anomalies ---
# Drops listings with fewer than MIN_BATHROOMS bathrooms or with more than
# MAX_ROOM_THRESHOLD bedrooms/bathrooms, reports why rows were dropped, and
# then rebinds 'df_platform_a' to the cleaned frame.
print("--- Step: Finalizing Room Anomalies (Typos & High Outliers) ---")

# 1. Capture the starting state
initial_count = len(df_platform_a)
print(f"Listings start: {initial_count}")

# 2. Define Thresholds
MIN_BATHROOMS = 1        # Must have at least 1 bathroom
MAX_ROOM_THRESHOLD = 20  # Max 20 beds or baths

# 3. Build the Filter
bedrooms = df_platform_a['bedrooms']
bathrooms = df_platform_a['bathrooms']

# We KEEP rows that meet ALL these criteria. Comparisons against a missing
# value are False, so rows with a missing count are dropped as well.
condition_keep = (
    (bathrooms >= MIN_BATHROOMS) &
    (bedrooms <= MAX_ROOM_THRESHOLD) &
    (bathrooms <= MAX_ROOM_THRESHOLD)
)

# Mutually exclusive removal reasons, so the breakdown adds up to the total.
is_missing = bedrooms.isna() | bathrooms.isna()
is_typo = ~is_missing & (bathrooms < MIN_BATHROOMS)
is_high = ~is_missing & ~is_typo & (
    (bedrooms > MAX_ROOM_THRESHOLD) | (bathrooms > MAX_ROOM_THRESHOLD)
)
removal_breakdown = {
    f"fewer than {MIN_BATHROOMS} bathroom(s)": int(is_typo.sum()),
    f"more than {MAX_ROOM_THRESHOLD} bedrooms or bathrooms": int(is_high.sum()),
    "missing bedroom/bathroom count": int(is_missing.sum()),
}

# Was the 21-bathroom listing present BEFORE filtering? Without this the
# verification below would report a successful removal even if the listing
# never existed in the data.
suspect_count_before = int((bathrooms == 21).sum())

# 4. Execute the Drop
df_platform_a_cleaned = df_platform_a[condition_keep].copy()

# 5. Calculate what was removed
final_count = len(df_platform_a_cleaned)
removed_count = initial_count - final_count

print(f"Listings end  : {final_count}")
print(f"Total removed : {removed_count}")
for reason, reason_count in removal_breakdown.items():
    print(f"   - {reason}: {reason_count}")

# 6. Verify ID 1788 (The 21-bathroom house)
# NOTE(review): the listing is identified by its bathroom count, not by an id
# lookup; "ID 1788" is taken from the original messages - confirm it is the
# same listing.
check_1788 = df_platform_a_cleaned[df_platform_a_cleaned['bathrooms'] == 21]
if suspect_count_before == 0:
    print("\nNote: no listing with 21 bathrooms was present before filtering; nothing to verify.")
elif check_1788.empty:
    print("\nVerification: Listing with 21 bathrooms (ID 1788) was successfully removed by the threshold filter.")
else:
    print("\nWarning: ID 1788 still exists!")

# 7. Overwrite the main dataframe
df_platform_a = df_platform_a_cleaned
print("--- Step Selesai ---")

In [None]:
# Summary statistics after the bedroom/bathroom filter; the remaining
# anomalies are read off this table.
df_platform_a.describe()

**Remaining Anomalies Are Entirely in the Price**  

**1. The "Impossible" Maximum (950 Billion IDR)**  
- **Stat:** Max Price = 950,000,000,000  
- **Logic:** A price of 950 Billion is ~200 times higher than the "expensive" houses in the dataset (75th percentile is 4.6 Billion).  
- **Diagnosis:** This is likely a massive development project (selling 100 units at once), a large commercial complex, or a typo (extra zeros). It distorts the Mean and Standard Deviation.  

---

**2. The "Rental" Minimum (45 Million IDR)**  
- **Stat:** Min Price = 45,000,000  
- **Logic:** You cannot buy a house in Bandung for 45 Million. Even subsidized housing (*Rumah Subsidi*) is usually >150 Million.  
- **Diagnosis:** This is almost certainly a rental listing (per year) misclassified as a sale, or possibly just a small plot of land.  


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step: Price Diagnosis (Red, Yellow, Green) ---
# 1. Define Thresholds (IDR)
MIN_PRICE_THRESHOLD = 150 * 10**6   # 150 Juta: anything cheaper is flagged as a likely typo/rental
MAX_PRICE_THRESHOLD = 100 * 10**9   # 100 Milyar: anything pricier is flagged as a high outlier

listing_total = len(df_platform_a)
print(f"Applying Price Logic to {listing_total} listings...")

# 2. Create Categories for Coloring
def categorize_price(price, min_price=None, max_price=None):
    """Label a listing price as too low, too high, or normal.

    Args:
        price: Listing price in IDR.
        min_price: Prices strictly below this are 'Impossible (Low)'.
            Defaults to the notebook-level MIN_PRICE_THRESHOLD.
        max_price: Prices strictly above this are 'Outlier (High)'.
            Defaults to the notebook-level MAX_PRICE_THRESHOLD.

    Returns:
        'Impossible (Low)', 'Outlier (High)' or 'Normal'. Prices equal to a
        threshold are 'Normal'. A missing (NaN) price fails both comparisons
        and is therefore also labelled 'Normal'.
    """
    # Fall back to the notebook-wide thresholds (looked up at call time), so
    # ``series.apply(categorize_price)`` keeps working unchanged.
    if min_price is None:
        min_price = MIN_PRICE_THRESHOLD
    if max_price is None:
        max_price = MAX_PRICE_THRESHOLD

    if price < min_price:
        return 'Impossible (Low)'
    if price > max_price:
        return 'Outlier (High)'
    return 'Normal'

# Temporary label column used for colouring (dropped at the end of this cell).
df_platform_a['price_category'] = df_platform_a['price'].map(categorize_price)

print("\nCounts by Category:")
print(df_platform_a['price_category'].value_counts())

# 3. Visualization
# Price expressed in billions of IDR for a readable Y-axis (temporary column).
df_platform_a['price_milyar'] = df_platform_a['price'] / 1_000_000_000

fig, ax = plt.subplots(figsize=(12, 8))

price_palette = {
    'Normal': 'green',
    'Outlier (High)': 'red',
    'Impossible (Low)': 'yellow',
}

sns.scatterplot(
    data=df_platform_a,
    x='land_size_sqm',
    y='price_milyar',
    hue='price_category',
    palette=price_palette,
    alpha=0.7,
    s=60,
    edgecolor='black',  # black outline keeps the yellow dots visible
    linewidth=0.5,
    ax=ax,
)

ax.set_title('Price Diagnosis: Red (>100B), Yellow (<150M), Green (Normal)', fontsize=16)
ax.set_xlabel('Land Size (sqm)', fontsize=12)
ax.set_ylabel('Price (Billions IDR)', fontsize=12)
ax.grid(True, alpha=0.3)

# Plain tick labels on the Y-axis (no scientific notation)
ax.ticklabel_format(style='plain', axis='y')

fig.tight_layout()
plt.show()

# 4. Inspecting the "Yellow" (Low) Listings
print("\n" + "=" * 50)
print("INSPECTION: Impossible (Low) Listings (< 150 Juta)")

is_low = df_platform_a['price_category'].eq('Impossible (Low)')
low_outliers = df_platform_a.loc[is_low]

if low_outliers.empty:
    print("No listings found below 150 Million IDR.")
else:
    inspect_columns = ['id', 'price', 'land_size_sqm', 'master_address']
    print(low_outliers[inspect_columns].sort_values(by='price'))

# Clean up: both helper columns were only needed for this diagnosis
df_platform_a = df_platform_a.drop(columns=['price_milyar', 'price_category'])
print("\n--- Step Selesai ---")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step: Mapping the "Normal Zone" (Zoomed In) ---
print("Zooming in on the market (filtering out > 100 Billion temporarily)...")

# 1. Define Thresholds (IDR)
MAX_PRICE_VIEW = 100 * 10**9   # 100 Milyar: ceiling for this zoomed view
MIN_PRICE_TYPO = 150 * 10**6   # 150 Juta: floor below which a price is treated as a typo

# 2. Temporary subset for visualization ONLY: everything priced at or below
#    the ceiling. 'df_platform_a' itself is left untouched.
within_view = df_platform_a['price'].le(MAX_PRICE_VIEW)
df_zoom = df_platform_a.loc[within_view].copy()

# 3. Define categories for this zoomed view
#    The high-price group is excluded from this view, so only 'Normal' and
#    'Impossible (Low)' remain.
def categorize_zoomed(price, min_price=None):
    """Label a price in the zoomed view as too low or normal.

    Args:
        price: Listing price in IDR.
        min_price: Prices strictly below this are 'Impossible (Low)'.
            Defaults to the notebook-level MIN_PRICE_TYPO.

    Returns:
        'Impossible (Low)' or 'Normal'. A missing (NaN) price fails the
        comparison and is therefore labelled 'Normal'.
    """
    if min_price is None:
        # Fall back to the notebook-wide floor (looked up at call time), so
        # ``series.apply(categorize_zoomed)`` keeps working unchanged.
        min_price = MIN_PRICE_TYPO

    if price < min_price:
        return 'Impossible (Low)'
    return 'Normal'

# Label column on the temporary zoom frame only.
df_zoom['price_category'] = df_zoom['price'].map(categorize_zoomed)

hidden_total = len(df_platform_a) - len(df_zoom)
print(f"Showing {len(df_zoom)} listings (Hidden {hidden_total} extreme outliers).")

# 4. Visualization
# Price expressed in billions of IDR for the Y-axis.
df_zoom['price_milyar'] = df_zoom['price'] / 1_000_000_000

fig, ax = plt.subplots(figsize=(12, 8))

zoom_palette = {
    'Normal': 'green',
    'Impossible (Low)': 'yellow',
}

sns.scatterplot(
    data=df_zoom,
    x='land_size_sqm',
    y='price_milyar',
    hue='price_category',
    palette=zoom_palette,
    alpha=0.5,  # lower alpha so the density of the "Normal" cloud stays visible
    s=50,
    edgecolor='black',
    linewidth=0.3,
    ax=ax,
)

ax.set_title('The "Real" Market Shape (Zoomed: Price <= 100 Billion)', fontsize=16)
ax.set_xlabel('Land Size (sqm)', fontsize=12)
ax.set_ylabel('Price (Billions IDR)', fontsize=12)
ax.grid(True, alpha=0.3)

# Plain tick labels on the Y-axis (no scientific notation)
ax.ticklabel_format(style='plain', axis='y')

fig.tight_layout()
plt.show()

print("--- Zoomed Map Generated ---")

In [None]:
# --- Step: Removing Price Anomalies (Inplace) ---
print("--- Step: Cleaning Price Outliers & Typos ---")

# 1. Define the Thresholds (IDR); prices outside [MIN, MAX] are removed
MIN_PRICE_TYPO = 150_000_000        # 150 Juta floor
MAX_PRICE_OUTLIER = 100_000_000_000 # 100 Milyar ceiling

initial_count = df_platform_a.shape[0]
print(f"Listings (sebelum filter): {initial_count}")

# 2. Create the filter condition
# Rows to KEEP: price inside the closed interval [MIN_PRICE_TYPO, MAX_PRICE_OUTLIER].
# A missing price is not inside the interval, so such rows are dropped too.
condition_keep = df_platform_a['price'].between(MIN_PRICE_TYPO, MAX_PRICE_OUTLIER)

# 3. Apply the filter (rebinds df_platform_a to an independent copy)
df_platform_a = df_platform_a.loc[condition_keep].copy()

# 4. Report results
final_count = df_platform_a.shape[0]
removed_count = initial_count - final_count

for report_line in (
    f"Listings (setelah filter): {final_count}",
    f"Total {removed_count} listings removed.",
    f"   (Removed: Prices < {MIN_PRICE_TYPO:,.0f} or > {MAX_PRICE_OUTLIER:,.0f})",
    "\nDataframe 'df_platform_a' updated.",
    "--- Step Selesai ---",
):
    print(report_line)

In [None]:
# Summary statistics after the price filter, as a final check of the value ranges.
df_platform_a.describe()

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

print("--- Plotting Final Verification Map (Price Heatmap) ---")

# 1. Load the Shapefile
# Built from PROJECT_ROOT with pathlib so the path resolves on every OS;
# a raw "..\data\raw\..." string only works on Windows.
shapefile_path = PROJECT_ROOT / "data" / "raw" / "idn_admbnda_adm4_ID3_bps_20200401.shp"
print("Loading shapefile...")

# The whole plot is best-effort: any failure (missing shapefile, missing
# column, plotting error) is reported and the notebook continues.
try:
    gdf_adm = gpd.read_file(shapefile_path)

    # Filter for Kota Bandung
    bandung_gdf = gdf_adm[gdf_adm['ADM2_EN'] == 'Kota Bandung'].copy()
    bandung_gdf = bandung_gdf.to_crs("EPSG:4326") # Ensure Standard Lat/Lon
    print("Bandung map boundaries loaded.")

    # 2. Convert clean 'df_platform_a' to GeoDataFrame
    #    (points built from the longitude/latitude columns, same CRS as the map)
    gdf_listings_filtered = gpd.GeoDataFrame(
        df_platform_a,
        geometry=gpd.points_from_xy(df_platform_a.longitude, df_platform_a.latitude),
        crs="EPSG:4326"
    )
    print(f"Loaded {len(gdf_listings_filtered)} listings for plotting.")

    # 3. Plot the Map
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_aspect('equal')

    # A. Plot Base Map (Grey Background)
    bandung_gdf.plot(
        ax=ax,
        edgecolor='black',
        facecolor='#dddddd',
        label='Kota Bandung Boundary',
        zorder=1
    )

    # B. Plot Listings (Colored by Price)
    gdf_listings_filtered.plot(
        ax=ax,
        column='price',        # Use price for color
        cmap='viridis_r',      # Reversed Viridis: low prices yellow, high prices dark purple
        markersize=15,         # Size of dots
        alpha=0.7,             # Transparency
        legend=True,
        legend_kwds={
            'label': "Price (IDR)",
            'shrink': 0.6,
            'format': "%.0e"   # Scientific notation for cleaner legend
        },
        vmax=15_000_000_000,   # CAP visual scale at 15 Billion so normal houses show variation
        zorder=2
    )

    # C. Set Verification Limits (longitude / latitude window of the view)
    ax.set_xlim(107.55, 107.74)
    ax.set_ylim(-6.98, -6.83)

    # D. Formatting
    ax.set_title(f'Verification: {len(gdf_listings_filtered)} Clean Listings (Colored by Price)', fontsize=16)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    ax.grid(True, linestyle='--', alpha=0.5)

    fig.savefig('kota_bandung_clean_price_map.png')
    print("Map generation complete. Saved as 'kota_bandung_clean_price_map.png'")
    plt.show()

except Exception as e:
    print(f"An error occurred: {e}")

We are done with Data Cleaning. We are ready for Feature Engineering (e.g., Price per Square Meter) or Exploratory Analysis.

In [None]:
import os

# --- Final Step: Saving to 'data/processed' ---

# 1. Define the directory and filename
# Reuse PROCESSED_DIR (defined in Step 1) instead of a hard-coded
# "..\data\processed" string, which only resolves correctly on Windows.
save_dir = PROCESSED_DIR
filename = "df_platform_a_bandung_cleaned.csv"
# NOTE(review): the notebook header and MODEL_READY_PATH (Step 1) both name
# 'bandung_housing_MODEL_READY.csv' as this notebook's output, but this cell
# writes a different file. Confirm which name downstream notebooks expect.

# 2. Create the full path
full_path = save_dir / filename

print(f"Saving {len(df_platform_a)} cleaned listings to:\n{full_path}...")

# 3. Save (Index=False)
# Ensure the directory exists, just in case
save_dir.mkdir(parents=True, exist_ok=True)
df_platform_a.to_csv(full_path, index=False)

print("✅ Save Complete.")
print("-" * 30)
print(f"Final Data Shape: {df_platform_a.shape}")
print("-" * 30)