In [None]:
### NOTEBOOK 3: 03_deduplicate.ipynb ###
#
# GOAL: Memuat file master yang sudah bersih dan menerapkan
#       deduplikasi 3-tahap (ID, Price/Address, Geospatial)
#       untuk membuat dataset final.
#
# INPUT: master_cleaned_features.csv
# OUTPUT: bandung_housing_FINAL.csv
#

import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
from tqdm import tqdm
import warnings

# Geospatial
import geopandas as gpd
from shapely.geometry import Point
from sklearn.cluster import DBSCAN

# Notebook-wide setup: silence all Python warnings, switch off pandas'
# chained-assignment check, and enable tqdm's pandas integration.
warnings.simplefilter('ignore')
pd.set_option('mode.chained_assignment', None)
tqdm.pandas()

print("--- 03_deduplicate.ipynb ---")

In [None]:
# ---
# ## Step 1: File Paths
# ---
# Every location below is derived from PROJECT_ROOT, which is the parent of
# the current working directory.

print("Step 1: Setting up file paths...")

# Directory layout
PROJECT_ROOT = Path(r"..")
DATA_DIR = PROJECT_ROOT.joinpath("data")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
NOTEBOOK_DIR = PROJECT_ROOT.joinpath("notebooks")  # where this notebook lives

# Input: the cleaned master file (failsafe output of Notebook 2)
CLEANED_MASTER_PATH = PROCESSED_DIR.joinpath("master_cleaned_features.csv")

# Output: the final, deduplicated dataset
FINAL_OUTPUT_PATH = PROCESSED_DIR.joinpath("bandung_housing_FINAL.csv")

print(f"Input: {CLEANED_MASTER_PATH}")
print(f"Output: {FINAL_OUTPUT_PATH}\n")

In [None]:
# ---
# ## Step 2: Load Cleaned Master File
# ---
# Reads the cleaned master CSV into `df_master`. If loading fails, a short
# hint is printed and the original exception is re-raised, so the cell fails
# visibly with a full traceback. (A bare sys.exit() would report exit status 0,
# i.e. success, when this notebook is run as a script, and hides the traceback.)

print("Step 2: Loading Cleaned Master File...")

try:
    # Read these columns as strings instead of letting pandas infer a type
    dtypes = {'zipcode': 'str', 'geo_confidence': 'str'}
    df_master = pd.read_csv(CLEANED_MASTER_PATH, dtype=dtypes)
except FileNotFoundError:
    print(f"❌ ERROR: File not found at {CLEANED_MASTER_PATH}. Aborting.")
    print("Pastikan Notebook 02 telah berhasil dijalankan.")
    raise
except Exception as e:
    print(f"❌ ERROR loading {CLEANED_MASTER_PATH}: {e}")
    raise
else:
    print(f"Loaded {len(df_master)} records from {CLEANED_MASTER_PATH}\n")

Data Cleaning

The next code cell is a 3‑stage "filter" that removes duplicates, with each stage becoming stricter.

Stage 1: Duplicates by ID (id)  
The most basic cleaning. The code looks for rows with the exact same id and removes all duplicates, keeping only the first (keep='first').  

Why: This captures the most obvious duplicates, where the same listing may have been scraped more than once, or recreated during the merging process.  

---

Stage 2: Duplicates by Price & Address (price, master_address)  
It looks for rows that have the EXACT SAME price AND the EXACT SAME master_address.  

Why: This captures "lazy duplicates." Imagine an agent posting the same ad on different property sites. Stage 1 would not catch this (because the ids differ), but Stage 2 does. It is a crucial step to ensure that every unique property is treated as a single data point. 

---

Stage 3: Geospatial Deduplication (SMART Logic)  
This is the most complex and important stage. It may still remove a large number of listings, but it is a necessary step to make the data usable for analysis.

1. Data Preparation  
The code splits the data into two groups:  
- Clusterable listings: rows with a valid latitude and longitude.  
- Non‑clusterable listings: rows whose coordinates are missing or not numeric.  

The non‑clusterable listings are set aside and included in the final result, so none of them is lost in this stage.  

2. Clustering (DBSCAN)  
The code uses an algorithm called DBSCAN. Around each data point, it puts a circle with a radius of 100 meters (eps=100).  

- If at least 2 listings (min_samples=2) fall within each other’s circle, they are grouped together and assigned a cluster ID (e.g., cluster 5).  
- If a listing stands alone (no other listing within 100m), it is labeled "noise" (cluster = -1).  

3. Multi-Factor Duplicate Check   
This is where listings are actually removed. Within each cluster:  
- A listing is removed only if it matches another listing in the same cluster on all key features.  
- A "true duplicate" means:  
  - Same `price`  
  - AND same `land_size_sqm`  
  - AND same `building_size_sqm`

I did this to remove only *true duplicates* while keeping valid, unique properties.  
Many listings can appear close together on the map, but that doesn’t always mean they are the same house or apartment. If we treated all clustered ads as duplicates, we would risk deleting real properties in the same building or complex.  

By checking for the same `price`, `land_size_sqm`, and `building_size_sqm`, the filter only removes ads that are truly identical. This avoids bias in the dataset, where one property could be counted too many times or valid properties could be lost.  

This logic is not always “good.” It is a trade-off:  
- **Good** when clusters contain repeated ads for the same property, since it reduces spam and improves accuracy.  
- **Risky** if the chosen features miss small differences between properties, which could let some duplicates stay or wrongly remove rare cases.  

4. Merging Back  
The final dataset is a combination of:  
- All "noise" listings (unique, stand‑alone properties).  
- All "non‑clusterable" listings (the "bad" data without coordinates) that were saved earlier.  
- From within the clusters, all unique properties are kept. Only the true identical‑feature duplicates are removed.  

In [None]:
# ---
# ## Step 3: 3-Stage Deduplication
# ---

# Section banner for the notebook output
for banner_line in ("---", "## Step 3: Running 3-Stage Deduplication", "---"):
    print(banner_line)

def geospatial_deduplication(df, cluster_radius_m=100):
    """
    Remove listings that are close together AND identical on key features.

    Listings with usable coordinates are grouped with DBSCAN (min_samples=2),
    with distances measured on EPSG:3857 coordinates. Inside each cluster a
    listing is dropped only when an earlier listing of the same cluster has
    exactly the same 'price', 'land_size_sqm' and 'building_size_sqm'.
    Nearby listings that differ on any of these (for example several units in
    one apartment building) are all kept.

    A row is never dropped when it
      * has a missing or non-numeric latitude/longitude (cannot be clustered),
      * is DBSCAN noise (no other listing within the radius), or
      * has a missing or non-numeric value in any comparison column; a missing
        value is not treated as evidence that two listings are identical.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude', 'longitude', 'price', 'land_size_sqm' and
        'building_size_sqm'. It is not modified.
    cluster_radius_m : float, default 100
        DBSCAN `eps`, expressed in EPSG:3857 units.
        NOTE(review): Web Mercator units match true metres only at the
        equator and are stretched by roughly 1/cos(latitude) elsewhere.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of `df`, in their original order, with the original
        columns and values and a fresh 0..n-1 index. If no row has usable
        coordinates, `df` itself is returned unchanged.
    """
    print(f"Starting SMART geospatial deduplication with {len(df)} listings...")

    # Features that must ALL match exactly for two listings to count as duplicates
    duplicate_check_cols = ['price', 'land_size_sqm', 'building_size_sqm']

    # 1. Coerce the needed columns into separate objects. `df` is left
    #    untouched, so the returned rows carry their original values and no
    #    helper column leaks into the output.
    latitude = pd.to_numeric(df['latitude'], errors='coerce')
    longitude = pd.to_numeric(df['longitude'], errors='coerce')
    features = df[duplicate_check_cols].apply(pd.to_numeric, errors='coerce')

    # Positional mask (numpy) so the logic also works with a non-unique index
    has_coords = (latitude.notna() & longitude.notna()).to_numpy()

    if not has_coords.any():
        print("No data with valid coordinates to cluster.")
        return df

    # 2. Build point geometries for the rows with coordinates and project
    #    them from lon/lat (EPSG:4326) to EPSG:3857 for DBSCAN.
    clusterable_gdf = gpd.GeoDataFrame(
        features[has_coords].reset_index(drop=True),
        geometry=gpd.points_from_xy(
            longitude[has_coords].to_numpy(dtype='float64'),
            latitude[has_coords].to_numpy(dtype='float64'),
        ),
        crs="EPSG:4326",
    ).to_crs("EPSG:3857")

    # 3. Run DBSCAN; label -1 marks noise (a listing with no neighbour in range)
    coords = np.column_stack([clusterable_gdf.geometry.x, clusterable_gdf.geometry.y])
    db = DBSCAN(eps=cluster_radius_m, min_samples=2, metric='euclidean').fit(coords)
    clusterable_gdf['cluster'] = db.labels_

    print(f"Found {len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)} geospatial clusters.")

    # 4. Flag duplicates: same cluster AND same price / land size / building size.
    #    Only rows that belong to a cluster and have every comparison value
    #    present are eligible; the first occurrence in original order is kept.
    in_cluster = clusterable_gdf['cluster'] != -1
    fully_known = clusterable_gdf[duplicate_check_cols].notna().all(axis=1)
    comparable = (in_cluster & fully_known).to_numpy()

    is_duplicate = np.zeros(len(clusterable_gdf), dtype=bool)
    is_duplicate[comparable] = (
        clusterable_gdf.loc[comparable, ['cluster'] + duplicate_check_cols]
        .duplicated(keep='first')
        .to_numpy()
    )

    # 5. Map the flags back to positions in `df` and drop only those rows.
    #    Everything else (noise, rows without coordinates, unique listings in
    #    clusters) is returned in its original order.
    drop_mask = np.zeros(len(df), dtype=bool)
    drop_mask[np.flatnonzero(has_coords)[is_duplicate]] = True
    final_df = df[~drop_mask].reset_index(drop=True)

    print(f"SMART geospatial deduplication removed {len(df) - len(final_df)} listings.")
    return final_df

# ========================================================================
# ---                   END OF THE CORRECTED FUNCTION                  ---
# ========================================================================


# --- Run the 3-stage deduplication ---

def _drop_key_duplicates(frame, key_cols):
    """
    Return `frame` without rows that repeat an earlier row's `key_cols`.

    The first occurrence of each key is kept. Rows with a missing value in any
    key column are always kept: pandas' duplicate detection treats missing
    values as equal to each other, which would otherwise collapse every
    listing that lacks the key into a single row.
    """
    has_full_key = frame[key_cols].notna().all(axis=1)
    repeats_earlier_row = frame.duplicated(subset=key_cols, keep='first')
    return frame[~(has_full_key & repeats_earlier_row)]


# Stage A: deduplicate on the unique 'id'
print("Running Stage A: Deduplicate by 'id'...")
count_a = len(df_master)
df_master = _drop_key_duplicates(df_master, ['id'])
print(f"Removed {count_a - len(df_master)} duplicates by 'id'.\n")

# Stage B: deduplicate on 'price' and 'master_address'
print("Running Stage B: Deduplicate by 'price' & 'master_address'...")
count_b = len(df_master)
df_master = _drop_key_duplicates(df_master, ['price', 'master_address'])
print(f"Removed {count_b - len(df_master)} duplicates by 'price'/'master_address'.\n")

# Stage C: geospatial deduplication (SMART logic)
print("Running Stage C: Geospatial Deduplication (SMART Logic)...")
df_master = geospatial_deduplication(df_master, cluster_radius_m=100)  # 100 m radius
print("SMART Geospatial deduplication complete.\n")

print(f"Total listings remaining after 3-stage deduplication: {len(df_master)}")

In [None]:
# ---
# ## Step 4: Save Final File
# ---
# Writes the deduplicated listings to FINAL_OUTPUT_PATH. A failed write is
# reported and then re-raised, so the notebook cannot finish "successfully"
# without having produced its output file.

print("---")
print("## Step 4: Saving Final File")
print("---")

# Final column order
final_columns = [
    'id', 'source', 'price', 'master_address', 'latitude', 'longitude', 'zipcode',
    'bedrooms', 'bathrooms', 'land_size_sqm', 'building_size_sqm',
    'description', 'specs', 'url', 'scraped_at', 'geo_confidence'
]

# Keep only the expected columns that are present, and say which are not
final_columns_exist = [col for col in final_columns if col in df_master.columns]
missing_columns = [col for col in final_columns if col not in df_master.columns]
if missing_columns:
    print(f"⚠️ WARNING: expected columns not found and skipped: {missing_columns}")
df_final = df_master[final_columns_exist]

try:
    # Make sure the target directory exists before writing
    FINAL_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Save under the original output file name
    df_final.to_csv(FINAL_OUTPUT_PATH, index=False, encoding='utf-8-sig')
except Exception as e:
    print(f"❌ ERROR: Failed to save final file. {e}")
    raise
else:
    print(f"\n✅✅✅ 03_deduplicate.ipynb COMPLETE! ✅✅✅")
    print(f"Final dataset saved to:")
    print(f"{FINAL_OUTPUT_PATH}")
    print(f"\nTotal listings in final file: {len(df_final)}")