In [1]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed data from Part 1.
# Only a missing file means "Part 1 has not been run"; any other failure
# (corrupt pickle, incompatible versions, ...) is left to surface with its
# own traceback instead of being mislabelled.
try:
    df_clustering = pd.read_pickle('../DataSets/clustering_results_part1.pkl')
except FileNotFoundError as err:
    # Raise rather than call exit(): execution halts at this cell and the
    # reason is visible in the cell output.
    raise FileNotFoundError(
        "Please run Part 1 first to generate the preprocessed data"
    ) from err

# Drop a pre-existing 'cluster_10' column if present
# (errors='ignore' makes this a no-op when the column is absent).
df_clustering = df_clustering.drop(['cluster_10'], axis=1, errors='ignore')
print("Loaded preprocessed data from Part 1")

print(f"Dataset shape: {df_clustering.shape}")

In [None]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "DBSCAN CLUSTERING ANALYSIS", "=" * 60, sep="\n")

# Announce the three inputs the clustering is restricted to.
print(
    "\nAs requested, using only 3 features for DBSCAN:",
    "  • utm_x (Geographic X coordinate)",
    "  • utm_y (Geographic Y coordinate)",
    "  • transformable_price (Price)",
    sep="\n",
)

# Work on an independent copy holding only the UTM coordinates and the price,
# so later column additions do not touch df_clustering.
dbscan_features = ['utm_x', 'utm_y', 'transformable_price']
df_dbscan = df_clustering.loc[:, dbscan_features].copy()

print(f"\nDBSCAN dataset shape: {df_dbscan.shape}")
print("Features summary:")
print(df_dbscan.describe())

In [None]:

# Standardize features for DBSCAN.
# Select the feature columns explicitly: a later cell adds a 'dbscan_cluster'
# column to df_dbscan, and re-running this cell on the whole frame would then
# scale (and cluster on) the labels as if they were a fourth feature.
scaler_dbscan = StandardScaler()
X_dbscan_scaled = scaler_dbscan.fit_transform(df_dbscan[dbscan_features])

print(f"\nData standardized for DBSCAN clustering")


In [None]:

# =====================================================
# HYPERPARAMETER GRID SEARCH FOR 3 CLUSTERS
# =====================================================
# Exhaustively tries every (eps, min_samples) pair, keeps only runs that give
# exactly 3 clusters with < 30% noise, and ranks them by silhouette score
# computed on a fixed random subsample of the clustered (non-noise) points.
# Outputs used by later cells: best_score, best_params, best_n_clusters,
# best_labels, results_log, eps_values, min_samples_values, sample_indices.

print(f"\n" + "="*50)
print("SEARCHING FOR OPTIMAL DBSCAN PARAMETERS")
print("="*50)

print(f"Goal: Find parameters that produce exactly 3 meaningful clusters")
print(f"Constraints: Less than 30% noise points")

# Define parameter ranges.
# np.arange accumulates float error (0.30000000000000004, ...); rounding keeps
# the grid values, and therefore the reported best eps, clean.
eps_values = np.round(np.arange(0.1, 2.0, 0.1), 1)
min_samples_values = range(5, 51, 5)

best_score = -1
best_params = None
best_n_clusters = None
best_labels = None
results_log = []

# For memory efficiency with large datasets, silhouette is evaluated on a
# subsample. The generator is seeded so the same rows are drawn on every run
# and the selected parameters are reproducible.
max_sample_silhouette = min(1000, len(X_dbscan_scaled))
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(len(X_dbscan_scaled), max_sample_silhouette, replace=False)
X_sample = X_dbscan_scaled[sample_indices]

print(f"Testing {len(eps_values)} eps values × {len(min_samples_values)} min_samples values = {len(eps_values) * len(min_samples_values)} combinations")
print(f"Using sample of {max_sample_silhouette} points for silhouette evaluation")

print(f"\nProgress (searching for 3-cluster solutions):")
valid_solutions = 0

for i, eps in enumerate(eps_values):
    for min_samples in min_samples_values:
        # Apply DBSCAN
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=1)
        labels = dbscan.fit_predict(X_dbscan_scaled)

        # Calculate cluster statistics (label -1 marks noise, not a cluster)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = np.sum(labels == -1)
        noise_ratio = n_noise / len(labels)

        # Only consider solutions with exactly 3 clusters and reasonable noise
        if n_clusters != 3 or noise_ratio >= 0.3:
            continue

        # Score only the clustered points of the subsample. Noise is not a
        # cluster, and rejecting every sample that happens to contain a noise
        # point would discard almost all candidates the noise constraint
        # above explicitly allows.
        labels_sample = labels[sample_indices]
        clustered_mask = labels_sample != -1
        labels_eval = labels_sample[clustered_mask]
        n_eval_clusters = len(set(labels_eval))

        # silhouette_score needs 2 <= n_labels <= n_samples - 1; checking this
        # up front replaces a blanket try/except that hid every other error.
        if n_eval_clusters < 2 or len(labels_eval) <= n_eval_clusters:
            continue

        score = silhouette_score(X_sample[clustered_mask], labels_eval)

        # Store result
        results_log.append({
            'eps': eps,
            'min_samples': min_samples,
            'n_clusters': n_clusters,
            'n_noise': n_noise,
            'noise_ratio': noise_ratio,
            'silhouette_score': score
        })

        if score > best_score:
            best_score = score
            best_params = {"eps": eps, "min_samples": min_samples}
            best_n_clusters = n_clusters
            best_labels = labels.copy()

            print(f"  ✓ eps={eps:.1f}, min_samples={min_samples:2d} → "
                  f"clusters={n_clusters}, noise={n_noise:4d}({noise_ratio*100:.1f}%), "
                  f"silhouette={score:.3f}")

        valid_solutions += 1

    # Progress indicator
    if (i + 1) % 5 == 0:
        print(f"    Completed {i+1}/{len(eps_values)} eps values... ({valid_solutions} valid solutions found)")

print(f"\nGrid search completed!")
print(f"Total valid solutions found: {valid_solutions}")

In [None]:

# Fallback: runs only when the strict search (exactly 3 clusters, < 30% noise)
# selected nothing. Searches a coarser grid for the best-scoring solution with
# any number (>= 2) of clusters.
if best_params is None:
    print("⚠️  No suitable parameters found for exactly 3 clusters with <30% noise")
    print("Relaxing constraints to find best available solution...")

    best_score = -1
    for eps in eps_values[::2]:  # Sample fewer values
        for min_samples in min_samples_values[::2]:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            labels = dbscan.fit_predict(X_dbscan_scaled)

            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if n_clusters < 2:
                continue

            # Evaluate on the clustered points of the subsample only; noise
            # (-1) is not a cluster and would distort the silhouette.
            labels_sample = labels[sample_indices]
            clustered_mask = labels_sample != -1
            labels_eval = labels_sample[clustered_mask]
            n_eval_clusters = len(set(labels_eval))

            # silhouette_score requires 2 <= n_labels <= n_samples - 1.
            if n_eval_clusters < 2 or len(labels_eval) <= n_eval_clusters:
                continue

            score = silhouette_score(X_dbscan_scaled[sample_indices][clustered_mask],
                                     labels_eval)
            if score > best_score:
                best_score = score
                best_params = {"eps": eps, "min_samples": min_samples}
                best_n_clusters = n_clusters
                best_labels = labels.copy()

    if best_params is None:
        # Fail with an actionable message instead of a TypeError from
        # subscripting None in the print below and in later cells.
        raise RuntimeError(
            "DBSCAN parameter search found no configuration with at least 2 clusters; "
            "widen eps_values / min_samples_values or revisit the selected features"
        )

    print(f"Fallback solution: eps={best_params['eps']}, min_samples={best_params['min_samples']}")


In [None]:

# =====================================================
# FINAL DBSCAN APPLICATION
# =====================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 50, "FINAL DBSCAN RESULTS", "=" * 50, sep="\n")

# Report the configuration selected by the parameter search.
print("Best parameters found:")
print(
    f"  • eps = {best_params['eps']}",
    f"  • min_samples = {best_params['min_samples']}",
    f"  • Number of clusters: {best_n_clusters}",
    f"  • Silhouette score: {best_score:.3f}",
    sep="\n",
)

In [None]:

# Refit DBSCAN on the full standardized matrix with the selected parameters
# and read the per-row labels off the fitted estimator.
dbscan_final = DBSCAN(
    eps=best_params['eps'],
    min_samples=best_params['min_samples'],
).fit(X_dbscan_scaled)
final_labels = dbscan_final.labels_

# Attach the assignment (-1 = noise) to the feature frame.
df_dbscan['dbscan_cluster'] = final_labels

In [None]:

# Summary counts for the final labelling; label -1 is noise and is not
# counted as a cluster.
distinct_labels = np.unique(final_labels)
n_clusters_final = len(distinct_labels) - int(-1 in distinct_labels)
n_noise_final = (final_labels == -1).sum()
noise_ratio_final = n_noise_final / len(final_labels)

print(
    "\nFinal clustering statistics:",
    f"  • Total data points: {len(final_labels)}",
    f"  • Number of clusters: {n_clusters_final}",
    f"  • Noise points: {n_noise_final} ({noise_ratio_final*100:.1f}%)",
    f"  • Clustered points: {len(final_labels) - n_noise_final} ({(1-noise_ratio_final)*100:.1f}%)",
    sep="\n",
)


In [None]:

# Per-label membership counts, listed in ascending label order (noise first
# when present).
print("\nCluster size distribution:")
unique_labels = set(final_labels)
label_values, label_counts = np.unique(final_labels, return_counts=True)
for label, count in zip(label_values, label_counts):
    name = "Noise" if label == -1 else f"Cluster {label}"
    print(f"  • {name}: {count} points")


In [None]:

# =====================================================
# VISUALIZATIONS
# =====================================================

def _scatter_by_cluster(ax, data, labels, palette, x_col, y_col, *,
                        with_legend=False, point_scale=None):
    """Draw one scatter series per DBSCAN label on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame containing ``x_col`` and ``y_col``, row-aligned with ``labels``.
    labels : 1-D array of cluster labels; -1 marks noise.
    palette : list of ``(label, rgba)`` pairs giving each label's colour.
    x_col, y_col : column names plotted on the x and y axes.
    with_legend : when True, each series is labelled "<name> (<count> pts)".
    point_scale : optional Series row-aligned with ``data``; when given, marker
        sizes are ``point_scale * factor + 10`` (factor 10 for noise, 50 for
        clusters) instead of the fixed 15 / 30.
    """
    for k, col in palette:
        is_noise = (k == -1)
        mask = (labels == k)
        members = data[mask]
        if members.empty:
            continue

        if point_scale is None:
            size = 15 if is_noise else 30
            alpha = 0.4 if is_noise else 0.7
        else:
            size = point_scale[mask] * (10 if is_noise else 50) + 10
            alpha = 0.3 if is_noise else 0.6

        scatter_kwargs = {}
        if with_legend:
            name = 'Noise' if is_noise else f'Cluster {k}'
            scatter_kwargs['label'] = f'{name} ({np.sum(mask)} pts)'

        # Noise is always drawn as black crosses, clusters as coloured dots.
        ax.scatter(members[x_col], members[y_col],
                   c=[[0, 0, 0, 1] if is_noise else col],
                   marker='x' if is_noise else 'o',
                   s=size, alpha=alpha,
                   edgecolors='white', linewidth=0.5, **scatter_kwargs)


print(f"\nGenerating DBSCAN visualizations...")

# Create comprehensive visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 16))

# Generate colors for clusters; the (label, colour) pairing is fixed once so
# every panel uses the same colour for the same cluster.
unique_labels = set(final_labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
palette = list(zip(unique_labels, colors))

# Plot 1: UTM coordinates colored by cluster
_scatter_by_cluster(ax1, df_dbscan, final_labels, palette, 'utm_x', 'utm_y',
                    with_legend=True)
ax1.set_xlabel('UTM X (meters)', fontsize=12)
ax1.set_ylabel('UTM Y (meters)', fontsize=12)
ax1.set_title(f'DBSCAN Geographic Distribution\n(eps={best_params["eps"]}, min_samples={best_params["min_samples"]})', 
              fontsize=14, fontweight='bold')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Plot 2: Price vs UTM X colored by cluster
_scatter_by_cluster(ax2, df_dbscan, final_labels, palette, 'utm_x', 'transformable_price')
ax2.set_xlabel('UTM X (meters)', fontsize=12)
ax2.set_ylabel('Transformable Price', fontsize=12)
ax2.set_title('DBSCAN: Price vs Geographic Location (X)', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)

# Plot 3: Price vs UTM Y colored by cluster
_scatter_by_cluster(ax3, df_dbscan, final_labels, palette, 'utm_y', 'transformable_price')
ax3.set_xlabel('UTM Y (meters)', fontsize=12)
ax3.set_ylabel('Transformable Price', fontsize=12)
ax3.set_title('DBSCAN: Price vs Geographic Location (Y)', fontsize=14, fontweight='bold')
ax3.grid(True, alpha=0.3)

# Plot 4: 2D map (UTM X vs UTM Y) with marker size encoding price.
# The scale is computed once over the whole dataset so that equal marker
# sizes mean equal prices across clusters; the +1 keeps the denominator
# positive when all prices are identical.
price = df_dbscan['transformable_price']
price_scale = (price - price.min() + 1) / (price.max() - price.min() + 1)
_scatter_by_cluster(ax4, df_dbscan, final_labels, palette, 'utm_x', 'utm_y',
                    point_scale=price_scale)
ax4.set_xlabel('UTM X (meters)', fontsize=12)
ax4.set_ylabel('UTM Y (meters)', fontsize=12)
ax4.set_title('DBSCAN: Geographic Distribution\n(Point size = Price)', fontsize=14, fontweight='bold')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# =====================================================
# HYPERPARAMETER ANALYSIS EXPLANATION
# =====================================================
# Prints a narrative summary of what eps and min_samples control, filled in
# with the selected parameters and the final clustering statistics computed
# in earlier cells (best_params, n_clusters_final, n_noise_final,
# noise_ratio_final, best_score).

print(f"\n" + "="*60)
print("DBSCAN HYPERPARAMETER ANALYSIS")
print("="*60)

print(f"📘 DBSCAN Hyperparameter Effects:")
print(f"\n1. EPS (ε - Epsilon):")
print(f"   • Definition: Maximum distance between two samples to be considered neighbors")
print(f"   • Effect on clustering:")
print(f"     - Smaller eps (e.g., 0.1-0.5): More clusters, tighter groups, more noise")
print(f"     - Larger eps (e.g., 1.0-2.0): Fewer, larger clusters, less noise")
print(f"   • Selected eps: {best_params['eps']}")
print(f"   • Interpretation: Points within {best_params['eps']} standard deviations are neighbors")

print(f"\n2. MIN_SAMPLES:")
print(f"   • Definition: Minimum points required in a neighborhood to form a core point")
print(f"   • Effect on clustering:")
print(f"     - Smaller min_samples (e.g., 5-10): More core points, smaller clusters possible")
print(f"     - Larger min_samples (e.g., 20-50): Fewer core points, denser clusters required")
print(f"   • Selected min_samples: {best_params['min_samples']}")
print(f"   • Interpretation: Need at least {best_params['min_samples']} points to form cluster core")

print(f"\n3. COMBINED EFFECT:")
print(f"   • Our parameters (eps={best_params['eps']}, min_samples={best_params['min_samples']}):")
print(f"     - Created {n_clusters_final} distinct clusters")
print(f"     - Identified {n_noise_final} outlier properties ({noise_ratio_final*100:.1f}%)")
print(f"     - Achieved silhouette score of {best_score:.3f}")

print(f"\n4. BUSINESS INTERPRETATION:")
print(f"   • Clusters represent property groups with similar:")
print(f"     - Geographic proximity (UTM coordinates)")
print(f"     - Price ranges (transformable_price)")
print(f"   • Noise points are unique/outlier properties that don't fit standard patterns")
print(f"   • Useful for recommendation: suggest properties from same cluster")


In [None]:
#===================================================
# CLUSTER ANALYSIS
# =====================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 50, "DETAILED CLUSTER ANALYSIS", "=" * 50, sep="\n")

# One report per label in ascending order. Noise (-1) and real clusters share
# most lines; noise gets an extra explanatory line, clusters additionally get
# the price standard deviation and the geographic centre.
print("Cluster characteristics:")
for label in sorted(unique_labels):
    cluster_data = df_dbscan[final_labels == label]
    prices = cluster_data['transformable_price']
    x_extent = cluster_data['utm_x'].max() - cluster_data['utm_x'].min()
    y_extent = cluster_data['utm_y'].max() - cluster_data['utm_y'].min()
    is_noise = (label == -1)

    if is_noise:
        print(f"\n🔴 NOISE POINTS ({len(cluster_data)} properties):")
        print("   • These are outlier properties that don't fit into any cluster")
    else:
        print(f"\n🟢 CLUSTER {label} ({len(cluster_data)} properties):")

    print(f"   • Average price: {prices.mean():,.0f}")
    if not is_noise:
        print(f"   • Price std: {prices.std():,.0f}")
    print(f"   • Price range: {prices.min():,.0f} - {prices.max():,.0f}")
    if not is_noise:
        print(f"   • Geographic center: UTM({cluster_data['utm_x'].mean():.0f}, {cluster_data['utm_y'].mean():.0f})")
    print("   • Geographic spread:")
    print(f"     - UTM X range: {x_extent:,.0f} meters")
    print(f"     - UTM Y range: {y_extent:,.0f} meters")


In [None]:

# =====================================================
# PARAMETER SENSITIVITY ANALYSIS
# =====================================================
# Summarises every parameter combination the grid search recorded in
# results_log; skipped entirely when the search logged nothing.

if len(results_log) > 0:
    print(f"\n" + "="*50)
    print("PARAMETER SENSITIVITY ANALYSIS")
    print("="*50)
    
    # Convert results to DataFrame for analysis
    results_df = pd.DataFrame(results_log)
    
    print(f"Top 5 parameter combinations (by silhouette score):")
    top_results = results_df.nlargest(5, 'silhouette_score')
    print(f"{'Rank':<4} {'eps':<5} {'min_samples':<12} {'clusters':<9} {'noise%':<8} {'silhouette':<11}")
    print("-" * 55)
    # itertuples keeps each column's own dtype; iterrows would upcast the
    # mixed int/float row to float and print min_samples / clusters as
    # "5.0" / "3.0".
    for rank, row in enumerate(top_results.itertuples(index=False), 1):
        print(f"{rank:<4} {row.eps:<5.1f} {row.min_samples:<12} {row.n_clusters:<9} "
              f"{row.noise_ratio*100:<8.1f} {row.silhouette_score:<11.3f}")
    
    # Parameter distribution analysis
    print(f"\nParameter ranges that produced 3 clusters:")
    print(f"  • eps range: {results_df['eps'].min():.1f} - {results_df['eps'].max():.1f}")
    print(f"  • min_samples range: {results_df['min_samples'].min()} - {results_df['min_samples'].max()}")
    print(f"  • Average noise ratio: {results_df['noise_ratio'].mean()*100:.1f}%")
    print(f"  • Average silhouette: {results_df['silhouette_score'].mean():.3f}")


In [None]:
# =====================================================
# SAVE RESULTS
# =====================================================
# Persists two artefacts: a pickle bundling parameters, labels, statistics,
# the labelled frame and the fitted scaler (written to the working
# directory), and the labelled frame on its own (written to ../DataSets/).

print(f"\n" + "="*50)
print("SAVING RESULTS")
print("="*50)

# Output locations; named once so the confirmation message below always
# reports the paths that were actually written.
results_path = 'dbscan_results.pkl'
clustered_data_path = '../DataSets/dbscan_clustered_data.pkl'

# Save DBSCAN results
dbscan_results = {
    'best_params': best_params,
    'final_labels': final_labels,
    'n_clusters': n_clusters_final,
    'n_noise': n_noise_final,
    'silhouette_score': best_score,
    'df_dbscan': df_dbscan,
    'scaler_dbscan': scaler_dbscan
}

# Save to pickle
with open(results_path, 'wb') as f:
    pickle.dump(dbscan_results, f)

# Also save the DataFrame with cluster assignments
df_dbscan.to_pickle(clustered_data_path)


print(f"✅ DBSCAN results saved to:")
print(f"   • '{results_path}' (complete results)")
print(f"   • '{clustered_data_path}' (clustered dataset)")


In [None]:

# Closing summary assembled from the final statistics and the selected
# parameters computed in earlier cells.
print(
    "\n🎯 DBSCAN CLUSTERING COMPLETE!",
    f"   • Successfully created {n_clusters_final} clusters using geographic location and price",
    f"   • Optimal parameters: eps={best_params['eps']}, min_samples={best_params['min_samples']}",
    f"   • Quality metric: Silhouette score = {best_score:.3f}",
    f"   • Practical outcome: {noise_ratio_final*100:.1f}% outliers identified",
    sep="\n",
)


In [None]:

# Static notes on how the clusters could feed a recommendation system.
print(
    "\n📊 For Recommendation System:",
    "   • Users can be recommended properties from the same cluster",
    "   • Clusters represent similar properties in terms of location and price",
    "   • Noise points represent unique properties that might be special deals or unique listings",
    sep="\n",
)
