In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pyproj
import seaborn as sns
from matplotlib.colors import ListedColormap

In [3]:
# Load the pickled listings frame (presumably the cleaned Divar dataset, going
# by the file name). NOTE(review): unpickling can execute arbitrary code, so
# this file must come from a trusted source.
divar_df = pd.read_pickle(filepath_or_buffer='../DataSets/clean_divar_data.pkl')

In [None]:
# Work on a copy of the frame loaded above so the loaded data stays untouched
# and this cell can be re-run safely. `divar_df` is the only frame the
# notebook loads; the previously referenced `raw_df` is never defined.
df = divar_df.copy()

In [None]:
# Listing attributes used as clustering inputs; each one stands in for a
# buyer preference. Order matters: downstream arrays follow this column order.
selected_features = [
    # Size and layout preferences
    'building_size',
    'rooms_count',
    # Age preference
    'construction_year',
    # Amenity / accessibility preferences
    'has_parking',
    'has_elevator',
    # Floor preference
    'floor',
]

In [None]:

# Prepare clustering data
X_clustering = df[selected_features].copy()

# Fill missing values before scaling: scikit-learn's KMeans rejects NaN input.
# The median is robust to outliers in skewed columns such as building size.
# NOTE(review): assumes every selected column is numeric or boolean; confirm
# the dtypes of the has_* flags and 'floor' in the cleaned data.
imputer = SimpleImputer(strategy='median')
X_clustering_imputed = imputer.fit_transform(X_clustering)

# SimpleImputer drops columns that are entirely NaN by default; fail loudly
# here rather than letting the feature/column mapping drift downstream.
assert X_clustering_imputed.shape[1] == len(selected_features), (
    "Imputation dropped a feature column (a selected feature is entirely missing)"
)

# Standardize features for clustering so no single feature dominates the
# Euclidean distances purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering_imputed)


In [None]:

# Apply K-means. The cluster count is defined once so the model and the
# summary below cannot drift apart.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to dataframe
df['kmeans_cluster'] = kmeans_labels

# Count members of every cluster in a single pass; minlength keeps a cluster
# that ended up empty in the report with a count of 0.
cluster_sizes = np.bincount(kmeans_labels, minlength=n_clusters)

print(f"K-means clustering completed. Number of clusters: {len(np.unique(kmeans_labels))}")
print("Cluster distribution:")
for cluster_id, count in enumerate(cluster_sizes):
    print(f"Cluster {cluster_id}: {count} properties ({count/len(df)*100:.1f}%)")


In [None]:

# Create 3D scatter plot for K-means results: listings positioned by UTM
# coordinates and price, colored by their K-means cluster.

# Take the cluster count from the fitted model so the plot always matches the
# clustering cell instead of repeating a hard-coded number.
n_clusters = kmeans.n_clusters

# Columns used as plot axes.
# NOTE(review): assumes df already contains these columns; confirm they are
# produced upstream of this notebook.
plot_columns = ['utm_x', 'utm_y', 'transformable_price']

fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111, projection='3d')

# Color palette for clusters
colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))

# Build each cluster's membership mask once; both passes below reuse them.
cluster_masks = [kmeans_labels == cluster_id for cluster_id in range(n_clusters)]

# Plot each cluster
for cluster_id, mask in enumerate(cluster_masks):
    ax.scatter(df.loc[mask, 'utm_x'], df.loc[mask, 'utm_y'], df.loc[mask, 'transformable_price'],
               c=[colors[cluster_id]], label=f'Cluster {cluster_id}', alpha=0.6, s=20)

# Cluster centers expressed in the original (unscaled) feature units. They are
# kept for inspection only: the plot axes are not clustering features, so the
# fitted centers cannot be drawn in this space directly.
centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(centers_original, columns=selected_features)

# For visualization, mark each cluster at the mean location and price of its
# members. Drawn after all clusters so the markers and legend entry come last.
for cluster_id, mask in enumerate(cluster_masks):
    center_x, center_y, center_price = df.loc[mask, plot_columns].mean()

    ax.scatter([center_x], [center_y], [center_price],
               c='red', marker='x', s=200, linewidths=3,
               label='Centers' if cluster_id == 0 else "")

ax.set_xlabel('UTM X (meters)', fontsize=12)
ax.set_ylabel('UTM Y (meters)', fontsize=12)
ax.set_zlabel('Transformable Price (Toman)', fontsize=12)
ax.set_title(f'K-means Clustering Results ({n_clusters} clusters)\nPrice vs UTM Coordinates', fontsize=14)

# Build the legend with one entry per label, keeping the first handle seen.
legend_entries = {}
for handle, label in zip(*ax.get_legend_handles_labels()):
    legend_entries.setdefault(label, handle)

ax.legend(list(legend_entries.values()), list(legend_entries.keys()),
          bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()