# Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
from pyproj import Transformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw listings export; the analysis sections below each start from
# a copy of this frame.
data = pd.read_csv("Divar.csv")

In [None]:

# Rows whose construction year is the text "قبل از ..." ("before ...") carry
# no exact year; all of them are pinned to 1369.
before_mask = data['construction_year'].astype(str).str.contains('قبل از', na=False)
data.loc[before_mask, 'construction_year'] = 1369

# Parse created_at_month into datetimes (unparseable values become NaT) and
# derive a monthly period column from the parsed values.
# NOTE(review): this runs before the digit-normalisation cell, and the same
# conversion is repeated after it — confirm this early pass is still needed.
created = pd.to_datetime(data['created_at_month'], errors='coerce')
data['created_at_month'] = created
data['year_month'] = created.dt.to_period('M')

In [None]:
# Translation table mapping Persian (۰-۹) and Arabic-Indic (٠-٩) digits to
# ASCII 0-9. Built once here instead of on every call, because the function
# is applied to every cell of the DataFrame.
_DIGIT_TRANSLATION = str.maketrans(
    '۰۱۲۳۴۵۶۷۸۹' + '٠١٢٣٤٥٦٧٨٩',
    '0123456789' * 2,
)


def persian_to_english_numbers(text):
    """Replace Persian and Arabic-Indic digits in *text* with ASCII digits.

    Args:
        text: Any value. Only ``str`` instances are converted.

    Returns:
        A new string with every Persian/Arabic-Indic digit replaced by its
        ASCII counterpart (all other characters untouched), or *text*
        itself, unchanged, when it is not a string.
    """
    if not isinstance(text, str):
        return text

    return text.translate(_DIGIT_TRANSLATION)

In [None]:
# Normalise digits in every cell of the frame. DataFrame.applymap is
# deprecated since pandas 2.1 in favour of DataFrame.map, so the new name is
# used when the installed pandas provides it. The check is made on the class
# so a column that happens to be called "map" cannot be picked up instead.
_convert_cells = data.map if hasattr(pd.DataFrame, "map") else data.applymap
data = _convert_cells(persian_to_english_numbers)

In [None]:
# Parse created_at_month into datetimes (NaT where parsing fails) and build
# the matching monthly period column.
month_stamps = pd.to_datetime(data['created_at_month'], errors='coerce')
data['created_at_month'] = month_stamps
data['year_month'] = month_stamps.dt.to_period('M')

In [None]:
# ---------- 3. Convert rooms_count to a number ----------
def parse_rooms(val):
    """Convert a room-count value (Persian words or digits) to a number.

    Args:
        val: Raw cell value; may be missing, a Persian phrase, or anything
            whose string form contains digits.

    Returns:
        ``np.nan`` for missing values; 0-5 when a known Persian keyword is
        found (first match in the table order wins); otherwise the first run
        of digits as an ``int``; ``np.nan`` when nothing matches.
    """
    if pd.isna(val):
        return np.nan

    text = str(val).strip().lower()

    # Order matters: 'بدون' contains 'دو' as a substring, so the zero-room
    # keywords have to be tested before the word for two.
    keyword_counts = (
        (('بدون', 'صفر'), 0),
        (('یک',), 1),
        (('دو',), 2),
        (('سه',), 3),
        (('چهار',), 4),
        (('پنج', 'بیشتر'), 5),
    )
    for keywords, count in keyword_counts:
        if any(word in text for word in keywords):
            return count

    # No keyword matched: fall back to the first digit run in the text.
    digits = re.search(r'\d+', text)
    return np.nan if digits is None else int(digits.group())

# Part 1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer


In [None]:
# Work on an independent copy so the loaded frame stays untouched.
df = data.copy(deep=True)

In [None]:
# Parse the textual room count; rows that are missing or unparseable
# default to 2 rooms.
parsed_rooms = df['rooms_count'].apply(parse_rooms)
df['rooms_num'] = parsed_rooms.fillna(2)

In [None]:
# Columns carried into the clustering frame: price, physical attributes,
# coordinates and the city identifier.
features = [
    "price_value", "building_size", "rooms_num", "construction_year",
    "location_latitude", "location_longitude",
    "city_slug",
]

df_model = df.loc[:, features].copy()


In [None]:
# A listing is unusable without both coordinates and a city, so those rows
# are dropped and the index renumbered.
required_cols = ["location_latitude", "location_longitude", "city_slug"]
df_model = df_model.dropna(subset=required_cols).reset_index(drop=True)

In [None]:
# Numeric listing attributes.
num_cols = ["price_value", "building_size", "rooms_num", "construction_year"]

# Coordinate columns.
geo_features = ["location_latitude", "location_longitude"]


In [None]:
# Coerce the numeric attributes column by column; anything unparseable
# becomes NaN. Rows with NaN are kept at this stage (no dropna).
df_model[num_cols] = df_model[num_cols].apply(pd.to_numeric, errors="coerce")


In [None]:
# Coerce the coordinates to numbers; unparseable values become NaN.
# Rows with NaN are kept at this stage (no dropna).
df_model[geo_features] = df_model[geo_features].apply(pd.to_numeric, errors="coerce")

In [None]:
# Sanity filters. Comparisons against NaN are False, so rows with a missing
# price, size or coordinate are dropped here as well.
price_ok = (
    df_model['price_value'].gt(0)
    & df_model['price_value'].lt(200_000_000_000)
)
size_ok = (
    df_model['building_size'].notna()
    & df_model['building_size'].gt(5)
    & df_model['building_size'].lt(10000)
)
# Latitude 25-40 and longitude 44-63, both inclusive
# (presumably Iran's bounding box - confirm).
in_bounds = (
    df_model['location_latitude'].between(25, 40)
    & df_model['location_longitude'].between(44, 63)
)

df_model_sell = df_model.loc[price_ok & size_ok & in_bounds].copy()

In [None]:
# Encode each city by the mean listing price observed in that city.
city_mean_price = (
    df_model_sell["price_value"]
    .groupby(df_model_sell["city_slug"])
    .mean()
)

city_level = df_model_sell["city_slug"].map(city_mean_price)
df_model_sell["city_encoded"] = city_level


In [None]:


# A single median imputer and a single scaler are refit for each feature
# group in turn.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Numeric attributes: fill gaps with the column median, then standardise.
X_num = scaler.fit_transform(imputer.fit_transform(df_model_sell[num_cols]))

# Coordinates are standardised without imputation.
X_geo = scaler.fit_transform(df_model_sell[geo_features])

# City price level: same impute-then-scale treatment as the numeric block.
city_encoded = scaler.fit_transform(
    imputer.fit_transform(df_model_sell[["city_encoded"]])
)


In [None]:
# Per-column weights, in num_cols order:
# price_value, building_size, rooms_num, construction_year.
weights = np.array([3, 1.5, 1, 1])
X_weighted = np.multiply(X_num, weights)

In [None]:
# Weight of the city price level relative to the other feature groups.
city_weight = 2

# Assemble the clustering matrix. The numeric block has to be the weighted
# one: stacking the raw X_num here discarded the per-feature weights that
# were computed into X_weighted.
X_final = np.hstack([
    X_weighted,
    X_geo,
    city_encoded * city_weight
])


In [None]:
# Ten clusters, fixed seed, 10 initialisations.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)

# Fit on the assembled matrix and store each row's cluster label.
cluster_labels = kmeans.fit_predict(X_final)
df_model_sell["cluster"] = cluster_labels


In [None]:
R = 6378137  # Earth radius (metres)

lat_rad = np.radians(df_model_sell["location_latitude"])
lon_rad = np.radians(df_model_sell["location_longitude"])

lat_mean = lat_rad.mean()

# NOTE(review): despite the column names this is not a UTM projection; it is
# a flat approximation that scales longitude by cos(mean latitude).
x_metres = R * lon_rad * np.cos(lat_mean)
y_metres = R * lat_rad
df_model_sell["utm_x"] = x_metres
df_model_sell["utm_y"] = y_metres


In [None]:
# Mean latitude/longitude of the listings in each cluster, with the cluster
# id turned back into a regular column.
centroid_cols = ["location_latitude", "location_longitude"]
cluster_means = df_model_sell.groupby("cluster")[centroid_cols].mean()
centroids_geo = cluster_means.reset_index()


In [None]:
import matplotlib.pyplot as plt

# Scatter every listing by coordinates, coloured by its cluster label.
plt.figure(figsize=(10, 8))
cmap = plt.get_cmap("tab10")
scatter = plt.scatter(
    df_model_sell["location_longitude"],
    df_model_sell["location_latitude"],
    c=df_model_sell["cluster"],
    cmap="tab10",
    s=12,
    alpha=0.6
)

# Mark the geographic centre of each cluster in the cluster's own colour.
# Only the first marker gets a label, so the centroid legend below has
# exactly one entry; without any label that legend came out empty.
for position, (_, row) in enumerate(centroids_geo.iterrows()):
    cluster_id = int(row["cluster"])
    plt.scatter(
        row["location_longitude"],
        row["location_latitude"],
        color=cmap(cluster_id),
        s=250,
        marker="X",
        edgecolor="black",
        linewidth=1.5,
        zorder=5,
        label="Cluster centroid" if position == 0 else None
    )


# Cluster legend; re-added as an artist because the second plt.legend call
# would otherwise replace it.
legend1 = plt.legend(
    *scatter.legend_elements(),
    title="Clusters",
    loc="upper right"
)
plt.gca().add_artist(legend1)

# Centroid legend
plt.legend(loc="lower left")

plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("KMeans Clustering on Geographical Coordinates")

plt.grid(alpha=0.3)
plt.show()


In [None]:
# Show floats with two decimals in every DataFrame printed from here on.
pd.set_option("display.float_format", '{:.2f}'.format)

# Per-cluster profile: size of the cluster plus price, size, rooms, year
# and location statistics.
summary_spec = {
    "count": ("price_value", "count"),
    "avg_price": ("price_value", "mean"),
    "median_price": ("price_value", "median"),
    "avg_size": ("building_size", "mean"),
    "avg_rooms": ("rooms_num", "mean"),
    "avg_year": ("construction_year", "mean"),
    "avg_lat": ("location_latitude", "mean"),
    "avg_lon": ("location_longitude", "mean"),
}
per_cluster = df_model_sell.groupby("cluster").agg(**summary_spec)

# Cheapest cluster first.
cluster_analysis = per_cluster.round(2).sort_values("avg_price")

print(cluster_analysis)


# Residential listings only

In [None]:
# Keep only residential sale listings. The copy is taken after filtering so
# df is an independent frame: new columns are assigned to it later, and
# doing that on a boolean-mask slice triggers SettingWithCopyWarning.
df = data.loc[data['cat2_slug'] == 'residential-sell'].copy()

In [None]:
# Parse the textual room count; rows that are missing or unparseable
# default to 2 rooms.
parsed_rooms = df['rooms_count'].apply(parse_rooms)
df['rooms_num'] = parsed_rooms.fillna(2)

In [None]:
# Amenity flag columns added to the residential feature set.
amenity_cols = [
    'has_balcony', 'has_elevator', 'has_warehouse',
    'has_parking', 'has_security_guard', 'has_barbecue',
    'has_pool', 'has_jacuzzi', 'has_sauna',
]


In [None]:
# Same base columns as the all-listings run, followed by the amenity flags.
base_features = [
    "price_value", "building_size", "rooms_num", "construction_year",
    "location_latitude", "location_longitude",
    "city_slug",
]
features = [*base_features, *amenity_cols]

df_model = df.loc[:, features].copy()

In [None]:
# Numeric listing attributes.
num_cols = ["price_value", "building_size", "rooms_num", "construction_year"]

# Coordinate columns.
geo_features = ["location_latitude", "location_longitude"]

# Coerce both groups to numbers, numeric attributes first; unparseable
# values become NaN. Rows with NaN are kept at this stage (no dropna).
df_model[num_cols] = df_model[num_cols].apply(pd.to_numeric, errors="coerce")
df_model[geo_features] = df_model[geo_features].apply(pd.to_numeric, errors="coerce")

In [None]:
# A listing is unusable without both coordinates and a city, so those rows
# are dropped and the index renumbered.
required_cols = ["location_latitude", "location_longitude", "city_slug"]
df_model = df_model.dropna(subset=required_cols).reset_index(drop=True)

In [None]:
# Sanity filters. Comparisons against NaN are False, so rows with a missing
# price, size or coordinate are dropped here as well.
price_ok = (
    df_model['price_value'].gt(0)
    & df_model['price_value'].lt(200_000_000_000)
)
size_ok = (
    df_model['building_size'].notna()
    & df_model['building_size'].gt(5)
    & df_model['building_size'].lt(10000)
)
# Latitude 25-40 and longitude 44-63, both inclusive
# (presumably Iran's bounding box - confirm).
in_bounds = (
    df_model['location_latitude'].between(25, 40)
    & df_model['location_longitude'].between(44, 63)
)
df_model_sell = df_model.loc[price_ok & size_ok & in_bounds].copy()

# Encode each city by the mean listing price observed in that city.
city_mean_price = (
    df_model_sell["price_value"]
    .groupby(df_model_sell["city_slug"])
    .mean()
)

city_level = df_model_sell["city_slug"].map(city_mean_price)
df_model_sell["city_encoded"] = city_level


In [None]:


# A single median imputer and a single scaler are refit for each feature
# group in turn.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Numeric attributes: fill gaps with the column median, then standardise.
X_num = scaler.fit_transform(imputer.fit_transform(df_model_sell[num_cols]))

# Coordinates are standardised without imputation.
X_geo = scaler.fit_transform(df_model_sell[geo_features])

# City price level: same impute-then-scale treatment as the numeric block.
city_encoded = scaler.fit_transform(
    imputer.fit_transform(df_model_sell[["city_encoded"]])
)

# Per-column weights, in num_cols order:
# price_value, building_size, rooms_num, construction_year.
weights = np.array([3, 1.5, 1, 1])
X_weighted = np.multiply(X_num, weights)
# Map the boolean and string spellings of each amenity flag onto 0/1
# ('unselect' is treated as "not present").
amenity_flags = df_model_sell[amenity_cols].replace(
    {True: 1, False: 0, "yes": 1, "no": 0, 'true': 1, 'false': 0, 'unselect': 0}
)
# replace() may leave the columns as object dtype, and any spelling not in
# the mapping would stay as text. Coerce to numbers so such values become
# NaN and are filled by the imputer below, instead of failing in the scaler.
df_model_sell[amenity_cols] = amenity_flags.apply(pd.to_numeric, errors="coerce")

# Missing flags take the most common value of their column.
amenity_imputer = SimpleImputer(strategy="most_frequent")

X_amenities = amenity_imputer.fit_transform(
    df_model_sell[amenity_cols]
)

X_amenities = scaler.fit_transform(X_amenities)

amenity_weight = 0.2   # tunable
X_amenities = X_amenities * amenity_weight

# Weight of the city price level relative to the other feature groups.
city_weight = 2

# Assemble the clustering matrix. The numeric block has to be the weighted
# one: stacking the raw X_num here discarded the per-feature weights that
# were computed into X_weighted.
X_final = np.hstack([
    X_weighted,
    X_geo,
    X_amenities,
    city_encoded * city_weight
])

# Ten clusters, fixed seed, 10 initialisations.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)

# Fit on the assembled matrix and store each row's cluster label.
cluster_labels = kmeans.fit_predict(X_final)
df_model_sell["cluster"] = cluster_labels

R = 6378137  # Earth radius (metres)

lat_rad = np.radians(df_model_sell["location_latitude"])
lon_rad = np.radians(df_model_sell["location_longitude"])

lat_mean = lat_rad.mean()

# NOTE(review): despite the column names this is not a UTM projection; it is
# a flat approximation that scales longitude by cos(mean latitude).
x_metres = R * lon_rad * np.cos(lat_mean)
y_metres = R * lat_rad
df_model_sell["utm_x"] = x_metres
df_model_sell["utm_y"] = y_metres

# Mean latitude/longitude of the listings in each cluster, with the cluster
# id turned back into a regular column.
centroid_cols = ["location_latitude", "location_longitude"]
cluster_means = df_model_sell.groupby("cluster")[centroid_cols].mean()
centroids_geo = cluster_means.reset_index()

import matplotlib.pyplot as plt

# Scatter every listing by coordinates, coloured by its cluster label.
plt.figure(figsize=(10, 8))
cmap = plt.get_cmap("tab10")
scatter = plt.scatter(
    df_model_sell["location_longitude"],
    df_model_sell["location_latitude"],
    c=df_model_sell["cluster"],
    cmap="tab10",
    s=12,
    alpha=0.6
)

# Draw the cluster centres in each cluster's own colour.
# Only the first marker gets a label, so the centroid legend below has
# exactly one entry; without any label that legend came out empty.
for position, (_, row) in enumerate(centroids_geo.iterrows()):
    cluster_id = int(row["cluster"])
    plt.scatter(
        row["location_longitude"],
        row["location_latitude"],
        color=cmap(cluster_id),
        s=250,
        marker="X",
        edgecolor="black",
        linewidth=1.5,
        zorder=5,
        label="Cluster centroid" if position == 0 else None
    )


# Cluster legend; re-added as an artist because the second plt.legend call
# would otherwise replace it.
legend1 = plt.legend(
    *scatter.legend_elements(),
    title="Clusters",
    loc="upper right"
)
plt.gca().add_artist(legend1)

# Centroid legend
plt.legend(loc="lower left")

plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("KMeans Clustering on Geographical Coordinates")

plt.grid(alpha=0.3)
plt.show()


In [None]:
# Show floats with two decimals in every DataFrame printed from here on.
pd.set_option("display.float_format", '{:.2f}'.format)

# Price expressed in millions of the original price unit.
df_model_sell["price_million"] = df_model_sell["price_value"].div(1_000_000)

# Per-cluster profile: size of the cluster plus price, size, rooms, year
# and location statistics.
summary_spec = {
    "count": ("price_value", "count"),
    "avg_price": ("price_value", "mean"),
    "median_price": ("price_value", "median"),
    "avg_size": ("building_size", "mean"),
    "avg_rooms": ("rooms_num", "mean"),
    "avg_year": ("construction_year", "mean"),
    "avg_lat": ("location_latitude", "mean"),
    "avg_lon": ("location_longitude", "mean"),
}
per_cluster = df_model_sell.groupby("cluster").agg(**summary_spec)

# Cheapest cluster first.
cluster_analysis = per_cluster.round(2).sort_values("avg_price")

print(cluster_analysis)


# Mean of each amenity flag within every cluster.
amenity_means = df_model_sell.groupby('cluster')[amenity_cols].mean()
amenity_summary = amenity_means.round(2)

print(amenity_summary)

# Finding best K

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts for the elbow search.
K_range = range(1, 21)

# Within-cluster sum of squares (inertia) for every candidate k.
# The throw-away models are not bound to `kmeans`, so the 10-cluster
# estimator fitted earlier is no longer overwritten by the k=20 model.
wcss = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_final).inertia_
    for k in K_range
]


In [None]:
import matplotlib.pyplot as plt

# Elbow curve: WCSS against the number of clusters.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, wcss, marker='o')
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("WCSS (Within-Cluster Sum of Squares)")
ax.set_title("Elbow Method for Optimal k")
ax.grid(True)
plt.show()


In [None]:
# First and second finite differences of the WCSS curve.
wcss_diff = np.diff(wcss)
wcss_diff2 = np.diff(wcss_diff)

# Entry i of the second difference is reported as k = i + 2.
for k_value, curvature in enumerate(wcss_diff2, start=2):
    print(f"k={k_value} → curvature={curvature:.2f}")


# Part III - DBSCAN


In [None]:
import pyproj
import numpy as np
from sklearn.cluster import DBSCAN

df = data.copy()

# Coerce the DBSCAN inputs to numbers before dropping incomplete rows, as
# the KMeans sections do for their features. Unparseable coordinates or
# prices then become NaN and are dropped, instead of reaching the
# projection or the scaler as text.
dbscan_inputs = ["location_latitude", "location_longitude", "transformable_price"]
df[dbscan_inputs] = df[dbscan_inputs].apply(pd.to_numeric, errors="coerce")
df = df.dropna(subset=dbscan_inputs).reset_index(drop=True)

# Projection WGS84 -> UTM zone 39N (suitable for Iran)
proj_utm = pyproj.Proj(proj="utm", zone=39, ellps="WGS84")

# The projection is called with longitude first, then latitude.
utm_x, utm_y = proj_utm(
    df["location_longitude"].values,
    df["location_latitude"].values
)

df["utm_x"] = utm_x
df["utm_y"] = utm_y


In [None]:
# Cap the request at the number of available rows: sample(n=...) without
# replacement raises ValueError when n exceeds the frame length.
sample_size = min(10000, len(df))  # or 30000, depending on RAM

df_sample = df.sample(
    n=sample_size,
    random_state=42
)


In [None]:
# Cluster on projected position plus price, each column standardised.
X_db = df_sample.loc[:, ["utm_x", "utm_y", "transformable_price"]]
X_db_scaled = StandardScaler().fit_transform(X_db)

# algorithm="ball_tree" was marked as important by the original author.
dbscan = DBSCAN(eps=0.6, min_samples=50, algorithm="ball_tree")

dbscan_labels = dbscan.fit_predict(X_db_scaled)
df_sample["dbscan_cluster"] = dbscan_labels


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))

# One scatter layer per DBSCAN label, in ascending label order; label -1 is
# the noise group.
for cluster_label, members in df_sample.groupby("dbscan_cluster", sort=True):
    if cluster_label == -1:
        legend_text = "Noise"
    else:
        legend_text = f"Cluster {cluster_label}"

    plt.scatter(
        members["utm_x"],
        members["utm_y"],
        s=12,
        alpha=0.6,
        label=legend_text
    )

plt.xlabel("UTM X")
plt.ylabel("UTM Y")
plt.title("DBSCAN Clustering (Sampled Data)")
plt.legend()
plt.show()
