In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import json
from pathlib import Path

# Read the raw property records from the JSON export.
properties = json.loads(Path("../data/properties.json").read_text(encoding="utf-8"))

# De-duplicate on "id": records with a missing/falsy id are skipped, and when
# the same id appears more than once the last record seen replaces the earlier one.
unique_props = {
    record["id"]: record
    for record in properties
    if record.get("id")
}

# One row per unique property.
df = pd.DataFrame(unique_props.values())

# Replace missing values column by column; the placeholder depends on the dtype:
# object columns get an empty string, numeric columns get 0, anything else "unknown".
for col_name in df.columns:
    series = df[col_name]
    if series.dtype == "object":
        placeholder = ""
    elif pd.api.types.is_numeric_dtype(series):
        placeholder = 0
    else:
        placeholder = "unknown"
    df[col_name] = series.fillna(placeholder)

# --- TEXT CLUSTERING ---
# Decide which column supplies the text to cluster on.
# Preference order: an explicit "description" column, then the first
# object/string column, and as a last resort every column joined into one string.
text_columns = df.select_dtypes(include=["object", "string"]).columns
has_description = "description" in df.columns
if not has_description and len(text_columns) == 0:
    # No textual column at all: stringify each row and concatenate its cells.
    df["combined_text"] = df.astype(str).agg(" ".join, axis=1)
    text_column = "combined_text"
else:
    text_column = "description" if has_description else text_columns[0]

# Turn the chosen text column into TF-IDF features (English stop words removed,
# vocabulary capped at 100 terms) and group the rows into 5 KMeans clusters.
vectorizer = TfidfVectorizer(stop_words="english", max_features=100)
X_text = vectorizer.fit_transform(df[text_column])
kmeans = KMeans(n_clusters=5, random_state=42, n_init="auto")
# fit_predict fits the model and returns the same labels exposed as kmeans.labels_.
df["cluster"] = kmeans.fit_predict(X_text)

# Persist the fitted text models, creating the target directory if needed.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)
for artifact, destination in (
    (vectorizer, "../models/vectorizer.joblib"),
    (kmeans, "../models/cluster_model.joblib"),
):
    joblib.dump(artifact, destination)

# --- KNN on Numeric Features ---
# Build the feature matrix from the numeric columns, minus two that must not be
# used as predictors:
#   * "cluster" - it is the training label (added to df by the KMeans step);
#     keeping it as a feature leaks the target into the inputs and makes the
#     saved model unusable on rows whose cluster is not yet known.
#   * "id"      - a record identifier (when it is numeric), not a property attribute.
# errors="ignore" keeps this working when either column is absent or non-numeric.
numeric_columns = (
    df.select_dtypes(include=["number"])
    .columns.drop(["id", "cluster"], errors="ignore")
)
X_numeric = df[numeric_columns]
if X_numeric.shape[1] == 0:
    raise ValueError(
        "No numeric feature columns available to train the KNN model."
    )

# Standardize + KNN pipeline: scaling first so that no single feature
# dominates the distance computation purely because of its units.
knn_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=5))
])

# Use clusters as labels for supervised KNN training
knn_pipeline.fit(X_numeric, df["cluster"])

# Save KNN pipeline
joblib.dump(knn_pipeline, "../models/knn_model.joblib")

# --- REPORTING ---
# Console summary of the training run. The emoji prefixes were stored as
# mojibake (UTF-8 bytes decoded with the wrong charset); they are restored here.
print("✅ Model training complete using properties.json.")

print("🔍 Preview of clustered properties (with assigned cluster labels):")
print(df[["id", text_column, "cluster"]].head(10).to_string(index=False))

print("\n📊 Cluster distribution (how many properties in each cluster):")
print(df["cluster"].value_counts().sort_index())

# Sanity check only: these rows were part of the training data, so this is
# not an estimate of how well the KNN model generalizes.
sample_X = df[numeric_columns].head(5)
predicted_clusters = knn_pipeline.predict(sample_X)
print("\n🤖 KNN predicted clusters for first 5 entries:")
print(predicted_clusters)

# Save clustered data to CSV and JSON
df.to_csv("../data/properties_clustered.csv", index=False)
# The JSON export is announced below, so it has to be written as well.
# orient="records" produces a list of objects, one per property.
df.to_json("../data/properties_clustered.json", orient="records", indent=2)

print("💾 Clustered data saved to:")
print(" - ../data/properties_clustered.csv")
print(" - ../data/properties_clustered.json")

# Compact export: id, the clustered text, the label and the numeric columns.
# dict.fromkeys removes repeats while preserving order, so a column that is
# already among the numeric columns (e.g. "id" or "cluster") is written once.
columns_to_save = list(
    dict.fromkeys(["id", text_column, "cluster", *numeric_columns])
)
df[columns_to_save].to_csv("../data/properties_clustered_simple.csv", index=False)

✅ Model training complete using properties.json.
🔍 Preview of clustered properties (with assigned cluster labels):
    id                   address  cluster
   367       463 Conservatory Dr        4
163443   463 Conservatory Drive         4
   378            311 Janette St        4
130023       311 Janette Street         2
  2782              4056 Bath Rd        4
  2783     786 HIGH GATE PARK Dr        4
138739 786 HIGH GATE PARK Drive         4
  2763            784 Downing St        4
146595       784 Downing Street         2
   163          593 Roosevelt Dr        4

📊 Cluster distribution (how many properties in each cluster):
cluster
0      2
1      3
2     17
3      2
4    122
Name: count, dtype: int64

🤖 KNN predicted clusters for first 5 entries:
[4 4 4 2 4]
💾 Clustered data saved to:
 - ../data/properties_clustered.csv
 - ../data/properties_clustered.json
