In [None]:
import os

import joblib
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the property records from the JSON file into a DataFrame.
df = pd.read_json("../data/properties_simple.json")

# Fill missing values with a neutral default chosen per column dtype:
#   - text columns (object dtype or pandas' dedicated string dtype) -> ""
#   - numeric columns                                               -> 0
#   - anything else                                                 -> "unknown"
# The string dtype is checked explicitly because it does not compare equal to
# "object"; without that check such text columns would fall through to the
# "unknown" branch, turning every missing text into the literal word "unknown"
# instead of an empty document.
for column in df.columns:
    column_dtype = df[column].dtype
    if pd.api.types.is_object_dtype(column_dtype) or isinstance(
        column_dtype, pd.StringDtype
    ):
        df[column] = df[column].fillna("")
    elif pd.api.types.is_numeric_dtype(column_dtype):
        df[column] = df[column].fillna(0)
    else:
        df[column] = df[column].fillna("unknown")

# Decide which column supplies the text to vectorize.
# Preference order: an explicit "description" column, then the first
# object/string-typed column, and finally a synthetic column that joins
# every field of a row into one space-separated string.
text_columns = df.select_dtypes(include=["object", "string"]).columns
has_description = "description" in df.columns

if not has_description and len(text_columns) == 0:
    # Nothing text-like is available, so build a text column from all fields.
    print("⚠️ No text column found. Falling back to using all columns as text.")
    df["combined_text"] = df.astype(str).agg(" ".join, axis=1)
    text_column = "combined_text"
else:
    text_column = "description" if has_description else text_columns[0]

# Vectorize the selected text column with TF-IDF (English stop words removed,
# vocabulary capped at 100 terms).
# The column is cast to str first: an object column may still hold non-string
# values (numbers, lists, dicts), and TfidfVectorizer expects string documents.
vectorizer = TfidfVectorizer(stop_words="english", max_features=100)
X = vectorizer.fit_transform(df[text_column].astype(str))

# Cluster the properties.
# KMeans cannot usefully place more centroids than there are distinct points:
# with fewer samples than clusters it raises ValueError, and with duplicated
# points it warns and leaves clusters empty. Cap the requested cluster count at
# the number of distinct TF-IDF vectors (the matrix has at most 100 columns, so
# densifying it for the duplicate check is cheap).
max_clusters = 5
n_distinct_vectors = len(pd.DataFrame(X.toarray()).drop_duplicates())
n_clusters = max(1, min(max_clusters, n_distinct_vectors))
if n_clusters < max_clusters:
    print(
        f"⚠️ Only {n_distinct_vectors} distinct text vectors found; "
        f"using {n_clusters} clusters instead of {max_clusters}."
    )
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
kmeans.fit(X)

# Add cluster labels to DataFrame
df["cluster"] = kmeans.labels_

# Save models (create the target directory first so a fresh checkout works).
os.makedirs("../models", exist_ok=True)
joblib.dump(vectorizer, "../models/vectorizer.joblib")
joblib.dump(kmeans, "../models/cluster_model.joblib")

print("✅ Training complete. Clustered data:")
print(df.to_string(index=False))



✅ Training complete. Clustered data:
 id  price  area  bedrooms  bathrooms  pool           title  balconies  cluster
  1 100000  1000         2          1     0 Beautiful House        0.0        1
  3 100000  1000         2          1     0                        0.0        0
  5 100000  1000         2          1     0                        0.0        0
  2 100000  1000         2          1     0                        0.0        0
  4 100000  1000         2          1     0                        0.0        0
  6 100000  1000         2          1     0                        1.0        0
  7 100000  1000         2          1     0                        2.0        0
  8 100000  1000         2          1     0                        0.0        0


  return fit_method(estimator, *args, **kwargs)
