In [55]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [56]:
# Load the feature table; the path is resolved relative to the notebook's
# working directory.
source_csv = "../house_prices_selection.csv"
data = pd.read_csv(source_csv)

In [57]:
# Remove SalePrice when the file has it, so it is not used as a clustering
# feature; errors="ignore" makes the drop a no-op when the column is absent.
data = data.drop(columns=["SalePrice"], errors="ignore")

# Column groups that drive the preprocessing branches: 64-bit numeric columns
# on one side, object-dtype columns on the other.
num_cols, cat_cols = (
    data.select_dtypes(include=dtype_group).columns
    for dtype_group in (["int64", "float64"], ["object"])
)

In [58]:
# Numeric branch: fill missing values with the column median, then scale.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown="ignore" is set on the encoder).
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Send each column group through its own branch.
column_branches = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [59]:
# Preprocess the raw columns and cluster the result into 4 groups.
# n_init is pinned explicitly: scikit-learn changed its default between
# releases (10 restarts -> "auto"), so leaving it implicit makes the cluster
# labels depend on the installed version and emits a FutureWarning on the
# releases in between. random_state keeps the restarts themselves repeatable.
kmeans_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("kmeans", KMeans(n_clusters=4, n_init=10, random_state=42))
])

In [60]:
# Fit the preprocessing step and KMeans in one pass over the whole table.
# No hold-out is taken here: the cluster labels read from this fit are
# attached to every row of `data` afterwards.
kmeans_pipeline.fit(data)

In [61]:
# Pull the per-row cluster assignment out of the fitted KMeans step and store
# it next to the features as a new column.
fitted_kmeans = kmeans_pipeline.named_steps["kmeans"]
cluster_labels = fitted_kmeans.labels_
data["ClusterLabel"] = cluster_labels

In [62]:
# Supervised framing: the cluster id is the target, every other column is an
# input feature.
y = data["ClusterLabel"]
X = data.drop("ClusterLabel", axis=1)

In [63]:
# Reuse the preprocessing step that was already fitted inside kmeans_pipeline
# rather than calling fit_transform again. `preprocessor` is the very object
# stored as the pipeline's "preprocess" step, so refitting it here would
# overwrite the fitted state the clustering was trained against. transform()
# keeps the classifier's inputs in exactly the feature space the cluster labels
# were derived from, and makes this cell safe to re-run.
X_processed = kmeans_pipeline.named_steps["preprocess"].transform(X)

In [64]:
# Untrained decision tree with a fixed seed for repeatable fits.
clf = DecisionTreeClassifier(random_state=42)

# Hold out 20% of the rows for evaluation; the seed fixes which rows land in
# each part.
split_parts = train_test_split(X_processed, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [65]:
# Train the tree on the training portion only; X_test / y_test are not used
# until the evaluation step.
clf.fit(X_train, y_train)

In [66]:
# Score the tree on the held-out rows: the share of rows whose predicted
# cluster id matches the one assigned by KMeans.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Decision Tree Accuracy: {accuracy:.3f}")

Decision Tree Accuracy: 0.955


In [67]:
# Persist the table together with its ClusterLabel column. SalePrice was
# dropped from `data` earlier (when present), so it is not part of this file.
# index=False leaves the row index out of the CSV.
output_csv = "../data/cluster_label_house_prices.csv"
data.to_csv(output_csv, index=False)