In [91]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import OneHotEncoder

In [92]:
data = pd.read_csv("../house_prices_selection.csv")

In [93]:
if "SalePrice" in data.columns:
    data = data.drop(columns=["SalePrice"])

num_cols = data.select_dtypes(include=["int64", "float64"]).columns
cat_cols = data.select_dtypes(include=["object"]).columns

In [94]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

In [95]:
kmeans_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("kmeans", KMeans(n_clusters=4, random_state=42))
])

In [96]:
kmeans_pipeline.fit(data)

In [97]:
cluster_labels = kmeans_pipeline.named_steps["kmeans"].labels_
data["ClusterLabel"] = cluster_labels

In [98]:
X = data.drop(columns=["ClusterLabel"])
y = data["ClusterLabel"]

In [99]:
X_processed = preprocessor.fit_transform(X)

In [100]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

clf = DecisionTreeClassifier(random_state=42)

In [101]:
clf.fit(X_train, y_train)

In [102]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.3f}")

Decision Tree Accuracy: 0.955


In [103]:
data.to_csv("../data/cluster_label_house_prices.csv", index=False)

In [104]:
y = data["ClusterLabel"]
X = data.drop(columns=["ClusterLabel"])

num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(exclude=np.number).columns

In [105]:
preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_cols),

    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

    LOGISTIC REGRESSION + DECISION TREE + KNN

In [106]:
clf1 = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=2000)),
        ("dt", DecisionTreeClassifier()),
        ("knn", KNeighborsClassifier())
    ],
    voting="hard"
)

pipeline1 = Pipeline([
    ("preprocess", preprocess),
    ("model", clf1)
])

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
}

In [107]:
pipeline_logistic_regression = Pipeline([
    ("preprocess", preprocess),
    ("model", models["LogisticRegression"])
])
scores_logistic_regression = cross_val_score(pipeline_logistic_regression, X, y, cv=5, scoring="accuracy")

pipeline_decision_tree = Pipeline([
    ("preprocess", preprocess),
    ("model", models["DecisionTree"])
])
scores_decision_tree = cross_val_score(pipeline_decision_tree, X, y, cv=5, scoring="accuracy")

pipeline_knn = Pipeline([
    ("preprocess", preprocess),
    ("model", models["KNN"])
])
scores_knn = cross_val_score(pipeline_knn, X, y, cv=5, scoring="accuracy")

scores1 = cross_val_score(pipeline1, X, y, cv=5, scoring="accuracy")
print(f"VotingClassifier Logistic Regression + Decision Tree + KNN: {scores1.mean():.3f}")

VotingClassifier Logistic Regression + Decision Tree + KNN: 0.975


    RANDOM FOREST + GRADIENT BOOSTING + SVM

In [108]:
clf2 = VotingClassifier(
    estimators=[
        ("rf", RandomForestClassifier()),
        ("gb", GradientBoostingClassifier()),
        ("svm", SVC())
    ],
    voting="hard"
)

pipeline2 = Pipeline([
    ("preprocess", preprocess),
    ("model", clf2)
])

models2 = {
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "SVM": SVC(),
}

In [109]:
pipe_random_forest = Pipeline([
    ("preprocess", preprocess),
    ("model", models2["RandomForest"])
])
scores_rf = cross_val_score(pipe_random_forest, X, y, cv=5, scoring="accuracy")

pipe_gradient_boosting = Pipeline([
    ("preprocess", preprocess),
    ("model", models2["GradientBoosting"])
])
scores_gb = cross_val_score(pipe_gradient_boosting, X, y, cv=5, scoring="accuracy")

pipe_svm = Pipeline([
    ("preprocess", preprocess),
    ("model", models2["SVM"])
])
scores_svm = cross_val_score(pipe_svm, X, y, cv=5, scoring="accuracy")

scores2 = cross_val_score(pipeline2, X, y, cv=5, scoring="accuracy")
print(f"VotingClassifier Random Forest + Gradient Boosting + SVM: {scores2.mean():.3f}")

VotingClassifier Random Forest + Gradient Boosting + SVM: 0.968


    The Logistic Regression + Decision Tree + KNN VotingClassifier performed best with an accuracy of 0.977, slightly higher than the Random Forest + Gradient Boosting + SVM VotingClassifier combination 0.968.

    Its better performance is likely due to the higher diversity of the three models, which complement each other well.