In [2]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.metrics import silhouette_score

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


In [3]:
df = pd.read_csv("subset_classification_cruceru_irina.csv")
df.head()

Unnamed: 0,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,Cluster,Kmeans_with_Sillhoutte,Avg_SalePrice
0,CBlock,TA,TA,No,Rec,922,Unf,0,392,1314,...,TA,Y,SBrkr,1314,0,0,1314,1,1,155454.590698
1,PConc,Gd,TA,No,Unf,0,Unf,0,799,799,...,Gd,Y,SBrkr,799,772,0,1571,3,1,189096.763636
2,CBlock,Fa,TA,No,Unf,0,Unf,0,796,796,...,Gd,Y,FuseA,796,0,0,796,4,1,118470.182796
3,BrkTil,Gd,TA,No,ALQ,569,Unf,0,162,731,...,Ex,Y,SBrkr,981,787,0,1768,3,1,189096.763636
4,BrkTil,TA,TA,No,LwQ,218,Unf,0,808,1026,...,TA,Y,SBrkr,1026,665,0,1691,1,1,155454.590698


In [5]:
X = df.drop("Cluster", axis=1)
y = df["Cluster"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
numeric_columns = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_columns = df.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_columns = [col for col in numeric_columns if col != "Cluster"]

In [12]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns)
])

In [13]:
logreg_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=2000))
])

logreg_pipe.fit(X_train, y_train)
y_pred = logreg_pipe.predict(X_test)

print("LogReg Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

LogReg Accuracy: 0.9914529914529915
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        23
           1       0.98      1.00      0.99        43
           2       1.00      0.97      0.99        39
           3       1.00      0.98      0.99        55
           4       1.00      1.00      1.00        19
           5       1.00      1.00      1.00        55

    accuracy                           0.99       234
   macro avg       0.99      0.99      0.99       234
weighted avg       0.99      0.99      0.99       234



In [14]:
tree_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", DecisionTreeClassifier(random_state=42))
])

tree_pipe.fit(X_train, y_train)
y_pred = tree_pipe.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Decision Tree Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        43
           2       1.00      1.00      1.00        39
           3       1.00      1.00      1.00        55
           4       1.00      1.00      1.00        19
           5       1.00      1.00      1.00        55

    accuracy                           1.00       234
   macro avg       1.00      1.00      1.00       234
weighted avg       1.00      1.00      1.00       234



In [15]:
rf_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        43
           2       1.00      1.00      1.00        39
           3       1.00      1.00      1.00        55
           4       1.00      1.00      1.00        19
           5       1.00      1.00      1.00        55

    accuracy                           1.00       234
   macro avg       1.00      1.00      1.00       234
weighted avg       1.00      1.00      1.00       234



Because the target labels come from well-separated K-Means clusters, all three classifiers achieve extremely high performance. Logistic Regression already reaches 99% accuracy, showing near-linear separability, while Decision Tree and Random Forest reach 100% accuracy.

This does not necessarily indicate strong real-world predictive reliability. Since the target labels were generated using K-Means on the same features, the classifiers essentially learn to reproduce the clustering structure rather than generalize to new, unseen data.”