In [241]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [242]:
data = pd.read_csv("../data/subset_dataset_frunza_roxana.csv")

In [243]:
data.isna().sum()

BldgType          0
HouseStyle        0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType      683
MasVnrArea        6
SalePrice         0
dtype: int64

In [244]:
data = data.drop(columns=["MasVnrType"])
data_kmeans = data.copy()

In [245]:
price_categories = ["cheap", "medium", "expensive"]

In [246]:
def categorize_price(price):
    if price < 150000:
        return "cheap"
    elif price < 300000:
        return "medium"
    else:
        return "expensive"

data["PriceCategory"] = data["SalePrice"].apply(categorize_price)
data = data.drop(columns=["SalePrice"])

print(data["PriceCategory"].value_counts())

PriceCategory
medium       601
cheap        476
expensive     91
Name: count, dtype: int64


In [247]:
X = data.drop(columns=["PriceCategory"])
y = data["PriceCategory"]

In [248]:
numeric_columns = X.select_dtypes(include=["int64", "float64"]).columns
categorical_columns = X.select_dtypes(include=["object"]).columns

In [249]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns)
])

In [250]:
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", DecisionTreeClassifier(random_state=42))
])

In [251]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe.fit(X_train, y_train)

In [252]:
y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.756

Classification Report:
               precision    recall  f1-score   support

       cheap       0.78      0.74      0.76        95
   expensive       0.65      0.72      0.68        18
      medium       0.76      0.78      0.77       121

    accuracy                           0.76       234
   macro avg       0.73      0.75      0.74       234
weighted avg       0.76      0.76      0.76       234


Confusion Matrix:
 [[70  0 25]
 [ 0 13  5]
 [20  7 94]]


In [253]:
kmeans = KMeans(n_clusters=2, random_state=42)
data_kmeans["Cluster"] = kmeans.fit_predict(data_kmeans[["SalePrice"]])

cluster_means = data_kmeans.groupby("Cluster")["SalePrice"].mean().sort_values()
label_map = {cluster_means.index[0]: "cheap", cluster_means.index[1]: "expensive"}

data_kmeans["PriceCategory"] = data_kmeans["Cluster"].map(label_map)

data_kmeans = data_kmeans.drop(columns=["Cluster", "SalePrice"])

In [254]:
X_kmeans = data_kmeans.drop(columns=["PriceCategory"])
y_kmeans = data_kmeans["PriceCategory"]

numeric_cols = X_kmeans.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X_kmeans.select_dtypes(include=["object"]).columns

In [255]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])

In [256]:
pipe_kmeans = Pipeline([
    ("preprocessor", preprocessor),
    ("model", DecisionTreeClassifier(random_state=42))
])

In [257]:
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(
    X_kmeans, y_kmeans, test_size=0.2, random_state=42, stratify=y_kmeans
)

pipe_kmeans.fit(X_train_k, y_train_k)

In [258]:
y_pred_k = pipe_kmeans.predict(X_test_k)
accuracy_kmeans = accuracy_score(y_test_k, y_pred_k)

print(f"Accuracy for KMeans categories): {accuracy_kmeans:.3f}")

Accuracy for KMeans categories): 0.825


In [259]:
print(f"3 class categories: {accuracy:.3f}")
print(f"2 clusters with KMeans: {accuracy_kmeans:.3f}")

3 class categories: 0.756
2 clusters with KMeans: 0.825


    The model with KMeans categories (2 clusters) had a higher accuracy (0.825) than the model with manual 3 classes (0.756), this means the KMeans approach worked better.

In [260]:
pipe_rf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42))
])

In [261]:
pipe_rf.fit(X_train, y_train)

In [262]:
y_pred_rf = pipe_rf.predict(X_test)

In [263]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.3f}")

Random Forest Accuracy: 0.778


In [264]:
pipe_rf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42))
])

param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(
    pipe_rf,
    param_grid,
    cv=5,               
    scoring="accuracy",  
    n_jobs=-1,           
)

In [265]:
grid_search.fit(X_train, y_train)

In [266]:
print(grid_search.best_params_)

{'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}


In [267]:
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

In [268]:
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"\nOptimized Random Forest Accuracy: {accuracy_best:.3f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


Optimized Random Forest Accuracy: 0.765

Classification Report:
               precision    recall  f1-score   support

       cheap       0.76      0.81      0.79        95
   expensive       0.73      0.44      0.55        18
      medium       0.77      0.78      0.77       121

    accuracy                           0.76       234
   macro avg       0.75      0.68      0.70       234
weighted avg       0.76      0.76      0.76       234


Confusion Matrix:
 [[77  0 18]
 [ 0  8 10]
 [24  3 94]]


In [286]:
pipe_xgb = Pipeline([
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        eval_metric="mlogloss"
    ))
])

pipe_xgb.fit(X_train, y_train)
y_pred_xgb = pipe_xgb.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb, average="weighted")

print(f" Accuracy: {acc_xgb:.3f}")
print(classification_report(y_test, y_pred_xgb))

 Accuracy: 0.991
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       107
           1       0.98      1.00      0.99       127

    accuracy                           0.99       234
   macro avg       0.99      0.99      0.99       234
weighted avg       0.99      0.99      0.99       234



In [285]:
print(f"Decision Tree (3 Price Categories): {accuracy:.3f}")
print(f"2 Clusters - KMeans: {accuracy_kmeans:.3f}")
print(f"Random Forest Accuracy: {accuracy_rf:.3f}")
print(f"Optimized Random Forest Accuracy with GridSearchCV: {accuracy_best:.3f}")
print(f"XGBoost: {acc_xgb:.3f}")

Decision Tree (3 Price Categories): 0.756
2 Clusters - KMeans: 0.825
Random Forest Accuracy: 0.778
Optimized Random Forest Accuracy with GridSearchCV: 0.765
XGBoost: 0.991


    Using K-Means with k = 2 turned out to work much better than the three manual price categories. With only two clusters, the data splits into two very clear groups, which makes it much easier for the models to learn the difference between them. 
    
    Because the classes are clearer, the accuracy becomes much higher—XGBoost even reached 0.991. So in this case, K-Means with k = 2 gives cleaner labels than the cheap/medium/expensive split, and the models perform better because of it.

In [270]:
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("../data/subset_classification_frunza_roxana.csv")

y = df["Label"]
X = df.drop(columns=["Label"])

In [271]:
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

In [272]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])

In [273]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [274]:
pipeline_logistic_regression = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

pipeline_logistic_regression.fit(X_train, y_train)
y_pred_lr = pipeline_logistic_regression.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Accuracy: 1.0


In [275]:
pipeline_decision_tree = Pipeline([
    ("preprocessor", preprocessor),
    ("model", DecisionTreeClassifier(random_state=42))
])

pipeline_decision_tree .fit(X_train, y_train)
y_pred_dt = pipeline_decision_tree .predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_dt))

Accuracy: 0.9829059829059829


In [276]:
pipeline_random_forest = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42))
])

pipeline_random_forest.fit(X_train, y_train)
y_pred_rf = pipeline_random_forest.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Accuracy: 0.9914529914529915


In [277]:
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_lr):.3f}")
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt):.3f}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")

Logistic Regression Accuracy: 1.000
Decision Tree Accuracy: 0.983
Random Forest Accuracy: 0.991


    The results obtained on the new dataset show improvement in model performance across all classifiers. Logistic Regression has a perfect accuracy score of 1.00, followed by Random Forest (0.991) and Decision Tree (0.983). These values indicate that the separation between the classes formed through K-Means, allowing the classifiers to learn simple and highly effective decision boundaries. 
    In comparison, in the previous lab where the labels were manually assigned based on price ranges (cheap, medium, expensive), the models achieved much lower accuracies, typically between 0.75 and 0.78.
    The main reason for this difference lies in the labels. The manual three category price split relied on arbitrary, which resulted in overlapping and less defined classes. This made the classification task more difficult for all models. On the other hand, the labels produced by K-Means reflect natural structures in the data, grouping together homes that are genuinely similar based on multiple numerical features. This internal consistency of the clusters makes them far easier for the models to distinguish.

    Overall, the classifiers distinguish the K-Means based groups far more effectively than the price threshold categories used previously. This demonstrates that integrating clustering with supervised learning can produce more reliable classifications and a stronger overall performance.