In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [32]:
# Load the dataset CSV; the path is relative to the notebook's working directory.
csv_path = "../data/subset_dataset_frunza_roxana.csv"
data = pd.read_csv(csv_path)

In [33]:
# Number of missing values in each column (isnull is the pandas alias of isna).
data.isnull().sum()

BldgType          0
HouseStyle        0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType      683
MasVnrArea        6
SalePrice         0
dtype: int64

In [34]:
# MasVnrType is missing in 683 rows (see the null counts above); the column is
# removed rather than imputed.
data = data.drop("MasVnrType", axis=1)

# Independent copy that still contains SalePrice; the KMeans labelling cell
# works on this frame.
data_kmeans = data.copy(deep=True)

In [35]:
# Price-bucket labels, listed from lowest to highest price.
price_categories = [
    "cheap",
    "medium",
    "expensive",
]

In [36]:
def categorize_price(price, cheap_below=150000, expensive_from=300000):
    """Map a sale price to a coarse price bucket.

    Parameters
    ----------
    price : number
        Sale price to classify.
    cheap_below : number, default 150000
        Prices strictly below this value are labelled "cheap".
    expensive_from : number, default 300000
        Prices at or above this value are labelled "expensive".

    Returns
    -------
    str or None
        "cheap", "medium" or "expensive"; None when ``price`` is missing.

    Raises
    ------
    ValueError
        If ``cheap_below`` is greater than ``expensive_from``.
    """
    if cheap_below > expensive_from:
        raise ValueError("cheap_below must not exceed expensive_from")

    # NaN compares False against every threshold, so without this guard a
    # missing price would fall through to the "expensive" branch.
    if pd.isna(price):
        return None

    if price < cheap_below:
        return "cheap"
    if price < expensive_from:
        return "medium"
    return "expensive"

# Derive the class label from SalePrice, append it as PriceCategory, and drop
# SalePrice so it is not part of the feature columns.
price_labels = data["SalePrice"].apply(categorize_price)
data = data.assign(PriceCategory=price_labels).drop(columns=["SalePrice"])

# Show how many rows fall into each price bucket.
print(data["PriceCategory"].value_counts())

PriceCategory
medium       601
cheap        476
expensive     91
Name: count, dtype: int64


In [37]:
# Separate the target column from the feature columns.
target_name = "PriceCategory"
y = data[target_name]
X = data.drop(target_name, axis=1)

In [38]:
# Partition the feature columns by dtype for the ColumnTransformer built below.
# "number" matches every numeric dtype (not only int64/float64), and every
# remaining column is treated as categorical. ColumnTransformer drops columns
# it is not given (remainder="drop" by default), so a partition that misses a
# dtype would silently discard that feature.
numeric_columns = X.select_dtypes(include="number").columns
categorical_columns = X.select_dtypes(exclude="number").columns

In [39]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

In [40]:
# Baseline model: preprocessing followed by a decision tree with a fixed seed.
tree_model = DecisionTreeClassifier(random_state=42)
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", tree_model)])

In [41]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Fit preprocessing and the tree on the training part only.
pipe.fit(X=X_train, y=y_train)

In [42]:
# Score the decision tree on the held-out rows.
y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
tree_report = classification_report(y_test, y_pred)
tree_confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:\n", tree_report)
print("\nConfusion Matrix:\n", tree_confusion)

Accuracy: 0.756

Classification Report:
               precision    recall  f1-score   support

       cheap       0.78      0.74      0.76        95
   expensive       0.65      0.72      0.68        18
      medium       0.76      0.78      0.77       121

    accuracy                           0.76       234
   macro avg       0.73      0.75      0.74       234
weighted avg       0.76      0.76      0.76       234


Confusion Matrix:
 [[70  0 25]
 [ 0 13  5]
 [20  7 94]]


In [43]:
# Derive a two-level price label by clustering on SalePrice alone.
# n_init is set explicitly because scikit-learn changed its default from 10 to
# "auto" in version 1.4; pinning it keeps the clustering consistent across
# library versions and avoids the FutureWarning emitted by 1.2/1.3.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
data_kmeans["Cluster"] = kmeans.fit_predict(data_kmeans[["SalePrice"]])

# Cluster ids are arbitrary, so rank the clusters by mean price and name them
# in ascending order.
cluster_means = data_kmeans.groupby("Cluster")["SalePrice"].mean().sort_values()
label_map = dict(zip(cluster_means.index, ["cheap", "expensive"]))

data_kmeans["PriceCategory"] = data_kmeans["Cluster"].map(label_map)

# Keep only the features and the new label.
data_kmeans = data_kmeans.drop(columns=["Cluster", "SalePrice"])

In [44]:
# Features and target for the KMeans-labelled variant of the problem.
X_kmeans = data_kmeans.drop(columns=["PriceCategory"])
y_kmeans = data_kmeans["PriceCategory"]

# "number" matches every numeric dtype (not only int64/float64), and every
# remaining column is treated as categorical. ColumnTransformer drops columns
# it is not given (remainder="drop" by default), so a partition that misses a
# dtype would silently discard that feature.
numeric_cols = X_kmeans.select_dtypes(include="number").columns
categorical_cols = X_kmeans.select_dtypes(exclude="number").columns

In [45]:
# Same preprocessing recipe as for the 3-class model, rebuilt for the
# KMeans-labelled feature frame. Note that this rebinds numeric_transformer,
# categorical_transformer and preprocessor.
median_imputer = SimpleImputer(strategy="median")
mode_imputer = SimpleImputer(strategy="most_frequent")

numeric_transformer = Pipeline(
    steps=[("imputer", median_imputer), ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", mode_imputer),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [46]:
# Decision-tree pipeline for the two KMeans-derived classes (same seed as the baseline).
tree_model_kmeans = DecisionTreeClassifier(random_state=42)
pipe_kmeans = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", tree_model_kmeans)]
)

In [47]:
# 80/20 split stratified on the KMeans label; because the stratification
# target differs, this is not the same partition as X_train / X_test.
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(
    X_kmeans,
    y_kmeans,
    stratify=y_kmeans,
    random_state=42,
    test_size=0.2,
)

pipe_kmeans.fit(X=X_train_k, y=y_train_k)

In [48]:
# Score the KMeans-label decision tree on its own held-out rows.
y_pred_k = pipe_kmeans.predict(X_test_k)
accuracy_kmeans = accuracy_score(y_test_k, y_pred_k)

# The label previously contained a stray ")" ("categories): ..."); removed.
print(f"Accuracy for KMeans categories: {accuracy_kmeans:.3f}")
print("\nClassification report:\n", classification_report(y_test_k, y_pred_k))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_k, y_pred_k))

Accuracy for KMeans categories): 0.825

Classification report:
               precision    recall  f1-score   support

       cheap       0.89      0.88      0.88       179
   expensive       0.62      0.65      0.64        55

    accuracy                           0.82       234
   macro avg       0.76      0.77      0.76       234
weighted avg       0.83      0.82      0.83       234


Confusion Matrix:
 [[157  22]
 [ 19  36]]


In [49]:
# Side-by-side accuracy of the two decision-tree experiments.
tree_scores = [
    ("3 class categories", accuracy),
    ("2 clusters with KMeans", accuracy_kmeans),
]
for title, value in tree_scores:
    print(f"{title}: {value:.3f}")

3 class categories: 0.756
2 clusters with KMeans: 0.825


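    The model trained on the KMeans-derived categories (2 clusters) reached a higher accuracy (0.825) than the model trained on the manual 3 classes (0.756), so by raw accuracy the KMeans approach worked better. The comparison is not like-for-like, however: a 2-class problem is easier than a 3-class one. Always predicting the majority class would already score 179/234 ≈ 0.765 on the KMeans test set, versus 121/234 ≈ 0.517 on the 3-class test set, so the gap in accuracy alone does not show that the KMeans labelling gives a better model.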

In [50]:
# Build a dedicated preprocessor for the random forest. The notebook-level
# `preprocessor` was rebound for the KMeans experiment and is already a step
# of `pipe_kmeans`; Pipeline.fit fits its steps in place, so reusing that
# object here would refit pipe_kmeans' preprocessor on this experiment's
# training split. The recipe and the columns (numeric_columns /
# categorical_columns, derived from X) are the same as for the baseline tree.
rf_preprocessor = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]), numeric_columns),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]), categorical_columns),
])

pipe_rf = Pipeline([
    ("preprocessor", rf_preprocessor),
    ("model", RandomForestClassifier(random_state=42)),
])

In [51]:
# Train the random-forest pipeline on the 3-class training split.
pipe_rf.fit(X=X_train, y=y_train)

In [52]:
# Predict price categories for the held-out 3-class test rows.
y_pred_rf = pipe_rf.predict(X=X_test)

In [53]:
# Score the untuned random forest on the held-out rows.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf:.3f}")
print("\nClassification Report:\n", rf_report)
print("\nConfusion Matrix:\n", rf_confusion)

Random Forest Accuracy: 0.778

Classification Report:
               precision    recall  f1-score   support

       cheap       0.78      0.81      0.79        95
   expensive       0.69      0.61      0.65        18
      medium       0.79      0.78      0.78       121

    accuracy                           0.78       234
   macro avg       0.75      0.73      0.74       234
weighted avg       0.78      0.78      0.78       234


Confusion Matrix:
 [[77  0 18]
 [ 0 11  7]
 [22  5 94]]


In [54]:
# Fresh, unfitted random-forest pipeline to tune (rebinds pipe_rf).
forest = RandomForestClassifier(random_state=42)
pipe_rf = Pipeline(steps=[("preprocessor", preprocessor), ("model", forest)])

# Hyper-parameters of the "model" step to search: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by accuracy with 5-fold cross-validation,
# using all available cores.
grid_search = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

In [55]:
# Run the cross-validated search on the 3-class training split only.
grid_search.fit(X=X_train, y=y_train)

In [56]:
# Parameter combination with the best mean cross-validated accuracy.
best_params = grid_search.best_params_
print(best_params)

{'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}


In [57]:
# Pipeline refitted by the search with the winning parameters, applied to the
# held-out test rows.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X=X_test)

In [58]:
# Score the tuned random forest on the held-out rows.
accuracy_best = accuracy_score(y_test, y_pred_best)
best_report = classification_report(y_test, y_pred_best)
best_confusion = confusion_matrix(y_test, y_pred_best)

print(f"\nOptimized Random Forest Accuracy: {accuracy_best:.3f}")
print("\nClassification Report:\n", best_report)
print("\nConfusion Matrix:\n", best_confusion)


Optimized Random Forest Accuracy: 0.765

Classification Report:
               precision    recall  f1-score   support

       cheap       0.76      0.81      0.79        95
   expensive       0.73      0.44      0.55        18
      medium       0.77      0.78      0.77       121

    accuracy                           0.76       234
   macro avg       0.75      0.68      0.70       234
weighted avg       0.76      0.76      0.76       234


Confusion Matrix:
 [[77  0 18]
 [ 0  8 10]
 [24  3 94]]


In [59]:
# Final recap: test accuracy of every model trained in this notebook.
summary_rows = [
    ("Decision Tree (3 Price Categories)", accuracy),
    ("2 Clusters - KMeans", accuracy_kmeans),
    ("Random Forest Accuracy", accuracy_rf),
    ("Optimized Random Forest Accuracy with GridSearchCV", accuracy_best),
]
for title, value in summary_rows:
    print(f"{title}: {value:.3f}")

Decision Tree (3 Price Categories): 0.756
2 Clusters - KMeans: 0.825
Random Forest Accuracy: 0.778
Optimized Random Forest Accuracy with GridSearchCV: 0.765
