In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

data = pd.read_csv("AmesHousing.xls")

data = data.dropna(thresh=len(data) * 0.5, axis=1)
data = data.dropna(subset=["SalePrice"])

data["PriceCategory"] = (data["SalePrice"] > data["SalePrice"].median()).astype(int)

X = data.drop(columns=["SalePrice", "PriceCategory"])
y = data["PriceCategory"]

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

num_imputer = SimpleImputer(strategy="median")
X_num = pd.DataFrame(num_imputer.fit_transform(X[num_cols]), columns=num_cols)

cat_imputer = SimpleImputer(strategy="most_frequent")
X_cat = pd.DataFrame(cat_imputer.fit_transform(X[cat_cols]), columns=cat_cols)

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_cat_encoded = pd.DataFrame(encoder.fit_transform(X_cat), columns=encoder.get_feature_names_out(cat_cols))

scaler = StandardScaler()
X_num_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=num_cols)

X_final = pd.concat([X_num_scaled, X_cat_encoded], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9402730375426621

Confusion Matrix:
 [[264  17]
 [ 18 287]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       281
           1       0.94      0.94      0.94       305

    accuracy                           0.94       586
   macro avg       0.94      0.94      0.94       586
weighted avg       0.94      0.94      0.94       586

