In [38]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [39]:
# Read the house-sales CSV (path is relative to the working directory) into `df`.
DATA_PATH = "house_sales_data.csv"
df = pd.read_csv(DATA_PATH)

In [40]:
# Per-column count of missing values, before any imputation.
print(df.isna().sum())

Square_Footage            0
Bedrooms                  0
Bathrooms                 0
Age                       0
Garage_Spaces            26
Lot_Size                 26
Floors                    0
Neighborhood_Rating       0
Condition                 0
School_Rating            32
Has_Pool                  0
Renovated                 0
Location_Type             0
Distance_To_Center_KM     0
Days_On_Market           28
Price                     0
Sold_Within_Week          0
dtype: int64


In [41]:
# Impute missing values in every feature column; the two target columns are
# excluded from the loop and left untouched.
targets = ["Price", "Sold_Within_Week"]
features = df.drop(columns=targets)

for col in features.columns:
    column = df[col]
    if not column.isnull().any():
        continue  # nothing to fill in this column

    # Fill-value rule: float64/int64 columns and non-numeric columns use the
    # most frequent value (the first one returned by mode() on ties); any other
    # numeric dtype uses the median.
    # NOTE(review): the mode is an unusual choice for continuous measurements
    # such as Lot_Size -- confirm the median was not intended for those columns.
    if pd.api.types.is_numeric_dtype(column) and column.dtype not in ("float64", "int64"):
        fill_value = column.median()
    else:
        fill_value = column.mode()[0]

    # Assign the filled Series back to the frame. Calling
    # `df[col].fillna(..., inplace=True)` operates on a temporary Series: it
    # emits a FutureWarning on pandas 2.x and does not modify `df` at all once
    # Copy-on-Write is enabled.
    df[col] = column.fillna(fill_value)

# NOTE(review): the fill values are computed from the full dataset before the
# train/test split below, so test rows influence them -- consider moving
# imputation into the pipeline (e.g. a SimpleImputer step).
print(df.isnull().sum())


Square_Footage           0
Bedrooms                 0
Bathrooms                0
Age                      0
Garage_Spaces            0
Lot_Size                 0
Floors                   0
Neighborhood_Rating      0
Condition                0
School_Rating            0
Has_Pool                 0
Renovated                0
Location_Type            0
Distance_To_Center_KM    0
Days_On_Market           0
Price                    0
Sold_Within_Week         0
dtype: int64


In [42]:
# Split the frame into the feature matrix and the two prediction targets.
y_price = df["Price"]
y_sold = df["Sold_Within_Week"]
X = df.drop(["Price", "Sold_Within_Week"], axis=1)

In [43]:
# Column groups for preprocessing: the listed categorical column(s), and
# every remaining feature column as `numerical` (a pandas Index).
categorical = ["Location_Type"]
numerical = X.columns.drop(labels=categorical)

In [44]:
# Column-wise preprocessing: standardize the numerical columns and one-hot
# encode the categorical ones. handle_unknown="ignore" encodes a category not
# seen during fit as all zeros instead of raising at transform time.
numeric_step = ("num", StandardScaler(), numerical)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [45]:
# Price regression: hold out 30% of the rows (fixed seed for reproducibility),
# fit preprocessing + LinearRegression on the rest, and score on the hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_price, test_size=0.3, random_state=42
)

linreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])
linreg_pipeline.fit(X_train, y_train)
y_pred = linreg_pipeline.predict(X_test)

# RMSE is the square root of the mean squared error, i.e. in the target's units.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Linear Regression R²:", r2_score(y_test, y_pred))
print("Linear Regression RMSE:", rmse)

Linear Regression R²: 0.8379579106447366
Linear Regression RMSE: 78725.04457077035


In [46]:
# Sold_Within_Week classification: preprocessing + LogisticRegression.
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object here would refit the very transformer that linreg_pipeline holds.
logreg_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", LogisticRegression())
])

# Same test_size and random_state as the regression split, so the rows are
# partitioned identically; X_train/X_test/y_train/y_test/y_pred are rebound
# here to the classification task.
# NOTE(review): X still contains Days_On_Market (only the two targets were
# dropped). If Sold_Within_Week is derived from it, the scores below are
# inflated by target leakage -- confirm before trusting them.
X_train, X_test, y_train, y_test = train_test_split(X, y_sold, test_size=0.3, random_state=42)
logreg_pipeline.fit(X_train, y_train)
y_pred = logreg_pipeline.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Logistic Regression Accuracy: 0.9633333333333334
Precision: 0.9337349397590361
Recall: 1.0
Confusion Matrix:
 [[134  11]
 [  0 155]]


In [47]:
import joblib

In [48]:
# Serialize the fitted price pipeline (preprocessing + model) to disk;
# joblib.dump returns the list of file names it wrote.
joblib.dump(value=linreg_pipeline, filename="price_predicter.pkl")

['price_predicter.pkl']

In [49]:
# Serialize the fitted sold-within-week pipeline (preprocessing + model) to
# disk; joblib.dump returns the list of file names it wrote.
joblib.dump(value=logreg_pipeline, filename="sales_predicter.pkl")

['sales_predicter.pkl']