In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [89]:
# Loading the housing dataset I will be working on from its CSV file

In [90]:
# Location of Housing.csv. The default is the original local path; set the
# HOUSING_CSV_PATH environment variable to load the dataset from elsewhere
# without editing this cell.
HOUSING_CSV_PATH = os.environ.get('HOUSING_CSV_PATH', r'C:\Users\juliu\Downloads\Housing.csv')
housing_data = pd.read_csv(HOUSING_CSV_PATH)

In [91]:
# Getting an overview of the dataset

In [92]:
# Peek at the first five rows to check the column names and value formats.
housing_data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [93]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing_data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [94]:
# (number of rows, number of columns) of the loaded frame.
housing_data.shape

(545, 13)

In [95]:
# Discard every row that contains at least one missing value. Rebinding the
# name (instead of inplace=True) keeps the step explicit and chainable.
housing_data = housing_data.dropna()

In [None]:
# Converting categorical variables to numerical

In [96]:
# One-hot encode the yes/no flags and the furnishing status so the models
# receive numeric inputs. Each listed column is replaced by one indicator
# column per category (e.g. mainroad -> mainroad_no / mainroad_yes).
categorical_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating',
                       'airconditioning', 'prefarea', 'furnishingstatus']

# After this cell has run once the raw columns no longer exist, so re-running
# it used to raise a KeyError. Skip the encoding when none of the raw columns
# are left; if only some are missing, get_dummies still raises as before.
if any(col in housing_data.columns for col in categorical_columns):
    housing_data = pd.get_dummies(housing_data, columns=categorical_columns)

In [99]:
# Inspect the first five rows again to confirm the new indicator columns.
housing_data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,False,True,True,False,...,False,True,False,False,True,False,True,True,False,False
1,12250000,8960,4,4,4,3,False,True,True,False,...,False,True,False,False,True,True,False,True,False,False
2,12250000,9960,3,2,2,2,False,True,True,False,...,True,True,False,True,False,False,True,False,True,False
3,12215000,7500,4,2,2,3,False,True,True,False,...,True,True,False,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,False,True,False,True,...,True,True,False,False,True,True,False,True,False,False


In [98]:
# Splitting the data into features(X) and target(y)

In [47]:
# Target: the sale price. Taken first so that a missing 'price' column still
# fails loudly with a KeyError.
y = housing_data['price']

# Features: every column except the target. 'price_range' is derived from the
# price further down the notebook; dropping it here too (ignored when absent)
# keeps the target from leaking into X if this cell is re-run after that
# column has been added to housing_data.
X = housing_data.drop(columns=['price', 'price_range'], errors='ignore')

In [100]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [103]:
# Candidate regressors to compare. The tree-based models are randomised, so
# they are seeded (same seed as the train/test split) to make the reported
# metrics reproducible from run to run.
regression_models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42)
]

In [None]:
# Now we fit each regression model and evaluate their performance

In [105]:
# Fit each regressor on the training split, then score its test-set predictions.
for model in regression_models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    model_name = type(model).__name__
    print(f"{model_name} MSE: {mse:.2f}, R^2: {r2:.2f}")

LinearRegression MSE: 1800793719718.96, R^2: 0.64
DecisionTreeRegressor MSE: 2727890775484.20, R^2: 0.46
RandomForestRegressor MSE: 2009346259516.32, R^2: 0.60


In [None]:
# From the results we can state that the linear regression model performs best on the test set: it has the lowest mean squared error and
# the highest R²

In [None]:
# Now we create a new column named 'price_range' that segments the data into three bins (low, medium, and high) according to the price.
# Both 'price' and the new column are dropped from the features, and 'price_range' becomes our new target.

In [51]:
# Price thresholds separating the three classes. pd.cut closes intervals on the
# right by default: low = (-1, 5000000], medium = (5000000, 8000000], high = above 8000000.
price_bin_edges = [-1, 5000000, 8000000, np.inf]
price_bin_labels = ['low', 'medium', 'high']
housing_data['price_range'] = pd.cut(
    housing_data['price'],
    bins=price_bin_edges,
    labels=price_bin_labels,
)

# Classification target, and features with both the raw price and its binned
# version removed so the target cannot be read off the inputs.
y_class = housing_data['price_range']
X_class = housing_data.drop(columns=['price', 'price_range'])

In [52]:
# Same 80/20 split and seed as the regression task, now with the class labels as target.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class,
    y_class,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Standardise the features using statistics learned from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train_class)
X_train_scaled = scaler.transform(X_train_class)
X_test_scaled = scaler.transform(X_test_class)

In [54]:
# Candidate classifiers to compare. The tree-based models are randomised, so
# they are seeded (same seed as the train/test split) to make the reported
# metrics reproducible from run to run.
classification_models = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42)
]

In [55]:
# Fit each classifier on the scaled training features and report test-set metrics.
# Precision, recall and F1 are averaged over the classes, weighted by class support.
for model in classification_models:
    y_pred_class = model.fit(X_train_scaled, y_train_class).predict(X_test_scaled)
    acc = accuracy_score(y_test_class, y_pred_class)
    prec, rec, f1 = (
        metric(y_test_class, y_pred_class, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )
    model_name = type(model).__name__
    print(f"{model_name} Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}, F1-score: {f1:.2f}")

LogisticRegression Accuracy: 0.75, Precision: 0.74, Recall: 0.75, F1-score: 0.73
DecisionTreeClassifier Accuracy: 0.69, Precision: 0.67, Recall: 0.69, F1-score: 0.68
RandomForestClassifier Accuracy: 0.72, Precision: 0.70, Recall: 0.72, F1-score: 0.69


Given these results, if we had to choose one model for categorizing houses into price ranges based on this dataset, Logistic Regression would be the preferred choice: on the held-out test set it has the highest accuracy, precision, recall, and F1-score of the three models.

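Accuracy: 75% of the price-range predictions made by the Logistic Regression model on the test set are correct.
Precision (weighted average across the three classes): When the model predicts a certain price range, it is correct about 74% of the time.
Recall (weighted average across the three classes): The model successfully identifies 75% of all relevant cases (houses that actually belong to each price range); with support-weighted averaging this equals the accuracy.
F1-Score (weighted average across the three classes): The harmonic mean of precision and recall is 73%, indicating a reasonable balance between how often the model's predictions are correct and its ability to recall relevant cases.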