In [17]:
# Importing necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, HuberRegressor
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [18]:
# Loading the data

df = pd.read_csv('USA_Housing.csv')
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [19]:
# Data preprocessing

x = df.drop(['Price', 'Address'], axis = 1)
y = df['Price']

In [20]:
x

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
0,79545.458574,5.682861,7.009188,4.09,23086.800503
1,79248.642455,6.002900,6.730821,3.09,40173.072174
2,61287.067179,5.865890,8.512727,5.13,36882.159400
3,63345.240046,7.188236,5.586729,3.26,34310.242831
4,59982.197226,5.040555,7.839388,4.23,26354.109472
...,...,...,...,...,...
4995,60567.944140,7.830362,6.137356,3.46,22837.361035
4996,78491.275435,6.999135,6.576763,4.02,25616.115489
4997,63390.686886,7.250591,4.805081,2.13,33266.145490
4998,68001.331235,5.534388,7.130144,5.44,42625.620156


In [21]:
y

0       1.059034e+06
1       1.505891e+06
2       1.058988e+06
3       1.260617e+06
4       6.309435e+05
            ...     
4995    1.060194e+06
4996    1.482618e+06
4997    1.030730e+06
4998    1.198657e+06
4999    1.298950e+06
Name: Price, Length: 5000, dtype: float64

In [22]:
# Splitting the data

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [23]:
# Define models

models = {
    'LinearRegression': LinearRegression(),
    'RobustRegression': HuberRegressor(),
    'RidgeRegression': Ridge(),
    'LassoRegression': Lasso(),
    'ElasticNet': ElasticNet(),
    'PolynomialRegression': Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ]),
    'SGDRegressor': SGDRegressor(),
    'ANN': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000),
    'RandomForest': RandomForestRegressor(),
    'SVM': SVR(),
    'LGBM': lgb.LGBMRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    'KNN': KNeighborsRegressor()
}

In [24]:
# Training the models and evaluation

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred =  model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'Model': name,
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    })

    with open(f'{name}.pkl', 'wb') as f:
        pickle.dump(model, f)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1256
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 5
[LightGBM] [Info] Start training from score 1231911.452183


In [25]:
# Convert results into DataFrame and save to csv

results_df = pd.DataFrame(results)
results_df.to_csv('model_results.csv', index = False)

print('Models have been trained and saved to pickle files. Evaluation results have been saved to model_results.csv.')


Models have been trained and saved to pickle files. Evaluation results have been saved to model_results.csv.


### **Good Performers:**


Linear Regression, Ridge Regression, Lasso Regression:

R²: ~0.9146, indicating these models explain around 91.46% of the variance, which is a strong performance.
MSE and MAE are in reasonable ranges for these models as well.


Polynomial Regression:

Similar performance to the linear models, with an R² of around 0.9142.


ElasticNet:

R²: 0.8780, which is still a strong result, though slightly lower than the above models.


RandomForest and LGBM:

R² values around 0.8774 (RandomForest) and 0.8940 (LGBM), indicating they are still performing well but slightly behind the linear models.


### **Underperformers:**


Robust Regression:

R²: 0.5011, indicating this model explains only around 50% of the variance in the data, which is relatively low.


ANN (Neural Network):

R²: 0.5024, which is similarly low and indicates suboptimal performance. The high MSE and MAE also suggest issues with this model's fit.


KNN:

R²: 0.5114, indicating it explains around 51% of the variance, which is not great compared to other models.


SVM:

R²: ~0.0004, indicating this model is performing extremely poorly, almost like a random prediction. It has the highest MAE, making it unsuitable for this task.


SGD Regressor:

This model's MSE and MAE are astronomically high, and the R² is negative, which suggests a catastrophic failure in fitting the model (likely due to convergence issues).