In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, HuberRegressor
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [12]:
# Load dataset
data = pd.read_csv('USA_Housing.csv')
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.002900,6.730821,3.09,40173.072174,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.865890,8.512727,5.13,36882.159400,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1.260617e+06,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,6.309435e+05,USNS Raymond\nFPO AE 09386
...,...,...,...,...,...,...,...
4995,60567.944140,7.830362,6.137356,3.46,22837.361035,1.060194e+06,USNS Williams\nFPO AP 30153-7653
4996,78491.275435,6.999135,6.576763,4.02,25616.115489,1.482618e+06,"PSC 9258, Box 8489\nAPO AA 42991-3352"
4997,63390.686886,7.250591,4.805081,2.13,33266.145490,1.030730e+06,"4215 Tracy Garden Suite 076\nJoshualand, VA 01..."
4998,68001.331235,5.534388,7.130144,5.44,42625.620156,1.198657e+06,USS Wallace\nFPO AE 73316


In [13]:
data.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

In [4]:
# Preprocessing
X = data.drop(['Price', 'Address'], axis=1)
y = data['Price']

In [14]:
# Define numerical features
numerical_features = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population'
]


In [15]:
# Preprocess the data
X_train, X_test, y_train, y_test = train_test_split(X[numerical_features], y, test_size=0.2, random_state=0)

In [16]:
# Define models
models = {
    'LinearRegression': LinearRegression(),
    'RidgeRegression': Ridge(),
    'LassoRegression': Lasso(),
    'ElasticNet': ElasticNet(),
    'SGDRegressor': SGDRegressor(),
    'RandomForest': RandomForestRegressor(),
    'SVM': SVR(),
    'LGBM': lgb.LGBMRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    'KNN': KNeighborsRegressor()
}

In [17]:
# Train and evaluate models
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'Model': name,
        'MAE': mae,
        'MSE': mse,
        'R2': r2
    })

    # Save model as a pickle file
    with open(f'{name}.pkl', 'wb') as f:
        pickle.dump(model, f)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1256
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 5
[LightGBM] [Info] Start training from score 1231911.452183


In [18]:

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('model_results_2.csv', index=False)

print("Models have been trained and saved as pickle files. Evaluation results have been saved to model_evaluation_results.csv.")


Models have been trained and saved as pickle files. Evaluation results have been saved to model_evaluation_results.csv.


### **Good Performers:**


Linear Regression, Ridge Regression, Lasso Regression:

R²: ~0.9146, indicating these models explain around 91.46% of the variance, which is a strong performance.
MSE and MAE are in reasonable ranges for these models as well.


Polynomial Regression:

Similar performance to the linear models, with an R² of around 0.9142.


ElasticNet:

R²: 0.8780, which is still a strong result, though slightly lower than the above models.


RandomForest and LGBM:

R² values around 0.8774 (RandomForest) and 0.8940 (LGBM), indicating they are still performing well but slightly behind the linear models.


### **Underperformers:**


Robust Regression:

R²: 0.5011, indicating this model explains only around 50% of the variance in the data, which is relatively low.


ANN (Neural Network):

R²: 0.5024, which is similarly low and indicates suboptimal performance. The high MSE and MAE also suggest issues with this model's fit.


KNN:

R²: 0.5114, indicating it explains around 51% of the variance, which is not great compared to other models.


SVM:

R²: ~0.0004, indicating this model is performing extremely poorly, almost like a random prediction. It has the highest MAE, making it unsuitable for this task.


SGD Regressor:

This model's MSE and MAE are astronomically high, and the R² is negative, which suggests a catastrophic failure in fitting the model (likely due to convergence issues).