In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, HuberRegressor)
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [18]:
data = pd.read_csv(r"C:\Users\sahil\OneDrive\Documents\USA_Housing.csv")
data.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0,USS Barnett\nFPO AP 44820
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5,USNS Raymond\nFPO AE 09386


In [20]:
X = data.drop(['Price', 'Address'], axis=1)
y = data['Price']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
models = {
    # Models that need scaled data
    'LinearRegression': LinearRegression(),
    'RobustRegression': HuberRegressor(),
    'RidgeRegression': Ridge(),
    'LassoRegression': Lasso(),
    'ElasticNet': ElasticNet(),
    'PolynomialRegression': Pipeline([
        ('poly', PolynomialFeatures(degree=4)),
        ('linear', LinearRegression())
    ]),
    'SGDRegressor': SGDRegressor(),
    'ANN': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000),
    'SVM': SVR(),
    'KNN': KNeighborsRegressor(),

    # Models that don’t need scaling
    'RandomForest': RandomForestRegressor(),
    'LGBM': lgb.LGBMRegressor(),
    'XGBoost': xgb.XGBRegressor()
}


In [None]:
results = []

for name, model in models.items():
    # Check if model requires scaling
    if name in ['LinearRegression', 'RobustRegression', 'RidgeRegression', 'LassoRegression',
                'ElasticNet', 'PolynomialRegression', 'SGDRegressor', 'ANN', 'SVM', 'KNN']:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Evaluation
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'Model': name,
        'MAE': round(mae, 2),
        'MSE': round(mse, 2),
        'R2': round(r2, 4)
    })

    # Save model
    with open(f'{name}.pkl', 'wb') as f:
        pickle.dump(model, f)

print("All models trained, evaluated, and saved as .pkl files.")



In [33]:
results_df = pd.DataFrame(results)
results_df.sort_values(by='R2', ascending=False, inplace=True)
results_df.reset_index(drop=True, inplace=True)
print(results_df)

                   Model         MAE           MSE       R2
0       RobustRegression    82659.92  1.054623e+10   0.9147
1        RidgeRegression    82658.16  1.054893e+10   0.9147
2       LinearRegression    82657.95  1.054972e+10   0.9146
3        LassoRegression    82657.87  1.054970e+10   0.9146
4           SGDRegressor    82769.35  1.055839e+10   0.9146
5   PolynomialRegression    84013.48  1.073798e+10   0.9131
6                   LGBM    92133.99  1.309771e+10   0.8940
7           RandomForest    98226.35  1.494635e+10   0.8791
8                XGBoost   101565.19  1.613868e+10   0.8694
9                    KNN   105521.78  1.710311e+10   0.8616
10            ElasticNet   121396.83  2.288246e+10   0.8149
11                   SVM   282858.36  1.234840e+11   0.0009
12                   ANN  1170541.13  1.467683e+12 -10.8746


In [37]:
results_df.to_csv('model_evaluation_results.csv', index=False)
print("Model evaluation saved to model_evaluation_results.csv")

Model evaluation saved to model_evaluation_results.csv
