 **implementation of machine learning models for House Price Prediction using the Kaggle House Prices** 

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')


In [32]:
train = pd.read_csv("../ASSIGNMENT_5/train.csv")
test = pd.read_csv("../ASSIGNMENT_5/test.csv")

y = train['SalePrice']
X = train.drop(['SalePrice'], axis=1)


In [33]:
all_data = pd.concat([X, test], axis=0)

In [34]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']
all_data.drop(['Utilities'], axis=1, inplace=True)  

In [35]:

all_data.fillna(all_data.mode().iloc[0], inplace=True)
all_data = pd.get_dummies(all_data)

X = all_data[:len(train)]
X_test = all_data[len(train):]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
def evaluate_model(name, model):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print(f"\n{name} Evaluation:")
    print("R² Score :", r2_score(y_val, preds))
    print("RMSE      :", np.sqrt(mean_squared_error(y_val, preds)))
    print("MAE       :", mean_absolute_error(y_val, preds))
    return r2_score(y_val, preds)

In [53]:
xgb = XGBRegressor(random_state=42)
param_dist_xgb = {
    'n_estimators': [400, 200,300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 9],
    'subsample': [0.7, 0.9, 1.0]
}

rs_xgb = RandomizedSearchCV(xgb, param_distributions=param_dist_xgb, n_iter=10, cv=3, scoring='r2', n_jobs=-1, random_state=42)
rs_xgb.fit(X_train, y_train)
evaluate_model("XGBoost (Tuned)", rs_xgb.best_estimator_)



XGBoost (Tuned) Evaluation:
R² Score : 0.9161829352378845
RMSE      : 25355.559232641666
MAE       : 15607.458984375


0.9161829352378845

In [54]:
rf = RandomForestRegressor(random_state=42)
param_grid_rf = {
    'n_estimators': [400, 200,300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

gs_rf = GridSearchCV(rf, param_grid_rf, cv=3, n_jobs=-1, scoring='r2')
gs_rf.fit(X_train, y_train)
evaluate_model("Random Forest (Tuned)", gs_rf.best_estimator_)



Random Forest (Tuned) Evaluation:
R² Score : 0.8870085806678427
RMSE      : 29439.447088739904
MAE       : 17566.174797465406


0.8870085806678427

In [55]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Best Random Forest": gs_rf.best_estimator_,
    "Best XGBoost": rs_xgb.best_estimator_
}
scores = {}
for name, model in models.items():
    score = evaluate_model(name, model)
    scores[name] = score


Linear Regression Evaluation:
R² Score : 0.8864534351013472
RMSE      : 29511.6789166291
MAE       : 18282.146968581666

Ridge Evaluation:
R² Score : 0.8837357404769963
RMSE      : 29862.766060405404
MAE       : 18975.16312991064

Lasso Evaluation:
R² Score : 0.8957911500382255
RMSE      : 28272.17488773423
MAE       : 17908.811135467506

Best Random Forest Evaluation:
R² Score : 0.8870085806678427
RMSE      : 29439.447088739904
MAE       : 17566.174797465406

Best XGBoost Evaluation:
R² Score : 0.9161829352378845
RMSE      : 25355.559232641666
MAE       : 15607.458984375


In [56]:
stack_model = StackingRegressor(
    estimators=[
        ("xgb", rs_xgb.best_estimator_),
        ("rf", gs_rf.best_estimator_),
        ("ridge", Ridge())
    ],
    final_estimator=LinearRegression(),
    n_jobs=-1
)
stack_score = evaluate_model("Stacking Regressor", stack_model)
scores["Stacking Regressor"] = stack_score


Stacking Regressor Evaluation:
R² Score : 0.918635203342532
RMSE      : 24981.88407915279
MAE       : 14757.336968049025


In [57]:
best_model = max(scores, key=scores.get)
print("\nBest model is:", best_model, "with R² Score:", scores[best_model])


Best model is: Stacking Regressor with R² Score: 0.918635203342532


In [58]:
final_model = models.get(best_model, stack_model if best_model == "Stacking Regressor" else None)
final_model.fit(X, y)
test_preds = final_model.predict(X_test)
test_preds = np.expm1(test_preds) 

In [59]:
submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': test_preds
})
submission.to_csv("submission.csv", index=False)
print("\nSubmission file created: submission.csv")


Submission file created: submission.csv
