# house-price-prediction

In [1]:
# =============================
# 1. IMPORT LIBRARIES
# =============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from statsmodels.stats.outliers_influence import variance_inflation_factor
import joblib
import json


In [4]:
# =============================
# 2. LOAD DATASET
# =============================

df = pd.read_csv("C:/Users/HP/OneDrive/Desktop/hii/code ninja.csv")
print("Shape:", df.shape)
print(df.head())


Shape: (14620, 23)
           id        Date  number of bedrooms  number of bathrooms  \
0  6762810145  01-05-2016                   5                 2.50   
1  6762810635  01-05-2016                   4                 2.50   
2  6762810998  01-05-2016                   5                 2.75   
3  6762812605  01-05-2016                   4                 2.50   
4  6762812919  01-05-2016                   3                 2.00   

   living area  lot area  number of floors  waterfront present  \
0         3650      9050               2.0                   0   
1         2920      4000               1.5                   0   
2         2910      9480               1.5                   0   
3         3310     42998               2.0                   0   
4         2710      4500               1.5                   0   

   number of views  condition of the house  ...  Built Year  Renovation Year  \
0                4                       5  ...        1921                0   
1  

# Define Target + Split Data

In [5]:
# =============================
# 3. TRAIN-TEST SPLIT
# =============================

target = "Price"

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save raw splits
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

print("Saved train_data.csv and test_data.csv")


Saved train_data.csv and test_data.csv


#  Prepare Features

In [6]:
# =============================
# 4. FEATURE SELECTION
# =============================

numeric_features = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove("Price")  # remove target

categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()

print("Numeric:", numeric_features)
print("Categorical:", categorical_features)


Numeric: ['id', 'number of bedrooms', 'number of bathrooms', 'living area', 'lot area', 'number of floors', 'waterfront present', 'number of views', 'condition of the house', 'grade of the house', 'Area of the house(excluding basement)', 'Area of the basement', 'Built Year', 'Renovation Year', 'Postal Code', 'Lattitude', 'Longitude', 'living_area_renov', 'lot_area_renov', 'Number of schools nearby', 'Distance from the airport']
Categorical: ['Date']


# Preprocessing Pipeline

In [7]:
# =============================
# 5. PREPROCESSING PIPELINE
# =============================

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


# Baseline Models (Mean, Median) 

In [8]:
# =============================
# 6. BASELINE MODELS
# =============================

y_train = train_df[target]
y_test = test_df[target]

baseline_mean = np.full(len(y_test), y_train.mean())
baseline_median = np.full(len(y_test), y_train.median())

def evaluate(true, pred):
    return {
        "RMSE": np.sqrt(mean_squared_error(true, pred)),
        "MAE": mean_absolute_error(true, pred),
        "R2": r2_score(true, pred),
        "MAPE": np.mean(np.abs((true - pred) / true)) * 100
    }

print("Baseline Mean:", evaluate(y_test, baseline_mean))
print("Baseline Median:", evaluate(y_test, baseline_median))


Baseline Mean: {'RMSE': 375435.64927683136, 'MAE': 237863.70480051177, 'R2': -0.00023400981297516665, 'MAPE': 53.28680554200582}
Baseline Median: {'RMSE': 386867.038259615, 'MAE': 226052.4107387141, 'R2': -0.062072240394457134, 'MAPE': 42.516165269838005}


# Simple Linear Regression

In [9]:
# =============================
# 7. SIMPLE LINEAR REGRESSION
# =============================

slr_feature = ["living area"]

slr = Pipeline(steps=[
    ('preprocess', ColumnTransformer([
        ('num', StandardScaler(), slr_feature)
    ])),
    ('model', LinearRegression())
])

slr.fit(train_df[slr_feature], y_train)
slr_pred = slr.predict(test_df[slr_feature])

print("SLR:", evaluate(y_test, slr_pred))


SLR: {'RMSE': 259221.00718410025, 'MAE': 172726.51822970866, 'R2': 0.5231618571679189, 'MAPE': 35.399442484800296}


# Multiple Linear Regression

In [10]:
# =============================
# 8. MULTIPLE LINEAR REGRESSION
# =============================

mlr = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LinearRegression())
])

mlr.fit(train_df, y_train)
mlr_pred = mlr.predict(test_df)

print("MLR:", evaluate(y_test, mlr_pred))


MLR: {'RMSE': 188837.0293619012, 'MAE': 107120.23891562248, 'R2': 0.7469508282985143, 'MAPE': 21.179185564914203}


# Regularized Models (Ridge, Lasso, ElasticNet)

In [11]:
# =============================
# 9. REGULARIZED MODELS
# =============================

models = {
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.001),
    "ElasticNet": ElasticNet(alpha=0.001, l1_ratio=0.5)
}

leaderboard = {}

for name, model in models.items():
    pipe = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model)
    ])
    
    pipe.fit(train_df, y_train)
    pred = pipe.predict(test_df)
    leaderboard[name] = evaluate(y_test, pred)

leaderboard


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


{'Ridge': {'RMSE': 188587.23425412114,
  'MAE': 106969.46333272199,
  'R2': 0.7476198563773322,
  'MAPE': 21.164211880719886},
 'Lasso': {'RMSE': 188839.87840603816,
  'MAE': 107122.84568971343,
  'R2': 0.7469431925747368,
  'MAPE': 21.179836668641972},
 'ElasticNet': {'RMSE': 188392.0080411262,
  'MAE': 106648.95161437368,
  'R2': 0.7481421156595529,
  'MAPE': 21.06632523771035}}

# Select Champion Model

In [12]:
# =============================
# 10. SELECT BEST MODEL
# =============================

best_model_name = min(leaderboard, key=lambda x: leaderboard[x]['RMSE'])
print("Champion Model:", best_model_name)

champion_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', models[best_model_name])
])

champion_model.fit(train_df, y_train)


Champion Model: ElasticNet


  model = cd_fast.sparse_enet_coordinate_descent(


Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['id', 'number of bedrooms',
                                                   'number of bathrooms',
                                                   'living area', 'lot area',
                                                   'number of floors',
                                                   'waterfront present',
                                                   'number of views',
                                                   'condition of the house',
                                                   'grade of the house',
                                                   'Area of the '
                                                   'house(excluding basement)',
                                                   'Area of the basement',
                                                   'Built Year',
             

# Save Champion Model

In [13]:
joblib.dump(champion_model, "champion_model.joblib")
print("Saved champion_model.joblib")


Saved champion_model.joblib


# Save Predictions

In [14]:
test_df["Predicted_Price"] = champion_model.predict(test_df)
test_df[["id", "Price", "Predicted_Price"]].to_csv("houseprice_predictions_test.csv", index=False)


# Save Metrics

In [15]:
with open("test_metrics.json", "w") as f:
    json.dump(leaderboard[best_model_name], f, indent=4)

print("Saved test_metrics.json")


Saved test_metrics.json


In [16]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [17]:
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)


In [18]:
import os
os.getcwd()


'C:\\Users\\HP'

In [19]:
save_path = r"C:\Users\HP\OneDrive\Desktop\hii"


In [20]:
import os

os.makedirs(save_path, exist_ok=True)
print("Saving all files to:", save_path)


Saving all files to: C:\Users\HP\OneDrive\Desktop\hii


In [21]:
train_df.to_csv(os.path.join(save_path, "train_data.csv"), index=False)
test_df.to_csv(os.path.join(save_path, "test_data.csv"), index=False)

print("Train and Test data saved successfully!")


Train and Test data saved successfully!


In [22]:
import joblib

joblib.dump(champion_model, os.path.join(save_path, "champion_model.joblib"))
print("Model saved!")


Model saved!


In [23]:
test_df["Predicted_Price"] = champion_model.predict(test_df)

test_df[["id", "Price", "Predicted_Price"]].to_csv(
    os.path.join(save_path, "houseprice_predictions_test.csv"),
    index=False
)

print("Predictions saved!")


Predictions saved!


In [24]:
import json

with open(os.path.join(save_path, "test_metrics.json"), "w") as f:
    json.dump(leaderboard[best_model_name], f, indent=4)

print("Metrics saved!")


Metrics saved!
