# House Price Prediction: Model Development and Training

In [1]:
# Import necessary libraries
import os
import pickle
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from joblib import dump, load

Various models were trained using datasets subjected to different preprocessing techniques, including splitting with and without standardization and normalization. Upon evaluating the performance across the three datasets—standardized, normalized, and the raw split dataset—the most favorable results were obtained from the dataset without any standardization or normalization. Consequently, for the purpose of this demonstration, the raw split dataset has been selected for loading.

In [2]:
# Load the saved datasets
train_file_path = 'processed_data/split_data/train_data.csv'
val_file_path = 'processed_data/split_data/val_data.csv'
test_file_path = 'processed_data/split_data/test_data.csv'

train_df = pd.read_csv(train_file_path)
val_df = pd.read_csv(val_file_path)
test_df = pd.read_csv(test_file_path)

In [3]:
# Separate features (X) and target variable (y) for each dataset
target_column_name = 'price_in_lakhs'
X_train, y_train = train_df.drop(target_column_name, axis=1), train_df[target_column_name]
X_val, y_val = val_df.drop(target_column_name, axis=1), val_df[target_column_name]
X_test, y_test = test_df.drop(target_column_name, axis=1), test_df[target_column_name]

In [4]:
# Define a dictionary of models to try
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Machine': SVR()
}

Hyperparameter tuning is actively pursued to optimize model performance and enhance predictive accuracy.

In [5]:
# Optional: Hyperparameter tuning using GridSearchCV for specific models
param_grid = {
    'Linear Regression': {},
    'Random Forest': {'n_estimators': [50, 60, 80, 100, 150, 200], 'max_depth': [None, 10, 15, 20, 30]},
    'Decision Tree': {'max_depth': [None, 5, 10, 15, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    'Support Vector Machine': {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
}

# Dictionary to store the best models
best_models = {}

for name, model in models.items():
    if name in param_grid:
        print(f'Tuning hyperparameters for {name}...')
        param_search = GridSearchCV(model, param_grid[name], scoring='neg_mean_squared_error', cv=5)
        param_search.fit(X_train, y_train)
        best_params = param_search.best_params_
        print(f'Best hyperparameters for {name}: {best_params}\n')
        
        # Redefine the model with the best hyperparameters
        best_model = model.__class__(**best_params)
        best_models[name] = best_model

Tuning hyperparameters for Linear Regression...
Best hyperparameters for Linear Regression: {}

Tuning hyperparameters for Decision Tree...
Best hyperparameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}

Tuning hyperparameters for Random Forest...
Best hyperparameters for Random Forest: {'max_depth': 30, 'n_estimators': 60}

Tuning hyperparameters for Support Vector Machine...
Best hyperparameters for Support Vector Machine: {'C': 1000, 'gamma': 0.01}



In [6]:
best_models #best models and thier hyperparameters

{'Linear Regression': LinearRegression(),
 'Decision Tree': DecisionTreeRegressor(max_depth=10, min_samples_leaf=2),
 'Random Forest': RandomForestRegressor(max_depth=30, n_estimators=60),
 'Support Vector Machine': SVR(C=1000, gamma=0.01)}

Train and Evaluate models along with thier best hyper-parameters

In [7]:
# Function to create a folder if it doesn't exist
def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

# Create a folder for saved models if not exists
saved_models_folder = 'saved_models'
create_folder_if_not_exists(saved_models_folder)

# Create a sub-folder based on date-time
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_folder = os.path.join(saved_models_folder, timestamp)
create_folder_if_not_exists(model_folder)

# Model training and evaluation
for name, model in best_models.items():
    print(f'Training {name}...')

    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    y_val_pred = model.predict(X_val)
    
    # Evaluate the model
    mse = mean_squared_error(y_val, y_val_pred)
    print(f'Mean Squared Error on Validation Set: {mse}')
    
    # Optionally, you can save the trained model for future use
    model_save_path = os.path.join(model_folder, f'{name}_model.pkl')
    
    with open(model_save_path, 'wb') as file:
        pickle.dump(model, file)
    
    print(f'{name} model saved to {model_save_path}\n')

Training Linear Regression...
Mean Squared Error on Validation Set: 0.03714947667611415
Linear Regression model saved to saved_models/20231129_125247/Linear Regression_model.pkl

Training Decision Tree...
Mean Squared Error on Validation Set: 0.05057965238456952
Decision Tree model saved to saved_models/20231129_125247/Decision Tree_model.pkl

Training Random Forest...
Mean Squared Error on Validation Set: 0.030898949190345155
Random Forest model saved to saved_models/20231129_125247/Random Forest_model.pkl

Training Support Vector Machine...
Mean Squared Error on Validation Set: 0.01839588658272734
Support Vector Machine model saved to saved_models/20231129_125247/Support Vector Machine_model.pkl



In [8]:
# Load the saved models
loaded_models = {}
for name in best_models.keys():
    model_path = os.path.join(model_folder, f'{name}_model.pkl')
    
    with open(model_path, 'rb') as file:
        loaded_model = pickle.load(file)
    
    loaded_models[name] = loaded_model

In [9]:
# Create a DataFrame to store the results
results_df = pd.DataFrame({'Actual': y_test})

# Make predictions on the test set and compare against y_test values
for name, loaded_model in loaded_models.items():
    print(f'Evaluating {name} on the test set...')
    
    # Make predictions on the test set
    y_test_pred = loaded_model.predict(X_test)
    
    # Add predicted values to the DataFrame
    results_df[f'{name}_Predicted'] = y_test_pred
    
    # Calculate and add MSE to the DataFrame
    mse = mean_squared_error(y_test, y_test_pred)
    print(f'Mean Squared Error on Test Set ({name}): {mse}\n')

Evaluating Linear Regression on the test set...
Mean Squared Error on Test Set (Linear Regression): 0.05607257834191994

Evaluating Decision Tree on the test set...
Mean Squared Error on Test Set (Decision Tree): 0.08326471528478133

Evaluating Random Forest on the test set...
Mean Squared Error on Test Set (Random Forest): 0.06713350286676603

Evaluating Support Vector Machine on the test set...
Mean Squared Error on Test Set (Support Vector Machine): 0.031126915531599814



In [10]:
# Display results in a dataframe
print("--- Price in Lakhs ---")
results_df = np.expm1(results_df)
results_df

--- Price in Lakhs ---


Unnamed: 0,Actual,Linear Regression_Predicted,Decision Tree_Predicted,Random Forest_Predicted,Support Vector Machine_Predicted
0,187.0,204.087548,171.604776,170.96546,187.941672
1,40.0,39.061007,33.121721,38.442133,36.904572
2,73.0,94.741664,88.996296,88.521821,79.883553
3,87.0,97.124548,60.870833,77.844596,88.513928
4,177.0,110.450276,114.026084,108.205301,152.170656
5,118.0,98.9645,115.893114,103.860853,106.926997
6,45.75,84.383471,50.346776,66.713167,74.036462
7,58.95,74.041713,83.498521,86.427271,55.10794
8,53.99,45.9958,43.472098,43.353998,48.60786
9,80.0,87.233672,80.802999,83.405646,82.292426
