# Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
warnings.filterwarnings('ignore')

# Loading the dataset from Kaggle

In [2]:
import kagglehub

# Download (latest version) only the CSV file this notebook needs; the value
# returned by kagglehub is what gets handed to pd.read_csv afterwards.
# NOTE(review): the file name is passed through verbatim — presumably it matches
# the name used inside the Kaggle dataset, so its spelling must not be "fixed".
dataset_handle = "kunwarakash/chennai-housing-sales-price"
csv_name = "Chennai houseing sale.csv"
path = kagglehub.dataset_download(dataset_handle, path=csv_name)

# Preparing the dataset: filling NaN values and eliminating duplicate (misspelled) category labels

In [3]:
data = pd.read_csv(path)
pd.set_option('display.max_columns', None)

# Text columns that are lower-cased before the spelling corrections are applied
# (all correction keys below are therefore lower-case).
text_columns = ['AREA', 'SALE_COND', 'PARK_FACIL', 'BUILDTYPE', 'UTILITY_AVAIL', 'STREET']

# Per-column map of misspelled / inconsistently spaced labels -> canonical label.
label_fixes = {
    'AREA': {'velchery': 'velachery',
             'kknagar': 'kk nagar',
             'tnagar': 't nagar',
             'chormpet': 'chrompet',
             'chrompt': 'chrompet',
             'chrmpet': 'chrompet',
             'ana nagar': 'anna nagar',
             'ann nagar': 'anna nagar',
             'karapakam': 'karapakkam',
             'adyr': 'adyar'},
    'SALE_COND': {'ab normal': 'abnormal', 'partiall': 'partial',
                  'adj land': 'adjland', 'normal sale': 'normalsale'},
    'BUILDTYPE': {'comercial': 'commercial', 'others': 'other'},
    'UTILITY_AVAIL': {'all pub': 'allpub', 'nosewr ': 'nosewr', 'nosewa': 'nosewr'},
    'STREET': {'no access': 'noaccess', 'pavd': 'paved'},
}

# Parking availability labels -> 0/1 flag.
park_facil_codes = {'no': 0, 'noo': 0, 'yes': 1}

# Work on a copy so the raw frame in `data` stays untouched.
d = data.copy()
for column in text_columns:
    d[column] = d[column].str.lower()

# Return a new frame instead of mutating in place.
d = d.replace(label_fixes)

# Encode PARK_FACIL with an explicit integer dtype. The dtype decides whether the
# column is later treated as numeric or categorical, so it must not depend on
# replace() silently downcasting an object column (deprecated in recent pandas).
# A label missing from `park_facil_codes` becomes NaN and makes astype(int) raise,
# which surfaces unexpected labels immediately.
d['PARK_FACIL'] = d['PARK_FACIL'].map(park_facil_codes).astype(int)

# Fill missing room counts with the column mean rounded to a whole number, then
# store the counts as integers.
for column in ['N_BEDROOM', 'N_BATHROOM']:
    d[column] = d[column].fillna(round(d[column].mean())).astype(int)

# Convert the date columns from object to datetime (day-month-year strings).
for column in ['DATE_SALE', 'DATE_BUILD']:
    d[column] = pd.to_datetime(d[column], format='%d-%m-%Y')

# PROPERTY_AGE: how old the property was when sold, as a difference of calendar years.
d['PROPERTY_AGE'] = d['DATE_SALE'].dt.year - d['DATE_BUILD'].dt.year

# Keep only the modelling columns, features first and the SALES_PRICE target last.
d = d.reindex(columns=['AREA', 'SALE_COND', 'PARK_FACIL',
                       'BUILDTYPE', 'UTILITY_AVAIL', 'STREET', 'MZZONE', 'PROPERTY_AGE',
                       'INT_SQFT', 'N_BEDROOM', 'N_BATHROOM', 'N_ROOM', 'SALES_PRICE'])

In [4]:
# Preview the first three rows of the prepared frame.
d.head(3)

Unnamed: 0,AREA,SALE_COND,PARK_FACIL,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,PROPERTY_AGE,INT_SQFT,N_BEDROOM,N_BATHROOM,N_ROOM,SALES_PRICE
0,karapakkam,abnormal,1,commercial,allpub,paved,A,44,1004,1,1,3,7600000
1,anna nagar,abnormal,0,commercial,allpub,gravel,RH,11,1986,2,1,5,21717770
2,adyar,abnormal,1,commercial,elo,gravel,RL,20,909,1,1,3,13159200


# One Hot Encoding

In [5]:
# Separate the target (SALES_PRICE) from the predictor columns.
y = d['SALES_PRICE']
X = d.drop(columns='SALES_PRICE')
print(X.shape, y.shape)

(7109, 12) (7109,)


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the predictors by dtype: object columns are handled as categorical,
# every other column as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

ob_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded and numeric columns are standardised;
# the transformers are listed (and applied) in that order.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", ob_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [7]:
from sklearn.model_selection import train_test_split

# Fit the encoder/scaler on the training rows only, so that statistics of the
# held-out rows (e.g. the column means/variances used for scaling) do not leak
# into the features the models are evaluated on.
# The positional split below uses the same test_size/random_state as the
# train/test split applied to X and y afterwards, so it selects the same rows;
# keep the two calls in sync.
# NOTE(review): OneHotEncoder() is used with its defaults, so transform() will
# raise if a held-out row contains a category absent from the training rows.
train_rows, _ = train_test_split(np.arange(X.shape[0]), test_size=0.2, random_state=42)
preprocessor.fit(X.iloc[train_rows])

# Transform every row with the training-fitted preprocessor; X keeps its role
# as the full feature matrix consumed by the subsequent split.
X = preprocessor.transform(X)
X.shape

(7109, 33)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
(X_train.shape, X_test.shape)

((5687, 33), (1422, 33))

In [9]:
def evaluate_model(true, predicted):
    """Score a set of predictions against the observed values.

    Args:
        true: Observed target values.
        predicted: Predicted values for the same samples, in the same order.

    Returns:
        A tuple ``(mae, rmse, r2_square)`` holding the mean absolute error, the
        root mean squared error (square root of ``mean_squared_error``) and the
        ``r2_score`` of the predictions.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is computed once and only used to derive the RMSE.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [10]:
# Seed for the scikit-learn estimators that use randomness, so repeated runs give
# the same scores (same value as the train/test split's random_state).
RANDOM_STATE = 42

# Candidate regressors keyed by display name. XGBoost and CatBoost are left at
# their library defaults, apart from silencing CatBoost's training log.
models = {
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "Linear Regression": LinearRegression(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE),
}
model_list = []  # model names, in evaluation order
r2_list = []     # test-set R2 score for the model at the same position


def _print_metrics(split_name, mae, rmse, r2):
    """Print the RMSE / MAE / R2 report for one data split ("Training" or "Test")."""
    print('Model performance for {} set'.format(split_name))
    print("- Root Mean Squared Error: {:.4f}".format(rmse))
    print("- Mean Absolute Error: {:.4f}".format(mae))
    print("- R2 Score: {:.4f}".format(r2))


for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    _print_metrics("Training", model_train_mae, model_train_rmse, model_train_r2)
    print('----------------------------------')
    _print_metrics("Test", model_test_mae, model_test_rmse, model_test_r2)

    # Only the test-set R2 is kept for the ranking table.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Random Forest
Model performance for Training set
- Root Mean Squared Error: 160079.1920
- Mean Absolute Error: 121299.5308
- R2 Score: 0.9982
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 414904.3007
- Mean Absolute Error: 319329.2089
- R2 Score: 0.9869


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 654712.3644
- Mean Absolute Error: 495539.8172
- R2 Score: 0.9674


Gradient Boosting
Model performance for Training set
- Root Mean Squared Error: 364894.7333
- Mean Absolute Error: 281602.2321
- R2 Score: 0.9908
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 384738.5312
- Mean Absolute Error: 298735.4678
- R2 Score: 0.9887


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 776779.2260
- Mea

In [11]:
# Pair each model name with its test-set R2 score and rank from best to worst.
score_rows = list(zip(model_list, r2_list))
score_table = pd.DataFrame(score_rows, columns=['Model Name', 'R2_Score'])
score_table.sort_values(by="R2_Score", ascending=False)

Unnamed: 0,Model Name,R2_Score
5,CatBoosting Regressor,0.999335
4,XGBRegressor,0.995739
2,Gradient Boosting,0.988733
0,Random Forest,0.986897
1,Decision Tree,0.967372
3,Linear Regression,0.955627
6,AdaBoost Regressor,0.896376
