In [3]:
#Importing required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score



import os
# Lazily enumerate the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/real-estate/Real_Estate.csv


In [16]:
# Load the real-estate transactions CSV from the Kaggle input directory
csv_path = '/kaggle/input/real-estate/Real_Estate.csv'
real_estate_data = pd.read_csv(csv_path)

In [17]:
# Preview the first five rows of the loaded frame
real_estate_data.head(n=5)

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471


In [18]:
# Summarise column dtypes and non-null counts right after loading
real_estate_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Transaction date                     414 non-null    object 
 1   House age                            414 non-null    float64
 2   Distance to the nearest MRT station  414 non-null    float64
 3   Number of convenience stores         414 non-null    int64  
 4   Latitude                             414 non-null    float64
 5   Longitude                            414 non-null    float64
 6   House price of unit area             414 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 22.8+ KB


In [19]:
# Parse the 'Transaction date' strings into datetimes and store them back in the same column
parsed_dates = pd.to_datetime(real_estate_data['Transaction date'])
real_estate_data['Transaction date'] = parsed_dates

In [20]:
# Re-run the summary to check the dtype of 'Transaction date' after the datetime conversion
real_estate_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   Transaction date                     414 non-null    datetime64[ns]
 1   House age                            414 non-null    float64       
 2   Distance to the nearest MRT station  414 non-null    float64       
 3   Number of convenience stores         414 non-null    int64         
 4   Latitude                             414 non-null    float64       
 5   Longitude                            414 non-null    float64       
 6   House price of unit area             414 non-null    float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 22.8 KB


In [21]:
# Derive calendar features (month, then year) from the parsed transaction timestamp
transaction_dates = real_estate_data['Transaction date']
real_estate_data['Transaction Month'] = transaction_dates.dt.month
real_estate_data['Transaction Year'] = transaction_dates.dt.year

In [22]:
# Month and year have already been extracted into their own columns,
# so remove the raw timestamp column from the frame
real_estate_data = real_estate_data.drop('Transaction date', axis=1)

In [23]:
# Separate the regression target (y) from the predictor columns (x)
target_column = 'House price of unit area'
y = real_estate_data[target_column]
x = real_estate_data.drop(columns=[target_column])

In [24]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [25]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test predictors
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the dimensions of the scaled training matrix
X_train_scaled.shape

(331, 7)

In [26]:
# Initialize the candidate regressors, keyed by display name;
# every estimator that accepts a seed uses random_state=42
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
])

In [27]:
# Empty mapping that will collect per-model evaluation metrics
results = dict()

In [28]:
# Fit each candidate on the scaled training data, predict on the scaled test data,
# and record mean absolute error and R-squared under the model's name
for model_name, regressor in models.items():
    regressor.fit(X_train_scaled, y_train)
    test_predictions = regressor.predict(X_test_scaled)

    results[model_name] = {
        'MAE': mean_absolute_error(y_test, test_predictions),
        'R2': r2_score(y_test, test_predictions),
    }

# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print(results_df)

                         MAE        R2
Linear Regression   9.748246  0.529615
Decision Tree      11.573145  0.235071
Random Forest       9.848542  0.511959
Gradient Boosting  10.021650  0.474760


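Linear Regression has the lowest MAE (9.75) and the highest R² (0.53), making it the best-performing model among those evaluated. This suggests that, despite its simplicity, Linear Regression is competitive on this dataset. Note, however, that its margin over Random Forest is small (MAE 9.85, R² 0.51), and that an R² of about 0.53 means nearly half of the variance in house price per unit area remains unexplained.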