In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder  
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

In [23]:
# Read the raw table and discard the two calendar columns in a single step,
# then show the resulting (rows, columns) shape.
Data = pd.read_csv("dataset.csv").drop(columns=['date', 'month'])
Data.shape

(20303, 12)

In [24]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, yielding 0.0 where the denominator is 0.

    Plain Series division gives inf (or NaN for 0/0) on a zero denominator,
    and non-finite values are rejected by the scaling/fitting steps that
    follow. Rows whose denominator is non-zero are returned unchanged; rows
    whose denominator is missing keep their NaN so the problem stays visible.
    """
    return (numerator / denominator).where(denominator != 0, 0.0)


# Derived features. The assignment order is kept as-is because it determines
# the column order of the feature matrix built later.
Data['total_rooms'] = Data['bedrooms'] + Data['real_bathrooms']
Data['area_per_room'] = _safe_ratio(Data['living_in_m2'], Data['total_rooms'])
Data['living_in_m2_squared'] = Data['living_in_m2'] ** 2
# Simple additive score from grade plus the two quality flags.
Data['quality_score'] = Data['grade'] + Data['nice_view'] + Data['perfect_condition']
# Interaction between grade and living area.
Data['grade_area'] = Data['grade'] * Data['living_in_m2']
Data['bathroom_bedroom_ratio'] = _safe_ratio(Data['real_bathrooms'], Data['bedrooms'])

In [25]:
# Swap has_basement for integer class labels and display the class balance.
# `le` is created here and reused (refit) by the encoding cells that follow.
le = LabelEncoder()
Data = Data.assign(has_basement=le.fit_transform(Data['has_basement']))
Data['has_basement'].value_counts()

has_basement
0    12635
1     7668
Name: count, dtype: int64

In [26]:
# Swap renovated for integer class labels and display the class balance.
Data = Data.assign(renovated=le.fit_transform(Data['renovated']))
Data['renovated'].value_counts()

renovated
0    19542
1      761
Name: count, dtype: int64

In [27]:
# Swap nice_view for integer class labels and display the class balance.
Data = Data.assign(nice_view=le.fit_transform(Data['nice_view']))
Data['nice_view'].value_counts()

nice_view
0    18751
1     1552
Name: count, dtype: int64

In [28]:
# Swap perfect_condition for integer class labels and display the class balance.
Data = Data.assign(perfect_condition=le.fit_transform(Data['perfect_condition']))
Data['perfect_condition'].value_counts()

perfect_condition
0    18748
1     1555
Name: count, dtype: int64

In [29]:
# Swap has_lavatory for integer class labels and display the class balance.
Data = Data.assign(has_lavatory=le.fit_transform(Data['has_lavatory']))
Data['has_lavatory'].value_counts()

has_lavatory
1    13826
0     6477
Name: count, dtype: int64

In [30]:
# Swap single_floor for integer class labels and display the class balance.
Data = Data.assign(single_floor=le.fit_transform(Data['single_floor']))
Data['single_floor'].value_counts()

single_floor
1    10416
0     9887
Name: count, dtype: int64

In [31]:
Data.head()

Unnamed: 0,price,bedrooms,grade,has_basement,living_in_m2,renovated,nice_view,perfect_condition,real_bathrooms,has_lavatory,single_floor,quartile_zone,total_rooms,area_per_room,living_in_m2_squared,quality_score,grade_area,bathroom_bedroom_ratio
0,305000.0,2,1,0,76.18046,0,0,1,1,0,1,2,3,25.393487,5803.462486,2,76.18046,0.5
1,498000.0,3,2,1,210.88981,0,0,0,2,1,1,2,5,42.177962,44474.511962,2,421.77962,0.666667
2,590000.0,2,4,0,262.91549,0,0,0,2,1,0,2,4,65.728872,69124.554882,4,1051.66196,1.0
3,775000.0,3,3,0,159.79316,0,0,0,1,1,0,3,4,39.94829,25533.853983,3,479.37948,0.333333
4,350000.0,2,1,0,92.903,0,0,0,1,1,1,3,3,30.967667,8630.967409,1,92.903,0.5


In [32]:
# Target is price; features are every remaining numeric column.
y = Data["price"]
X = Data.drop(columns=["price"]).select_dtypes(include=[np.number])

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train1, X_test1, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train1)
X_train = scaler.transform(X_train1)
X_test = scaler.transform(X_test1)

In [34]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the scaled features, scored on the held-out split.
linear_reg = LinearRegression().fit(X_train, y_train)
linear_y_pred = linear_reg.predict(X_test)
r2_linear = r2_score(y_true=y_test, y_pred=linear_y_pred)
print("R2 score:", r2_linear)

R2 score: 0.7352034905991857


In [35]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# AdaBoost over depth-11 regression trees, scored on the held-out split.
ada_reg = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=11),
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)
ada_y_pred = ada_reg.predict(X_test)
r2_ada = r2_score(y_true=y_test, y_pred=ada_y_pred)
print("R2 score:", r2_ada)

R2 score: 0.7304572524260351


In [36]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 200 trees with a fixed seed, scored on the held-out split.
Random_forest_reg = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)
random_y_pred = Random_forest_reg.predict(X_test)
r2_randomforest = r2_score(y_true=y_test, y_pred=random_y_pred)
print("R2 score:", r2_randomforest)

R2 score: 0.7217345249580382


In [37]:
from sklearn.linear_model import RidgeCV

# Ridge regression; the penalty strength is picked by 5-fold cross-validation
# from the candidate list below.
# NOTE(review): the largest candidate is 0.1 — consider adding larger values
# to the grid and checking whether the selected alpha sits on the boundary.
ridge_regression = RidgeCV(
    alphas=[0.1, 0.01, 0.001],
    cv=5,
).fit(X_train, y_train)
ridge_y_pred = ridge_regression.predict(X_test)
r2_ridge = r2_score(y_true=y_test, y_pred=ridge_y_pred)
print("R2 score:", r2_ridge)

R2 score: 0.735203310545719


In [38]:
from xgboost import XGBRegressor

# XGBoost regressor (100 rounds, learning rate 0.05), scored on the held-out split.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    random_state=42,
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
r2_xgb = r2_score(y_true=y_test, y_pred=y_pred_xgb)
print("R2 score:", r2_xgb)

R2 score: 0.7552279670208428


In [39]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting (200 stages, learning rate 0.11), scored on the held-out split.
gbr_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.11,
    random_state=42,
).fit(X_train, y_train)
y_pred_gbr = gbr_model.predict(X_test)
r2_gbr = r2_score(y_true=y_test, y_pred=y_pred_gbr)
print("R2 score:", r2_gbr)

R2 score: 0.7533186535286664


In [40]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree capped at depth 7, scored on the held-out split.
dt_model = DecisionTreeRegressor(
    max_depth=7,
    random_state=42,
).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
r2_dt = r2_score(y_true=y_test, y_pred=y_pred_dt)
print("R2 score:", r2_dt)

R2 score: 0.7270124331928716


In [41]:
def ensemble_learning(models, X_test, weights=None):
    """Blend several fitted regressors by averaging their predictions.

    Parameters
    ----------
    models : iterable
        Fitted estimators, each exposing ``predict(X)``.
    X_test : array-like
        Feature matrix handed unchanged to every model's ``predict``.
    weights : sequence of float, optional
        One weight per model. ``None`` (the default) gives every model equal
        weight, i.e. the plain arithmetic mean.

    Returns
    -------
    numpy.ndarray
        Row-wise (optionally weighted) average of the models' predictions.

    Raises
    ------
    ValueError
        If ``models`` is empty, or ``weights`` does not supply exactly one
        value per model.
    """
    # Materialise once so generators work and the length can be checked.
    models = list(models)
    if not models:
        raise ValueError("ensemble_learning requires at least one model")

    if weights is not None:
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (len(models),):
            raise ValueError(
                f"expected {len(models)} weights (one per model), "
                f"got shape {weights.shape}"
            )

    # One column per model, one row per sample.
    predictions = np.column_stack([model.predict(X_test) for model in models])
    # With weights=None, np.average reduces to the plain mean.
    return np.average(predictions, axis=1, weights=weights)

In [42]:
# Fitted regressors that take part in the averaged ensemble.
models = [
    xgb_model,
    gbr_model,
    Random_forest_reg,
    ridge_regression,
    dt_model,
]
y_pred_ensemble = ensemble_learning(models=models, X_test=X_test)

In [43]:
# Score the averaged ensemble on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred_ensemble)
print(f"R2 Score: {r2}")

R2 Score: 0.7561365022605064
