In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [44]:
def load_data(file_path):
  return pd.read_csv(file_path)

def split_data(X, y, test_size):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, shuffle=True)
  return X_train, X_test, y_train, y_test

def min_max_scaler(X_train, X_test):
  scaler = MinMaxScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)
  return X_train, X_test

def train_model(X, y, n_estimators):
  model = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
  model.fit(X, y)
  return model

def evaluasi_model(y_test, y_pred):
  mae = mean_absolute_error(y_test, y_pred)
  mse = mean_squared_error(y_test, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, y_pred)

  return mae, mse, rmse, r2

In [45]:
df = load_data("dataset/house.csv")
df.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age,price
0,1,26.184098,1286.68,204.003817,22,67,96004.804557
1,1,34.866901,1855.25,186.98036,8,30,92473.722568
2,1,36.980709,692.09,111.224999,24,24,98112.519942
3,1,17.445723,1399.49,237.99876,1,66,92118.326874
4,1,52.587646,84.65,100.9964,20,3,98976.653176


In [46]:
X = df.drop('price', axis=1)
X.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
0,1,26.184098,1286.68,204.003817,22,67
1,1,34.866901,1855.25,186.98036,8,30
2,1,36.980709,692.09,111.224999,24,24
3,1,17.445723,1399.49,237.99876,1,66
4,1,52.587646,84.65,100.9964,20,3


In [47]:
y = df['price']
y.head()

0    96004.804557
1    92473.722568
2    98112.519942
3    92118.326874
4    98976.653176
Name: price, dtype: float64

In [48]:
X_train, X_test, y_train, y_test = split_data(X, y, 0.2)

In [49]:
X_train.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
3286,8,238.640159,1029.44,111.374614,1,54
4280,6,473.708857,123.42,167.717534,1,9
2377,4,203.717225,1112.67,50.735466,2,23
4244,2,220.029788,454.25,77.710586,6,13
1104,2,100.780953,1869.89,100.868884,13,25


In [50]:
X_test.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
151,1,23.670546,1774.98,125.270711,23,18
3867,2,34.815382,1209.04,24.234915,17,52
3164,7,164.494052,787.15,69.409338,1,90
2755,5,209.362234,1170.81,66.298667,3,0
3450,10,120.723984,662.37,49.785438,1,33


In [51]:
y_train.head()

3286     98366.903534
4280    106403.678778
2377     99126.692937
4244    102526.419462
1104     90641.134338
Name: price, dtype: float64

In [52]:
y_test.head()

151     90429.791221
3867    91026.059515
3164    96813.639519
2755    98810.323710
3450    98027.896979
Name: price, dtype: float64

# **Without Preprocessing**

In [53]:
model = train_model(X_train, y_train, 100)

In [54]:
y_pred = model.predict(X_test)

In [55]:
mae, mse, rmse, r2 = evaluasi_model(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

MAE: 1593.7244922870088
MSE: 4525141.290579682
RMSE: 2127.237948744729
R^2: 0.6959310576066677


## **With Preprocessing**

In [56]:
X_scale_train, X_scale_test = min_max_scaler(X_train, X_test)
X_scale_train = pd.DataFrame(X_scale_train, columns=X_train.columns)
X_scale_test = pd.DataFrame(X_scale_test, columns=X_test.columns)

In [57]:
X_scale_train.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
0,0.4375,0.30834,0.511881,0.358059,0.0,0.556701
1,0.3125,0.625688,0.056146,0.541245,0.0,0.092784
2,0.1875,0.261193,0.553746,0.160905,0.043478,0.237113
3,0.0625,0.283215,0.222556,0.248609,0.217391,0.134021
4,0.0625,0.122226,0.934634,0.323902,0.521739,0.257732


In [58]:
X_scale_test.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
0,0.0,0.018125,0.886894,0.403239,0.956522,0.185567
1,0.0625,0.033171,0.602221,0.074745,0.695652,0.536082
2,0.375,0.208241,0.390007,0.221619,0.0,0.927835
3,0.25,0.268814,0.582991,0.211505,0.086957,0.0
4,0.5625,0.14915,0.327242,0.157817,0.0,0.340206


In [59]:
model = train_model(X_scale_train, y_train, 100)

In [60]:
y_pred = model.predict(X_scale_test)

In [61]:
mae, mse, rmse, r2 = evaluasi_model(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

MAE: 1594.3450054730956
MSE: 4530683.505551324
RMSE: 2128.5402287838783
R^2: 0.6955586459322628
