In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
def load_data(file_path):
  """Read a CSV file into a pandas DataFrame.

  Args:
    file_path: Location of the CSV; forwarded unchanged to ``pd.read_csv``.

  Returns:
    The DataFrame parsed with ``pd.read_csv``'s default settings.
  """
  frame = pd.read_csv(file_path)
  return frame

def split_data(X, y, test_size, random_state=42):
  """Split features and target into shuffled train and test partitions.

  Args:
    X: Feature table (anything ``train_test_split`` accepts).
    y: Target values aligned row-for-row with ``X``.
    test_size: Forwarded to ``train_test_split`` as ``test_size``.
    random_state: Seed for the shuffle. Defaults to 42, the value that was
      previously hard-coded, so existing three-argument calls are unchanged.

  Returns:
    The ``X_train, X_test, y_train, y_test`` sequence produced by
    ``train_test_split``.
  """
  return train_test_split(
    X, y, test_size=test_size, random_state=random_state, shuffle=True
  )

def preprocess_data(X_train, X_test):
  """Mean-impute missing values in both feature splits.

  The imputer is fitted on ``X_train`` only and then reused to transform
  ``X_test``, so the fill values never depend on the test rows.

  Args:
    X_train: Training features used to fit the imputer.
    X_test: Test features transformed with the already-fitted imputer.

  Returns:
    A ``(train, test)`` pair with the imputed versions of the two inputs.
  """
  mean_imputer = SimpleImputer(strategy="mean")
  train_filled = mean_imputer.fit_transform(X_train)
  return train_filled, mean_imputer.transform(X_test)

def train_model(X, y, n_estimators):
  """Fit an ``XGBRegressor`` on the given features and target.

  Args:
    X: Training features.
    y: Training target values.
    n_estimators: Forwarded to ``XGBRegressor`` as ``n_estimators``; every
      other hyperparameter keeps the library default.

  Returns:
    The fitted ``XGBRegressor`` instance.
  """
  regressor = XGBRegressor(n_estimators=n_estimators)
  regressor.fit(X, y)
  return regressor

def evaluasi_model(y_test, y_pred):
  """Score regression predictions against the true target values.

  Args:
    y_test: Ground-truth target values.
    y_pred: Predicted values, in the same order as ``y_test``.

  Returns:
    A 4-tuple ``(mae, mse, rmse, r2)`` where ``rmse`` is the square root of
    ``mse``.
  """
  abs_err = mean_absolute_error(y_test, y_pred)
  sq_err = mean_squared_error(y_test, y_pred)
  return abs_err, sq_err, np.sqrt(sq_err), r2_score(y_test, y_pred)

In [3]:
# Load the house dataset from a path relative to the working directory,
# then preview the first rows.
df = load_data("dataset/house.csv")
df.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age,price
0,1,26.184098,1286.68,204.003817,22,67,96004.804557
1,1,34.866901,1855.25,186.98036,8,30,92473.722568
2,1,36.980709,692.09,111.224999,24,24,98112.519942
3,1,17.445723,1399.49,237.99876,1,66,92118.326874
4,1,52.587646,84.65,100.9964,20,3,98976.653176


In [4]:
# Row and column counts of the loaded frame.
df.shape

(4308, 7)

In [5]:
# Count missing values in each column.
df.isna().sum()

bedroom_count      0
net_sqm            0
center_distance    0
metro_distance     0
floor              0
age                0
price              0
dtype: int64

In [6]:
# Column names available for feature/target selection.
df.columns

Index(['bedroom_count', 'net_sqm', 'center_distance', 'metro_distance',
       'floor', 'age', 'price'],
      dtype='object')

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age,price
count,4308.0,4308.0,4308.0,4308.0,4308.0,4308.0,4308.0
mean,3.732823,118.941327,1090.544301,105.665358,9.397168,48.576834,95701.196185
std,2.476989,95.469802,555.32583,60.750825,7.517018,27.621465,3919.823988
min,1.0,10.244518,11.8,1.245338,1.0,0.0,86113.592974
25%,2.0,52.240659,635.8975,56.156897,2.0,25.0,92934.251436
50%,3.0,91.828201,1143.135,101.388012,8.0,48.0,95337.792918
75%,5.0,150.987591,1554.28,148.481068,16.0,72.0,97980.675094
max,17.0,750.971604,1999.84,330.275317,24.0,97.0,118134.77119


In [8]:
# Per-column dtypes and non-null counts, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4308 entries, 0 to 4307
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   bedroom_count    4308 non-null   int64  
 1   net_sqm          4308 non-null   float64
 2   center_distance  4308 non-null   float64
 3   metro_distance   4308 non-null   float64
 4   floor            4308 non-null   int64  
 5   age              4308 non-null   int64  
 6   price            4308 non-null   float64
dtypes: float64(4), int64(3)
memory usage: 235.7 KB


In [9]:
# Feature matrix: every column except the target.
X = df.drop(columns='price')
X.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
0,1,26.184098,1286.68,204.003817,22,67
1,1,34.866901,1855.25,186.98036,8,30
2,1,36.980709,692.09,111.224999,24,24
3,1,17.445723,1399.49,237.99876,1,66
4,1,52.587646,84.65,100.9964,20,3


In [10]:
# Target vector: the price column.
y = df['price']
y.head()

0    96004.804557
1    92473.722568
2    98112.519942
3    92118.326874
4    98976.653176
Name: price, dtype: float64

In [11]:
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2)

In [12]:
# Preview the training features; the index is out of order because the split shuffles rows.
X_train.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
3286,8,238.640159,1029.44,111.374614,1,54
4280,6,473.708857,123.42,167.717534,1,9
2377,4,203.717225,1112.67,50.735466,2,23
4244,2,220.029788,454.25,77.710586,6,13
1104,2,100.780953,1869.89,100.868884,13,25


In [13]:
# Preview the held-out test features.
X_test.head()

Unnamed: 0,bedroom_count,net_sqm,center_distance,metro_distance,floor,age
151,1,23.670546,1774.98,125.270711,23,18
3867,2,34.815382,1209.04,24.234915,17,52
3164,7,164.494052,787.15,69.409338,1,90
2755,5,209.362234,1170.81,66.298667,3,0
3450,10,120.723984,662.37,49.785438,1,33


In [14]:
# Preview the training targets; the index should line up with X_train.
y_train.head()

3286     98366.903534
4280    106403.678778
2377     99126.692937
4244    102526.419462
1104     90641.134338
Name: price, dtype: float64

In [15]:
# Preview the test targets; the index should line up with X_test.
y_test.head()

151     90429.791221
3867    91026.059515
3164    96813.639519
2755    98810.323710
3450    98027.896979
Name: price, dtype: float64

In [16]:
# Mean-impute both splits; the imputer is fitted on the training features only.
# NOTE(review): the isna() check earlier reported zero missing values, so this
# step presumably changes no values for this dataset; confirm before relying on it.
X_preprocess_train, X_preprocess_test = preprocess_data(X_train, X_test)

In [18]:
# Fit the XGBoost regressor on the imputed training features with n_estimators=100.
model = train_model(X_preprocess_train, y_train, n_estimators=100)

In [20]:
# Predict the target for the held-out (imputed) test features.
y_pred = model.predict(X_preprocess_test)

In [21]:
# Score the test-set predictions and report each metric on its own line.
mae, mse, rmse, r2 = evaluasi_model(y_test, y_pred)

print(
  f'MAE: {mae}',
  f'MSE: {mse}',
  f'RMSE: {rmse}',
  f'R^2: {r2}',
  sep='\n',
)

MAE: 1726.0552667425031
MSE: 5494672.821644661
RMSE: 2344.0718465193554
R^2: 0.6307829421474593
