In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RANSACRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from time import time

In [2]:
# Read the semicolon-separated 'Rumah Jabodetabek' CSV straight from GitHub
# and print a summary of its columns and dtypes.
DATA_URL = 'https://raw.githubusercontent.com/Peksyaji/My_Home/main/Data/Rumah%20Jabodetabek.csv'
df = pd.read_csv(DATA_URL, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   lokasi          607 non-null    object
 1   LT              607 non-null    int64 
 2   LB              607 non-null    int64 
 3   KT              607 non-null    int64 
 4   KM              607 non-null    int64 
 5   listrik         607 non-null    int64 
 6   garasi_carport  607 non-null    object
 7   harga           607 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 38.1+ KB


In [3]:
# Integer codes for the two categorical (object) columns.
# Series.map() silently turns every value that is missing from its dict into
# NaN, so both mappings are validated before any column is overwritten.
category_codes = {
    # Location encoder
    'lokasi': {
        'Kota Jakarta': 0,
        'Kota Bogor': 1,
        'Kabupaten Bogor': 2,
        'Kota Depok': 3,
        'Kota Tangerang': 4,
        'Kota Bekasi': 5,
        'Kabupaten Bekasi': 6,
    },
    # Garage/carport encoder
    'garasi_carport': {
        'Ada': 0,
        'Tidak ada': 1,
    },
}

# Phase 1: encode into temporaries and check that nothing was left unmapped.
encoded_columns = {}
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    unmapped = df.loc[encoded.isna(), column]
    if not unmapped.empty:
        # Reached when the data holds a category that is not listed above, or
        # when this cell is re-run on a column that has already been encoded.
        raise ValueError(
            "Column %r contains values without a code: %s. "
            "Reload df before re-running this cell."
            % (column, sorted(unmapped.astype(str).unique()))
        )
    encoded_columns[column] = encoded

# Phase 2: only now overwrite the columns, so a failure leaves df untouched.
for column, encoded in encoded_columns.items():
    df[column] = encoded

In [4]:
# Separate the independent variables from the target variable.
# The target is the last column of df; everything before it is a feature.
target_col = df.columns[-1]
X = df.drop(columns=target_col)
y = np.log(df[target_col])  # natural-log transform of the price (harga) column

In [5]:
# Split into training and test data: 30% of the rows are held out for testing,
# with a fixed random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Regression Modelling

In [16]:
# Candidate regressors to compare; each one is created with the same
# random_state so repeated runs give the same results.
regresi = [
    estimator_cls(random_state=42)
    for estimator_cls in (RANSACRegressor, DecisionTreeRegressor, RandomForestRegressor)
]

In [20]:
# Fit, time and score each candidate regressor.
head = 4  # slice bound; a bound beyond the end of the list simply selects every model
for model in regresi[:head]:
    # Wall-clock time spent fitting on the training split.
    t_fit = time()
    model.fit(X_train, y_train)
    train_time = time() - t_fit

    # Wall-clock time spent predicting the held-out test split.
    t_pred = time()
    y_pred = model.predict(X_test)
    predict_time = time() - t_pred

    # 5-fold cross-validation on the training split. The scorer yields negated
    # MSE values, so the sign is flipped to report plain MSE per fold.
    score_val_sil = -cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )

    # Test-split metrics (computed on the log-transformed target).
    explained_var = explained_variance_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(model)
    print("\tWaktu latih: %0.3fs" % train_time)
    print("\tWaktu prediksi: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_var)
    print("\tMSE:", mse)
    print("\tR2 score:", r2)
    print('\tMSE validasi silang:', score_val_sil)
    print('\tRata-rata MSE validasi silang:', np.mean(score_val_sil))
    print()

RANSACRegressor(random_state=42)
	Waktu latih: 0.069s
	Waktu prediksi: 0.001s
	Explained variance: 0.6848717996335756
	MSE: 0.3755889908736598
	R2 score: 0.6774975188789456
	MSE validasi silang: [0.57752817 0.5295905  0.4532338  0.41772226 0.4063606 ]
	Rata-rata MSE validasi silang: 0.47688706631020006

DecisionTreeRegressor(random_state=42)
	Waktu latih: 0.003s
	Waktu prediksi: 0.001s
	Explained variance: 0.5576167168246708
	MSE: 0.524760036971141
	R2 score: 0.549410610990734
	MSE validasi silang: [0.36443689 0.36398484 0.23161175 0.29216962 0.33236159]
	Rata-rata MSE validasi silang: 0.31691293845261415

RandomForestRegressor(random_state=42)
	Waktu latih: 0.216s
	Waktu prediksi: 0.011s
	Explained variance: 0.7849550167261943
	MSE: 0.2584156577588465
	R2 score: 0.7781093354363229
	MSE validasi silang: [0.22526277 0.24864781 0.16715839 0.20926394 0.14317266]
	Rata-rata MSE validasi silang: 0.19870111428960074



# Hyperparameter Tuning

In [25]:
# Hyperparameter grid for the random forest: every combination of split
# criterion, max_features setting and bootstrap on/off is evaluated.
parameter = {'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
             'max_features':['sqrt', 'log2', None],
             'bootstrap':[True, False]
             }

# Tune on the training split only. Searching on the full X, y would let the
# rows of X_test / y_test influence the chosen hyperparameters, so any score
# later computed on that test split would be optimistically biased.
# NOTE: after re-running, re-check that the parameters hard-coded for the final
# model still match the best_params_ printed here.
grid = GridSearchCV(RandomForestRegressor(random_state=42), parameter)
model = grid.fit(X_train, y_train)  # fit() returns the fitted GridSearchCV object
print(model.best_params_,'\n')
print(model.best_estimator_,'\n')

{'bootstrap': True, 'criterion': 'squared_error', 'max_features': 'sqrt'} 

RandomForestRegressor(max_features='sqrt', random_state=42) 



In [26]:
# Final model: a random forest using the max_features setting reported by the
# grid search, trained on the training split.
rfr = RandomForestRegressor(random_state=42, max_features='sqrt')
rfr.fit(X=X_train, y=y_train)

# Simulasi

In [27]:
# Prediction simulation: one hypothetical house, described with the same
# feature columns (and the same integer encodings) the model was trained on.
rumah_simulasi = {
    'lokasi': 1,          # code 1 = 'Kota Bogor' in the location encoder
    'LT': 60,
    'LB': 60,
    'KT': 3,
    'KM': 2,
    'listrik': 2200,
    'garasi_carport': 1,  # code 1 = 'Tidak ada' in the garage/carport encoder
}
df_test = pd.DataFrame([rumah_simulasi])
df_test.iloc[0:1]

Unnamed: 0,lokasi,LT,LB,KT,KM,listrik,garasi_carport
0,1,60,60,3,2,2200,1


In [28]:
# The model predicts on the log scale because the target was transformed with
# np.log before training; np.exp is the direct inverse and converts the
# prediction back to a price.
pred_test = rfr.predict(df_test[0:1])
harga = np.exp(pred_test)
print('Harga rumah yang Anda idamkan sekitar Rp','%d'%harga[0])

Harga rumah yang Anda idamkan sekitar Rp 650934590
