### Model

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

df = pd.read_csv('../data/processed_dataset.csv')
df.head()

Unnamed: 0,harga,jumlah_kamar_tidur,jumlah_kamar_mandi,luas_tanah,luas_bangunan,carport,pasokan_listrik,kab/kota,kecamatan,keamanan,taman,jarak_rs_terdekat,jarak_sekolah_terdekat,jarak_tol_terdekat
0,0.008829,0.083333,0.04918,0.061611,0.04,0.075,0.042646,2,68,0,0,0.001076,0.003667,0.045251
1,0.00162,0.041667,0.04918,0.010319,0.0153,0.025,0.008703,2,68,1,0,0.001198,0.003473,0.042765
2,0.031155,0.055556,0.081967,0.253657,0.1107,0.2,0.032637,2,10,0,0,0.000954,0.003862,0.047737
3,0.006178,0.041667,0.016393,0.068924,0.0708,0.125,0.018277,2,79,1,0,0.001443,0.003862,0.047737
4,0.00255,0.027778,0.016393,0.013224,0.02,0.125,0.037424,2,29,0,0,0.000587,0.004446,0.055196


In [30]:
from sklearn.model_selection import train_test_split, GridSearchCV

X = df.drop('harga', axis=1)
y = df['harga']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2401 entries, 0 to 2400
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   harga                   2401 non-null   float64
 1   jumlah_kamar_tidur      2401 non-null   float64
 2   jumlah_kamar_mandi      2401 non-null   float64
 3   luas_tanah              2401 non-null   float64
 4   luas_bangunan           2401 non-null   float64
 5   carport                 2401 non-null   float64
 6   pasokan_listrik         2401 non-null   float64
 7   kab/kota                2401 non-null   int64  
 8   kecamatan               2401 non-null   int64  
 9   keamanan                2401 non-null   int64  
 10  taman                   2401 non-null   int64  
 11  jarak_rs_terdekat       2401 non-null   float64
 12  jarak_sekolah_terdekat  2401 non-null   float64
 13  jarak_tol_terdekat      2401 non-null   float64
dtypes: float64(10), int64(4)
memory usage: 2

In [32]:
from sklearn.metrics import mean_absolute_error, r2_score

def evaluate_model(model, X_train, X_test, y_train, y_test):
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    mae_train = round(mean_absolute_error(y_train, y_train_pred), 4) * 100
    mae_test = round(mean_absolute_error(y_test, y_test_pred), 4) * 100
    
    print(f"Mean Absolute Error \n Train: {mae_train} % | Test: {mae_test} %")
    
    return mae_train, mae_test

#### Baseline Model

Linear Regression

In [33]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
mae_train_lr, mae_test_lr = evaluate_model(lr, X_train, X_test, y_train, y_test) 

Mean Absolute Error 
 Train: 3.0700000000000003 % | Test: 3.37 %


SVR (Support Vector Regression)

In [34]:
from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train, y_train)
mae_train_svr, mae_test_svr = evaluate_model(svr, X_train, X_test, y_train, y_test) 

Mean Absolute Error 
 Train: 8.61 % | Test: 8.9 %


XGBRegressor

In [35]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train, y_train)
mae_train_xgb, mae_test_xgb = evaluate_model(xgb, X_train, X_test, y_train, y_test) 

Mean Absolute Error 
 Train: 0.13 % | Test: 2.11 %


Random Forest

In [36]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
mae_train_rf, mae_test_rf = evaluate_model(rf, X_train, X_test, y_train, y_test) 

Mean Absolute Error 
 Train: 0.58 % | Test: 1.7999999999999998 %


Decision Tree

In [37]:
from sklearn.tree import DecisionTreeRegressor

dt = RandomForestRegressor()
dt.fit(X_train, y_train)
mae_train_dt, mae_test_dt = evaluate_model(dt, X_train, X_test, y_train, y_test) 

Mean Absolute Error 
 Train: 0.59 % | Test: 1.8800000000000001 %


Model Evaluation

In [38]:
pd.DataFrame({
    'Baseline Model': ['Linear Regression', 'SVR', 'XGBoost', 'Random Forest', 'Decision Tree'],
    'MAE': [mae_test_lr, mae_test_svr, mae_test_xgb, mae_test_rf, mae_test_dt],
}).sort_values(by='MAE').reset_index(drop=True)

Unnamed: 0,Baseline Model,MAE
0,Random Forest,1.8
1,Decision Tree,1.88
2,XGBoost,2.11
3,Linear Regression,3.37
4,SVR,8.9


In [39]:
pd.DataFrame({
    'Baseline Model': ['Linear Regression', 'SVR', 'XGBoost', 'Random Forest', 'Decision Tree'],
    'MAE Train': [mae_train_lr, mae_train_svr, mae_train_xgb, mae_train_rf, mae_train_dt],
    'MAE Test': [mae_test_lr, mae_test_svr, mae_test_xgb, mae_test_rf, mae_test_dt],
}).sort_values(by=['MAE Train', 'MAE Test']).reset_index(drop=True)

Unnamed: 0,Baseline Model,MAE Train,MAE Test
0,XGBoost,0.13,2.11
1,Random Forest,0.58,1.8
2,Decision Tree,0.59,1.88
3,Linear Regression,3.07,3.37
4,SVR,8.61,8.9


#### Hyperparameter Tuning

In [26]:
from sklearn.model_selection import GridSearchCV, KFold

kfold = KFold(n_splits=5, shuffle=True)

Linear Regression

In [27]:
lr_params = {'fit_intercept': [True, False], 'positive': [True, False]}
best_lr = GridSearchCV(LinearRegression(), lr_params, cv=kfold, n_jobs=-1, scoring='neg_mean_absolute_error')
best_lr.fit(X_train, y_train)

print(f"Best Params: {best_lr.best_params_}")

mae_train_best_lr, mae_test_best_lr = \
evaluate_model(best_lr.best_estimator_, X_train, X_test, y_train, y_test) 

Best Params: {'fit_intercept': False, 'positive': False}
Mean Absolute Error 
 Train: 3.2099999999999995 % | Test: 3.16 %


SVR

In [49]:
from sklearn.model_selection import RandomizedSearchCV

svr_params = {'C': range(1, 6), 'kernel': ['linear', 'rbf']}
best_svr = RandomizedSearchCV(SVR(), svr_params, cv=kfold, n_jobs=-1, scoring='neg_mean_absolute_error')
best_svr.fit(X_train, y_train)

print(f"Best Params: {best_svr.best_params_}")

mae_train_best_svr, mae_test_best_svr = \
evaluate_model(best_svr.best_estimator_, X_train, X_test, y_train, y_test) 

Best Params: {'kernel': 'rbf', 'C': 3}
Mean Absolute Error 
 Train: 8.04 % | Test: 8.27 %


XGBoost

In [44]:
xgb_params = {'n_estimators': range(0, 50, 5), 'learning_rate': np.linspace(0.01, 1, 10), 'max_depth': range(1, 5)}
best_xgb = GridSearchCV(XGBRegressor(), xgb_params, cv=kfold, n_jobs=-1, scoring='neg_mean_absolute_error')
best_xgb.fit(X_train, y_train)

print(f"Best Params: {best_xgb.best_params_}")
mae_train_best_xgb, mae_test_best_xgb = \
evaluate_model(best_xgb.best_estimator_, X_train, X_test, y_train, y_test) 

Best Params: {'learning_rate': 0.23, 'max_depth': 4, 'n_estimators': 25}
Mean Absolute Error 
 Train: 1.16 % | Test: 2.04 %


Random Forest

In [45]:
rf_params = {'n_estimators': range(0, 50, 5), 'max_depth': range(1, 10)}
best_rf = GridSearchCV(RandomForestRegressor(), rf_params, cv=kfold, n_jobs=-1, scoring='neg_mean_absolute_error')
best_rf.fit(X_train, y_train)

print(f"Best Params: {best_rf.best_params_}")
mae_train_best_rf, mae_test_best_rf = \
evaluate_model(best_rf.best_estimator_, X_train, X_test, y_train, y_test) 

Best Params: {'max_depth': 9, 'n_estimators': 45}
Mean Absolute Error 
 Train: 0.8500000000000001 % | Test: 1.9 %


Decision Tree

In [46]:
dt_params = {'max_depth': range(1, 10), 'min_samples_split': range(1, 10)}
best_dt = GridSearchCV(DecisionTreeRegressor(), dt_params, cv=kfold, n_jobs=-1, scoring='neg_mean_absolute_error')
best_dt.fit(X_train, y_train)

print(f"Best Params: {best_dt.best_params_}")
mae_train_best_dt, mae_test_best_dt = \
evaluate_model(best_dt.best_estimator_, X_train, X_test, y_train, y_test) 

Best Params: {'max_depth': 8, 'min_samples_split': 4}
Mean Absolute Error 
 Train: 0.8699999999999999 % | Test: 1.8800000000000001 %


Evaluation Hyperparameter Tuning Model

In [50]:
pd.DataFrame({
    'Model': ['Linear Regression', 'SVR', 'XGBoost', 'Random Forest', 'Decision Tree'],
    'Baseline MAE': [mae_test_lr, mae_test_svr, mae_test_xgb, mae_test_rf, mae_test_dt],
    'Best MAE' : [mae_test_best_lr, mae_test_best_svr, mae_test_best_xgb, mae_test_best_rf, mae_test_best_dt],
}).sort_values(by=['Baseline MAE', 'Baseline MAE']).reset_index(drop=True)

Unnamed: 0,Model,Baseline MAE,Best MAE
0,Random Forest,1.8,1.9
1,Decision Tree,1.88,1.88
2,XGBoost,2.11,2.04
3,Linear Regression,3.37,3.16
4,SVR,8.9,8.27


In [51]:
pd.DataFrame({
    'Model': ['Linear Regression', 'SVR', 'XGBoost', 'Random Forest', 'Decision Tree'],
    'Baseline MAE (Train)': [mae_train_lr, mae_train_svr, mae_train_xgb, mae_train_rf, mae_train_dt],
    'Baseline MAE (Test)': [mae_test_lr, mae_test_svr, mae_test_xgb, mae_test_rf, mae_test_dt],
    
    'Best MAE (Train)' : [mae_train_best_lr, mae_train_best_svr, mae_train_best_xgb, mae_train_best_rf, mae_train_best_dt],
    'Best MAE (Test)' : [mae_test_best_lr, mae_test_best_svr, mae_test_best_xgb, mae_test_best_rf, mae_test_best_dt],
}).sort_values(by=['Baseline MAE (Train)', 'Baseline MAE (Test)', 'Best MAE (Train)', 'Best MAE (Test)']).reset_index(drop=True)

Unnamed: 0,Model,Baseline MAE (Train),Baseline MAE (Test),Best MAE (Train),Best MAE (Test)
0,XGBoost,0.13,2.11,1.16,2.04
1,Random Forest,0.58,1.8,0.85,1.9
2,Decision Tree,0.59,1.88,0.87,1.88
3,Linear Regression,3.07,3.37,3.21,3.16
4,SVR,8.61,8.9,8.04,8.27


### Dumping Model

In [53]:
import joblib

joblib.dump(best_lr, 'lr_model.joblib')
joblib.dump(best_xgb, 'xgb_model.joblib')

joblib.dump(best_rf, 'rf_model.joblib')
joblib.dump(best_dt, 'dt_model.joblib')

['dt_model.joblib']