# Exploring data
We will explore the data using pandas.

Note that pandas can load data directly from an Excel `.xlsx` workbook (here, `HARGA RUMAH MALANG.xlsx`).

In [None]:
import pandas as pd
# Load the house-price sheet; skiprows=1 skips the first row of the sheet
# before the header row is read.
data = pd.read_excel(
    'HARGA RUMAH MALANG.xlsx',
    sheet_name='Sheet1',
    skiprows=1,
)

In [None]:
# Show the loaded DataFrame as plain text (pandas truncates long frames).
print(data)

            HARGA    LT   LB  JKT  JKM        GRS    KOTA
0     28000000000  1100  700    5    6        ADA  MALANG
1     19000000000   824  800    4    4        ADA  MALANG
2      4700000000   500  400    4    3        ADA  MALANG
3      4900000000   251  300    5    4        ADA  MALANG
4     28000000000  1340  575    4    5        ADA  MALANG
...           ...   ...  ...  ...  ...        ...     ...
996   16000000000   488  550    6    5        ADA  MALANG
997    4500000000   209  270    4    4        ADA  MALANG
998   29000000000   692  400    4    3  TIDAK ADA  MALANG
999    1700000000   102  140    4    3  TIDAK ADA  MALANG
1000   1250000000    63  110    3    3  TIDAK ADA  MALANG

[1001 rows x 7 columns]


In [None]:
# Summary statistics of the HARGA column, shown as a one-column table.
data['HARGA'].describe().to_frame()

Unnamed: 0,HARGA
count,1001.0
mean,17474720000.0
std,20795480000.0
min,430000000.0
25%,6750000000.0
50%,13500000000.0
75%,20000000000.0
max,250000000000.0


#  Data Preparation

- `data.drop(columns='KOTA', ...)`: operasi ini menghapus kolom `KOTA` dari
DataFrame `data` karena seluruh barisnya berisi nilai yang sama.

- `.map({'ADA': 1, 'TIDAK ADA': 0})` pada kolom `GRS`: operasi ini
mengganti nilai dalam kolom `GRS` menjadi 1 jika nilainya `'ADA'` dan
menjadi 0 jika nilainya `'TIDAK ADA'`.


In [None]:
# Drop the 'KOTA' column. Rebinding `data` (instead of inplace=True) together
# with errors='ignore' makes this cell safe to re-run: a second run is a no-op
# rather than a KeyError. The original axis=1 was redundant with columns=.
data = data.drop(columns='KOTA', errors='ignore')

In [None]:
# Encode GRS numerically: 'ADA' -> 1, 'TIDAK ADA' -> 0.
data['GRS'] = data['GRS'].map({'ADA': 1, 'TIDAK ADA': 0})

In [None]:
# Preview the first rows of the prepared DataFrame.
data.head()

Unnamed: 0,HARGA,LT,LB,JKT,JKM,GRS
0,28000000000,1100,700,5,6,1
1,19000000000,824,800,4,4,1
2,4700000000,500,400,4,3,1
3,4900000000,251,300,5,4,1
4,28000000000,1340,575,4,5,1


#  Data Segregation

- Menghapus baris yang nilainya berada di luar kisaran persentil 1 hingga 99
(1%–99%) pada kolom `LT`, `LB`, `JKT`, `JKM`, dan `HARGA`, untuk
memperbaiki distribusi data dan menghilangkan nilai-nilai yang ekstrem.
Batas persentil bersifat eksklusif: nilai yang sama persis dengan batas ikut terhapus.


In [None]:
# Trim each listed column in turn to the open interval between its 1st and
# 99th percentiles. The filters are applied sequentially, so each column's
# percentiles are computed on the rows that survived the previous filters.
for kolom in ['LT', 'LB', 'JKT', 'JKM', 'HARGA']:
    batas_bawah = data[kolom].quantile(0.01)
    batas_atas = data[kolom].quantile(0.99)
    data = data[(data[kolom] > batas_bawah) & (data[kolom] < batas_atas)]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Feature matrix: select the predictors by name rather than by position, so the
# selection does not depend on column order and matches the columns that
# `prediksi` builds at prediction time.
X = data[['LT', 'LB', 'JKT', 'JKM', 'GRS']]
X

Unnamed: 0,LT,LB,JKT,JKM,GRS
0,1100,700,5,6,1
1,824,800,4,4,1
2,500,400,4,3,1
3,251,300,5,4,1
4,1340,575,4,5,1
...,...,...,...,...,...
993,169,215,4,4,1
996,488,550,6,5,1
997,209,270,4,4,1
998,692,400,4,3,0


In [None]:
# Target vector: the HARGA column, selected by name instead of by position.
y = data['HARGA']
y

0      28000000000
1      19000000000
2       4700000000
3       4900000000
4      28000000000
          ...     
993     3500000000
996    16000000000
997     4500000000
998    29000000000
999     1700000000
Name: HARGA, Length: 823, dtype: int64

# Split Data Into Training and Testing

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Display the training features produced by the split.
X_train

Unnamed: 0,LT,LB,JKT,JKM,GRS
271,407,850,6,5,1
841,769,400,4,3,1
911,404,837,4,4,1
496,520,600,4,4,0
344,445,1200,6,6,1
...,...,...,...,...,...
83,188,367,3,3,1
122,241,571,4,5,1
305,380,350,6,4,1
500,350,460,6,5,1


In [None]:
# Display the training targets produced by the split.
y_train

271    17000000000
841    11000000000
911    28000000000
496    19000000000
344    16000000000
          ...     
83      7800000000
122    11900000000
305    14000000000
500     9500000000
118     3500000000
Name: HARGA, Length: 658, dtype: int64

# Perform Training

- Model machine learning yang digunakan adalah `RandomForestRegressor`. Random forest adalah model supervised learning yang dapat
digunakan untuk permasalahan regresi maupun klasifikasi. Karena yang diprediksi di sini
adalah harga rumah (sebuah nilai numerik), permasalahannya adalah regresi,
sehingga `RandomForestRegressor` merupakan model yang tepat untuk digunakan.

In [None]:
# Fit a random forest on the training split and predict prices for the test split.
# random_state is fixed so that the fitted forest (and every metric derived from
# it) is reproducible across kernel restarts, like the seeded train/test split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
hasil = model.predict(X_test)

In [None]:
# Display the array of predictions for the test split.
hasil

array([1.80280000e+10, 3.24750000e+09, 6.39466548e+09, 5.50950000e+09,
       1.57139000e+10, 2.19260000e+10, 2.43650000e+09, 1.24660000e+10,
       4.21750000e+09, 2.99896000e+10, 2.12975000e+10, 1.58038000e+10,
       2.20930000e+10, 1.39600000e+10, 1.59375000e+10, 1.75012500e+10,
       1.51203500e+10, 1.36067500e+10, 6.45350000e+09, 4.03100000e+10,
       3.20500000e+09, 1.33970000e+10, 9.69300000e+09, 2.47068000e+10,
       9.53650000e+09, 2.61449000e+10, 2.37500000e+10, 1.29865000e+10,
       5.22290000e+09, 1.12236000e+10, 2.43950000e+10, 2.78760000e+10,
       2.42240000e+10, 2.34010000e+10, 2.02293333e+10, 1.79212500e+10,
       1.07675000e+10, 5.42367857e+10, 3.54430000e+09, 9.69200000e+09,
       6.85000000e+09, 1.11062500e+10, 2.17135000e+10, 2.23465000e+10,
       9.05670000e+09, 4.03100000e+10, 2.40380000e+10, 5.93220000e+09,
       1.12240000e+10, 2.36140000e+09, 3.73250000e+10, 2.81510000e+10,
       7.14270000e+09, 2.10300000e+10, 1.46304000e+10, 1.64851667e+10,
      

# Prediction

In [None]:
def prediksi(LT, LB, JKT, JKM, GRS):
    """Predict a value for a single house with the global ``model``.

    The five arguments are placed in a one-row DataFrame whose columns are
    named and ordered LT, LB, JKT, JKM, GRS, and that frame is passed to
    ``model.predict``.

    Returns the first element of the prediction array.
    """
    fitur = pd.DataFrame({
        'LT': [LT],
        'LB': [LB],
        'JKT': [JKT],
        'JKM': [JKM],
        'GRS': [GRS],
    })
    return model.predict(fitur)[0]

In [None]:
# Predict once and reuse the result; the original called prediksi() a second
# time with the same arguments just to print the same value.
nilai = prediksi(500, 400, 4, 3, 1)
nilai_miliar = nilai / 1e9  # convert to billions ("miliar")

print(nilai)
print("{:.2f} miliar".format(nilai_miliar))

13843600000.0
13.84 miliar


# Model Evaluation

- Model dievaluasi dengan R², Mean Absolute Error (MAE), Mean Squared Error (MSE),
dan Root Mean Squared Error (RMSE). Metrik-metrik ini memberikan informasi
mengenai seberapa baik model dapat memprediksi harga.

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Evaluate the test-split predictions stored in `hasil`. The original referenced
# an undefined `y_pred`, which raises NameError on a fresh kernel.
y_pred = hasil

# Keep results under names that do not shadow the imported metric functions:
# rebinding `r2_score` to a float made this cell fail when run a second time.
r2 = r2_score(y_test, y_pred)
mae_score = mean_absolute_error(y_test, y_pred)
mse_score = mean_squared_error(y_test, y_pred)
# RMSE is the square root of MSE. Passing squared=True returns the plain MSE,
# so the original reported the MSE twice; taking the root here also avoids
# relying on the `squared` keyword at all.
rmse_score = mse_score ** 0.5

print(f'Skor R2: {r2}')
print(f'Skor MAE: {mae_score}')
print(f'Skor MSE: {mse_score}')
print(f'Skor RMSE: {rmse_score}')

Skor R2: 0.6313860796558384
Skor MAE: 4022352431.6760683
Skor MSE: 4.542858335786416e+19
Skor RMSE: 4.542858335786416e+19
