<a href="https://colab.research.google.com/github/SatriaImawan12/House-Price-Prediction-System-with-Multiple-Linear-Regression/blob/main/House_Price_Prediction_System_with_Multiple_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Regresi Linear Berganda**

In [None]:
# Silence every Python warning for the rest of the session (no category or
# module filter is given, so the "ignore" action applies to all of them).
# NOTE(review): presumably done to keep cell output clean; it also hides
# warnings that could point at real problems.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Import the required library
import pandas as pd

# Data extraction: read the CSV file, then keep only the two predictor
# columns and the target column in a separate dataframe
data_houseprice = pd.read_csv('price_house.csv')
data_houseprice_two_predictor = data_houseprice.loc[:, ['total_rooms', 'median_income', 'median_house_value']]

display(data_houseprice_two_predictor)

Unnamed: 0,total_rooms,median_income,median_house_value
0,880.0,8.3252,452600.0
1,7099.0,8.3014,358500.0
2,1467.0,7.2574,352100.0
3,1274.0,5.6431,341300.0
4,1627.0,3.8462,342200.0
...,...,...,...
19643,1665.0,1.5603,78100.0
19644,697.0,2.5568,77100.0
19645,2254.0,1.7000,92300.0
19646,1860.0,1.8672,84700.0


In [None]:
# General information about the data (dtypes, non-null counts, memory usage)
data_houseprice_two_predictor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19648 entries, 0 to 19647
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   total_rooms         19648 non-null  float64
 1   median_income       19648 non-null  float64
 2   median_house_value  19648 non-null  float64
dtypes: float64(3)
memory usage: 460.6 KB


# **2. Data Cleansing**

## **2.1 Periksa Missing Value**

In [None]:
# Count the missing values in each column
data_houseprice_two_predictor.isna().sum()

total_rooms           0
median_income         0
median_house_value    0
dtype: int64

## **2.2 Periksa Duplicated Data**

In [None]:
# Count the rows that are exact duplicates of an earlier row
data_houseprice_two_predictor.duplicated().sum()

0

# **3. Exploratory Data Analysis**

## **3.1 Statistika Deskriptif**

In [None]:
def statistika_deskriptif(data):
    """Display a table of descriptive statistics for every column of ``data``.

    The table has one row per statistic (mean, median, modus, min, max,
    q1, q2, q3, stdev) and one column per column of ``data``. The mean and
    the standard deviation are rounded to two decimals; the other values are
    shown as computed. ``modus`` is the first row returned by ``data.mode()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    None
        The table is rendered with ``display``; nothing is returned.
    """
    # Central tendency and range
    summary = {
        'mean': data.mean().round(2),
        'median': data.median(),
        'modus': data.mode().iloc[0],
        'min': data.min(),
        'max': data.max(),
    }

    # q1, q2 and q3 are the 0.25, 0.50 and 0.75 quantiles
    for number, probability in enumerate((0.25, 0.50, 0.75), start=1):
        summary[f'q{number}'] = data.quantile(probability)

    # Spread
    summary['stdev'] = data.std().round(2)

    # Transpose so that each statistic becomes a row
    statistics = pd.DataFrame(summary).T
    display(statistics)

In [None]:
# Show the descriptive statistics of the selected columns
statistika_deskriptif(data_houseprice_two_predictor)

Unnamed: 0,total_rooms,median_income,median_house_value
mean,2620.34,3.68,192055.33
median,2111.0,3.44905,173600.0
modus,1527.0,3.125,137500.0
min,2.0,0.4999,14999.0
max,39320.0,15.0001,499100.0
q1,1438.0,2.5263,116475.0
q2,2111.0,3.44905,173600.0
q3,3121.0,4.5825,247900.0
stdev,2182.37,1.57,97110.85


## **3.2 Korelasi**

In [None]:
# Data visualisation library
import plotly.express as px

# Heatmap of the pairwise correlation matrix, with the value printed per cell
fig = px.imshow(
    data_houseprice_two_predictor.corr(),
    color_continuous_scale = 'Blues',
    text_auto = True,
)

# Hide the colour scale legend
fig.update_coloraxes(showscale = False)

# Figure size, centred title and margins
fig.update_layout(
    title = {
        'text': "<b>Korelasi Income dan House Value</b>",
        'font': {'size': 19, 'color': '#0E2954'},
        'x': 0.5,
        'y': 0.95,
    },
    margin = {'t': 80, 'b': 30, 'r': 50, 'l': 50},
    width = 800,
    height = 600,
)

# Render the figure
fig.show()

In [None]:
# Visualisation library
import plotly.express as px

# Scatter plot: median income on x, median house value on y
fig = px.scatter(
    data_frame = data_houseprice,
    x = 'median_income',
    y = 'median_house_value',
    title = '<b>Median Income VS Median House Value</b>',
    opacity = 0.5,
    color_discrete_sequence = ['#0E2954'],
)

# Render the figure
fig.show()

In [None]:
# Visualisation library
import plotly.express as px

# Scatter plot: total rooms on x, median house value on y
fig = px.scatter(
    data_frame = data_houseprice,
    x = 'total_rooms',
    y = 'median_house_value',
    title = '<b>Total Rooms VS Median House Value</b>',
    opacity = 0.5,
    color_discrete_sequence = ['#FF8911'],
)

# Render the figure
fig.show()

In [None]:
import plotly.graph_objects as go

# Build the 3D scatter plot of both predictors against the target.
# The points are read from data_houseprice_two_predictor because this cell sits
# above the train/test split: X_train / y_train are not defined yet at this
# point of a top-to-bottom run, so using them here would raise NameError on a
# fresh kernel.
fig = go.Figure(
    data = [
        go.Scatter3d(
            x = data_houseprice_two_predictor['total_rooms'],
            y = data_houseprice_two_predictor['median_income'],
            z = data_houseprice_two_predictor['median_house_value'],
            mode = 'markers',
            marker = dict(
                size = 5,
                color = '#A367B1',
                opacity = 0.5
            ),
            hovertemplate='Total Rooms: %{x}<br>Median Income: %{y}<br>House Price: %{z}'
        )
    ]
)

# Figure size, title and axis labels
fig.update_layout(
    width = 700,
    height = 700,
    title = dict(
        text = '<b>Total Rooms vs Median Income vs Median House Price</b>',
        font = dict(
            size = 20
        )
    ),
    scene = dict(
        xaxis_title = 'Total Rooms',
        yaxis_title = 'Median Income',
        zaxis_title = 'Median House Price'
    )
)

# Render the plot
fig.show()

## **3.3 Distribusi Data**

In [None]:
import plotly.express as px

# One colour per column, in column order
colors = ['#492E87', '#FFA33C', '#FF8911']

# Draw one histogram (with a marginal box plot) for every column
for col, colour in zip(data_houseprice_two_predictor.columns, colors):
    fig = px.histogram(
        data_houseprice_two_predictor,
        x = col,
        marginal = 'box',
        color_discrete_sequence  = [colour],
        nbins = 50
    )

    # Thin white outline so adjacent bars are visually separated
    fig.update_traces(
        marker_line_width = 1,
        marker_line_color = 'white'
    )

    fig.update_layout(
        plot_bgcolor = 'rgba(0, 0, 0, 0)',
        title = dict(
            # The column name is coloured like its bars; the <span> is closed
            # explicitly so the title markup is well-formed
            text = f"<b>Distribusi <span style='color:{colour}'>{col}</span></b>",
            font = dict(
                size = 28,
                color = '#757882'
            ),
            y = 0.92,
            x = 0.5
        ),
        # Hide every y-axis decoration (title, grid, line, ticks, zero line)
        yaxis = dict(
            title = '',
            showgrid = False,
            showline = False,
            showticklabels = False,
            zeroline = False,
        ),
        margin = dict(
            t = 80,
            b = 10,
            r = 20
        )
    )

    fig.show()

# **4. Modelling**

## **4.1 Data Splitting**

In [None]:
# Import the library used to split the data
from sklearn.model_selection import train_test_split

# Define the independent variables (X) and the dependent variable (y)
X = data_houseprice_two_predictor[['total_rooms', 'median_income']]
y = data_houseprice_two_predictor['median_house_value']

# Split: 20% of the rows go to the test set; random_state fixes the shuffle
# so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42
)

# Print the number of rows in the full, train and test sets
print(f'Jumlah data asli  : {len(X)}')
print(f'Jumlah data train : {len(X_train)}')
print(f'Jumlah data test  : {len(X_test)}')

Jumlah data asli  : 19648
Jumlah data train : 15718
Jumlah data test  : 3930


## **4.2 Model Fitting**

In [None]:
# Import the linear regression estimator
from sklearn.linear_model import LinearRegression

# Instantiate the LinearRegression class
modelLR = LinearRegression()

# Fit the model on the training split
modelLR.fit(X_train, y_train)

## **4.3 Prediksi**

In [None]:
# Predict the test split with the fitted model
y_pred_test = modelLR.predict(X_test)

In [None]:
# pandas is used to assemble the comparison table
import pandas as pd

# Replace the index of the test split with 0..n-1 so the rows line up
# positionally with the prediction frame built below
X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

# Wrap the prediction array in a single-column DataFrame
y_pred_test_df = pd.DataFrame(
    y_pred_test,
    columns = ['median_house_value_predicted'],
)

# Put predictors, actual target and predicted target side by side
result_df = pd.concat(
    [X_test, y_test, y_pred_test_df],
    axis = 1,
)

# Absolute difference between the actual and the predicted value
result_df['residu'] = (
    result_df['median_house_value'] - result_df['median_house_value_predicted']
).abs()

# Show the table
display(result_df)

Unnamed: 0,total_rooms,median_income,median_house_value,median_house_value_predicted,residu
0,1807.0,5.0357,329800.0,246248.300912,83551.699088
1,1683.0,4.7604,294700.0,235251.138604,59448.861396
2,2505.0,4.8359,195700.0,238275.657513,42575.657513
3,5568.0,3.0795,161500.0,168156.325992,6656.325992
4,2714.0,3.6031,275000.0,189038.430641,85961.569359
...,...,...,...,...,...
3925,2033.0,3.6667,418400.0,191571.268614,226828.731386
3926,1999.0,2.8750,96000.0,159949.454299,63949.454299
3927,1170.0,1.6098,87500.0,109406.819890,21906.819890
3928,5357.0,1.9311,138100.0,122285.558498,15814.441502


## **4.4 Evaluasi Kinerja Model**

In [None]:
# Metric functions used for the evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict the training split with the fitted model
y_pred_train = modelLR.predict(X_train)

# Compute both error metrics on the training split
MSE, MAE = (
    mean_squared_error(y_train, y_pred_train),
    mean_absolute_error(y_train, y_pred_train),
)

# Report the results
print(f'Mean Squared Error Data Train  : {MSE}')
print(f'Mean Absolute Error Data Train : {MAE}')

Mean Squared Error Data Train  : 5445074944.116557
Mean Absolute Error Data Train : 56213.483070828406


In [None]:
# Metric functions used for the evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict the test split with the fitted model
y_pred_test = modelLR.predict(X_test)

# Compute both error metrics on the test split
MSE, MAE = (
    mean_squared_error(y_test, y_pred_test),
    mean_absolute_error(y_test, y_pred_test),
)

# Report the results
print(f'Mean Squared Error Data Test  : {MSE}')
print(f'Mean Absolute Error Data Test : {MAE}')

Mean Squared Error Data Test  : 5649557550.562728
Mean Absolute Error Data Test : 57279.990640182754


## **4.5 Representasi Model**

In [None]:
# Print the fitted coefficients (one per predictor, in the column order of
# X_train) and the intercept of the linear model
print("Koefisien (slope):", modelLR.coef_)
print("Intersep (intercept):", modelLR.intercept_)

Koefisien (slope): [1.08986828e-02 3.99411946e+04]
Intersep (intercept): 45096.73337238576


Model Regresi : `y = slope1 * x1 + slope2 * x2 + intercept`
> ### Median House Value = 0.0109 \* Total Rooms + 39941.19 \* Median Income + 45096.73

In [None]:
import numpy as np
import plotly.graph_objects as go

# Pull the fitted parameters out of the model: one coefficient per predictor
intercept = modelLR.intercept_
coef_x1, coef_x2 = modelLR.coef_

# Evenly spaced values spanning the observed range of each predictor
x1_range = np.linspace(X_train['total_rooms'].min(), X_train['total_rooms'].max(), num = 20)
x2_range = np.linspace(X_train['median_income'].min(), X_train['median_income'].max(), num = 20)

# Evaluate the regression equation on the grid to obtain the plane
x1_grid, x2_grid = np.meshgrid(x1_range, x2_range)
z_grid = intercept + coef_x1 * x1_grid + coef_x2 * x2_grid

# 3D figure: training points plus the fitted regression plane
fig = go.Figure(
    data = [
        go.Scatter3d(
            x = X_train['total_rooms'],
            y = X_train['median_income'],
            z = y_train,
            mode = 'markers',
            marker = {'size': 5, 'color': '#A367B1', 'opacity': 0.5},
            hovertemplate = 'Total Rooms: %{x}<br>Median Income: %{y}<br>House Price: %{z}',
        ),
        go.Surface(
            x = x1_range,
            y = x2_range,
            z = z_grid,
            # Same colour at both ends of the scale gives a single-colour plane
            colorscale = [[0, '#124076'], [1, '#124076']],
            opacity = 0.7,
            showscale = False,
        ),
    ]
)

# Title, figure size and axis labels
fig.update_layout(
    title = {
        'text': '<b>Hasil Regresi Linear Ganda</b>',
        'font': {'size': 30},
    },
    scene = {
        'xaxis_title': 'Total Rooms',
        'yaxis_title': 'Median Income',
        'zaxis_title': 'Median House Price',
    },
    width = 700,
    height = 700,
)

# Render the plot
fig.show()

# **5. Uji Lanjut**



## **5.1 Statistik Lanjut**

* `R-squared` : Berfungsi untuk mengetahui persentase besarnya pengaruh
predictor terhadap criterion. Nilainya berkisar dari 0 (makin buruk hasil) sampai dengan 1 (makin baik hasil)
*  `Adj. R-squared` : R2 dengan mempertimbangkan jumlah sample data dan jumlah variabel yang digunakan.
* `Prob (F-statistic)` : Uji pengaruh predictor terhadap criterion. Jika nilainya kurang dari alpha (toleransi error), maka regresi Y atas X signifikan, artinya predictor berpengaruh terhadap criterion. Jika nilainya lebih dari atau sama dengan alpha, maka predictor tidak berpengaruh secara signifikan terhadap criterion.
* `Log-Likelihood` : Memberikan informasi tentang kesesuaian model dengan data tanpa mempertimbangkan kompleksitas model
* `AIC dan BIC` : Metrik yang menggabungkan kesesuaian model terhadap data dan kompleksitas model (semakin kecil nilainya, semakin baik kualitas model).

https://www.statsmodels.org/stable/regression.html

In [None]:
# Libraries needed for the statistical summary
import statsmodels.api as sm
from statsmodels.tools import add_constant

# Use the training split as the regression data.
# NOTE(review): X and y were bound to the full dataset in the splitting cell;
# from here on they refer to the training split (X with a constant column).
y = y_train

# Add a constant column to the predictors so the model can estimate an intercept
X = add_constant(X_train)

# Ordinary Least Squares (OLS) with statsmodels
model = sm.OLS(y, X)

# Fit the model and keep the results object
result = model.fit()

# Build the summary with a 5% significance level for the confidence intervals
stat_summary = result.summary(alpha = 0.05)

# Show the summary
print(stat_summary)

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.420
Model:                            OLS   Adj. R-squared:                  0.420
Method:                 Least Squares   F-statistic:                     5694.
Date:                Wed, 27 Mar 2024   Prob (F-statistic):               0.00
Time:                        04:51:08   Log-Likelihood:            -1.9849e+05
No. Observations:               15718   AIC:                         3.970e+05
Df Residuals:                   15715   BIC:                         3.970e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const           4.51e+04   1551.645     29.064

## **5.2 Uji Heteroskedastisitas**

Prinsip dari cara mendeteksi heteroskedastisitas dengan grafik adalah memeriksa pola `residual`
terhadap taksiran dari `y_pred`

Heteroskedastisitas terjadi bila varians error tidak konstan, sehingga seakan-akan ada beberapa kelompok data yang mempunyai besaran error yang berbeda-beda. Akibatnya, bila residual diplotkan terhadap nilai prediksi, titik-titiknya akan membentuk suatu pola yang sistematis.

<img src="https://jagostat.com/img/anareg/homoskedastisitas/pola%20homoskedastis%20dan%20heteroskedastis.jpg">

In [None]:
# Visualisation library
import plotly.express as px

# Residual = actual value minus predicted value on the test split
residuals = y_test - y_pred_test

# Scatter plot of the residuals against the predicted values
fig = px.scatter(
    x = y_pred_test,
    y = residuals,
    color_discrete_sequence  = ['#444eff'],
    opacity = 0.5,
)

# Dashed reference line at zero residual
fig.add_hline(
    y = 0,
    line_dash = 'dash',
    line_color = '#12086F'
)

# Figure size, centred title, axis titles and margins
fig.update_layout(
    width = 1200,
    height = 800,
    title = dict(
        text = "<b>Residual VS Hasil Prediksi</b>",
        font = dict(
            size = 30,
            color = '#12086F'
        ),
        y = 0.95,
        x = 0.5
    ),
    # Explicit axis titles: x and y are passed without a dataframe, so the
    # automatically derived labels do not say what each axis shows
    xaxis_title = 'Hasil Prediksi',
    yaxis_title = 'Residual',
    margin = dict(
        t = 80,
        b = 30,
        r = 50,
        l = 50
    )
)

# Render the figure
fig.show()

---