# Library

In [3]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Dataset

In [5]:
import kagglehub

# Download the latest version of the Boston housing dataset through kagglehub.
# `path` keeps whatever dataset_download() returns (printed below so the
# location is visible in the notebook output).
path = kagglehub.dataset_download("willianleite/boston-housing-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/boston-housing-dataset


In [13]:
# Build the CSV location from the directory kagglehub returned instead of
# hard-coding the Kaggle-only "/kaggle/input/..." path, so the notebook also
# runs where the dataset is downloaded to a different directory.
df = pd.read_csv(os.path.join(path, "Boston.csv"))
df.head()

Unnamed: 0.1,Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [14]:
# "Unnamed: 0" looks like a 1-based row counter stored in the CSV (see the
# head() preview above) and is not a feature, so it is removed.
# errors="ignore" keeps this cell idempotent: because it reassigns `df`,
# re-running it would otherwise raise KeyError once the column is gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [15]:
# "medv" (house cost) is the regression target; every other column is a feature.
target_col = "medv"
y = df[target_col]
X = df.drop(columns=[target_col])

In [17]:
# Sanity check: features and target must contain the same number of rows.
print(len(X), len(y), sep="\n")

506
506


In [18]:
# Plain-text dump of the target column to eyeball its values and dtype.
print(y)

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: medv, Length: 506, dtype: float64


# Split data

In [20]:
# Hold out the first 20% of the rows for testing and train on the remainder.
# NOTE(review): the rows are taken in file order without shuffling, so the
# test split is simply the first `test_count` rows; consider a seeded shuffle
# if the file order is not random.
test_size = 0.2
test_count = int(len(X) * test_size)

# Explicit positional slicing of features and target at the same cut point.
X_test = X.iloc[:test_count]
X_train = X.iloc[test_count:]
y_test = y.iloc[:test_count]
y_train = y.iloc[test_count:]

In [22]:
# Report the size of each split: test first, then train.
print(len(X_test), len(X_train), sep="\n")

101
405


# Normalizing 

In [23]:
def standardize(X_train, X_test):
    """Z-score both splits using statistics computed on the training split only.

    Every column is transformed as ``(x - mean) / std`` where ``mean`` and
    ``std`` come from ``X_train``. The test split reuses those same values so
    that nothing from the test data influences the scaling. (Max-scaling,
    ``X / X.max()``, would be an alternative; it is not what this function does.)

    ``std`` is whatever ``X_train.std(axis=0)`` returns: pandas objects use
    the sample standard deviation (ddof=1), NumPy arrays the population one
    (ddof=0).

    Columns whose training std is exactly 0 (constant features) are divided by
    1 instead of 0, so they become 0 rather than NaN/inf.

    Parameters
    ----------
    X_train : DataFrame or ndarray
        Training features, one row per sample.
    X_test : DataFrame or ndarray
        Test features with the same columns as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)

    # Adding the boolean mask turns every exact-zero std into 1 and leaves all
    # other entries untouched, while keeping the container type (and therefore
    # pandas label alignment) unchanged.
    std = std + (std == 0)

    X_train_scaled = (X_train - mean) / std
    X_test_scaled = (X_test - mean) / std   # reuse the training mean/std
    return X_train_scaled, X_test_scaled

In [24]:
# Scale both splits with the training split's statistics (see standardize()).
X_train_scaled, X_test_scaled = standardize(X_train, X_test)

In [40]:
# First five rows of the unscaled features, for comparison with the scaled ones.
X.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [38]:
# First five rows of the standardized training features.
X_train_scaled.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
101,-0.459036,-0.474975,-0.520355,-0.307182,-0.466356,0.650036,-0.010517,-0.284078,-0.645997,-0.306349,1.108679,0.461221,-0.737893
102,-0.446904,-0.474975,-0.520355,-0.307182,-0.466356,0.14615,0.501474,-0.351008,-0.645997,-0.306349,1.108679,-2.790358,-0.337517
103,-0.448722,-0.474975,-0.520355,-0.307182,-0.466356,-0.213004,0.574097,-0.351008,-0.645997,-0.306349,1.108679,0.450108,0.042569
104,-0.456356,-0.474975,-0.520355,-0.307182,-0.466356,-0.1728,0.668507,-0.490026,-0.645997,-0.306349,1.108679,0.432288,-0.107572
105,-0.457096,-0.474975,-0.520355,-0.307182,-0.466356,-0.596279,0.911794,-0.638701,-0.645997,-0.306349,1.108679,0.445903,0.452413


# My Model

In [25]:
# Build model
class Regression():
    """Linear regression with an intercept, fitted by least squares.

    Parameters
    ----------
    type : str, default "apply equation"
        ``"apply equation"`` solves the normal equations in closed form.
        Any other value selects batch gradient descent.
    alpha : float, default 0.01
        Learning rate used by gradient descent.
    epochs : int, default 100
        Number of full-batch gradient descent steps. With the default
        ``alpha`` this is usually too few to converge fully; raise ``alpha``
        and/or ``epochs`` for a tighter fit.

    Attributes
    ----------
    w : ndarray or None
        Feature weights ``[w1, ..., wn]``; ``None`` until ``fit`` is called.
    w0 : float or None
        Intercept; ``None`` until ``fit`` is called.
    """

    def __init__(self, type="apply equation", alpha=0.01, epochs=100):
        self.w = None
        self.w0 = None
        self.type = type
        self.alpha = alpha
        self.epochs = epochs

    def fit(self, X_train, y_train):
        """Estimate ``w`` and ``w0`` from the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If the training set is empty, if ``X_train`` and ``y_train`` have
            different lengths, or (closed-form solver only) if ``X^T X`` is
            singular.
        """
        # Work on plain float arrays so pandas index alignment cannot interfere.
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)  # a single feature given as a flat vector
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                "X_train and y_train have different lengths: "
                f"{n_samples} != {y.shape[0]}"
            )

        # Prepend a column of ones so the intercept is learned as params[0].
        X_aug = np.hstack([np.ones((n_samples, 1)), X])

        if self.type == "apply equation":
            try:
                # Solve (X^T X) params = X^T y directly; this is more accurate
                # and cheaper than forming the explicit inverse.
                params = np.linalg.solve(X_aug.T @ X_aug, X_aug.T @ y)
            except np.linalg.LinAlgError as exc:
                # Only a genuinely singular matrix is reported as such; any
                # other error propagates unchanged instead of being relabelled.
                raise ValueError('XTX không khả nghịch') from exc
        else:
            params = np.ones(X_aug.shape[1])
            for _ in range(self.epochs):
                # Gradient of the *mean* squared error. Dividing by n_samples
                # keeps the step size independent of the data set size;
                # without it the updates diverge for all but tiny data sets.
                grad = 2 * X_aug.T @ (X_aug @ params - y) / n_samples
                params = params - self.alpha * grad

        self.w = params[1:]
        self.w0 = params[0]

    def predict(self, X_test):
        """Return ``X_test @ w + w0``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.w is None or self.w0 is None:
            raise RuntimeError("Regression.predict() called before fit()")
        return X_test@self.w + self.w0

# Training

In [27]:
# Fit the hand-written model with its default solver on the scaled training data.
regr_myself = Regression()
regr_myself.fit(X_train_scaled, y_train)

# Show the learned parameters: feature weights first, then the intercept.
for caption, learned in (
    ("[w1, ... w_n] = ", regr_myself.w),
    ("w0 = ", regr_myself.w0),
):
    print(caption, learned)

[w1, ... w_n] =  [-1.02931083  1.382778    0.39746536  0.71316583 -2.51290171  2.44284235
 -0.11723319 -3.7558184   2.96398287 -2.21580455 -2.30899041  0.92781705
 -4.29731965]
w0 =  22.575802469135805


# Testing

In [28]:
# Predict on the scaled test split and preview the first ten predictions.
y_pred_myself = regr_myself.predict(X_test_scaled)
y_pred_myself.head(10)

0    30.211907
1    24.790656
2    30.300002
3    28.071362
4    27.151249
5    24.833047
6    22.882569
7    18.687299
8    10.296962
9    18.134281
dtype: float64

# Evaluate

In [31]:
def RMSE(y_pred,y_true):
    """Root-mean-squared error between predictions and ground truth.

    Both inputs are converted to NumPy arrays first, so the values are
    compared position by position (the same convention as
    ``sklearn.metrics.mean_squared_error``). Without the conversion, two
    pandas Series with different index labels would be aligned by label and
    silently produce NaN.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_true : array-like
        Observed values, in the same order as ``y_pred``.

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    predicted = np.asarray(y_pred, dtype=float)
    observed = np.asarray(y_true, dtype=float)
    return np.sqrt(np.mean((predicted - observed) ** 2))
# Report the hand-written model's RMSE on the held-out test split.
print("RMSE:",RMSE(y_pred_myself,y_test))

RMSE: 3.5460811398199366


# Using library

In [33]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column statistics from the training split only, then apply
# the same transformation to both splits.
# NOTE(review): StandardScaler divides by the population std (ddof=0), while
# pandas' .std() defaults to the sample std (ddof=1); that would explain the
# slightly different coefficients between the two models further down.
scaler = StandardScaler().fit(X_train)
X_train_scaled_2 = scaler.transform(X_train)
X_test_scaled_2 = scaler.transform(X_test)

In [34]:
# library
# Reference model: scikit-learn's LinearRegression, fitted on the
# StandardScaler-scaled training features.
regr_library = linear_model.LinearRegression()
regr_library.fit(X_train_scaled_2,y_train)

In [35]:
# Library
# The library model was already fitted in the previous cell, so it is not
# fitted a second time here; only its learned parameters are reported.
print("[w1, ... w_n] = ", regr_library.coef_)
print("w0 = ", regr_library.intercept_)

[w1, ... w_n] =  [-1.02803929  1.38106981  0.39697436  0.71228484 -2.50979744  2.43982463
 -0.11708837 -3.75117873  2.96032137 -2.2130673  -2.30613804  0.92667089
 -4.29201104]
w0 =  22.575802469135798


In [36]:
# Predict on the StandardScaler-scaled test split and preview the first ten values.
y_pred_library = regr_library.predict(X_test_scaled_2)
y_pred_library[:10]

array([30.21190666, 24.79065611, 30.30000152, 28.07136213, 27.151249  ,
       24.83304736, 22.8825691 , 18.68729877, 10.29696169, 18.13428106])

In [37]:
# mean_squared_error is already imported in the notebook's first cell, so the
# duplicate import is dropped. Arguments follow scikit-learn's
# (y_true, y_pred) order; MSE is symmetric, so the printed value is unchanged.
print("RMSE-library:",np.sqrt(mean_squared_error(y_test, y_pred_library)))

RMSE-library: 3.5460811398199343
