# Bai 2

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo

In [50]:
auto_mpg = fetch_ucirepo(id=9)

In [51]:
data = auto_mpg.data.original
data.head()

Unnamed: 0,car_name,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,"chevrolet,chevelle,malibu",8,307.0,130.0,3504,12.0,70,1,18.0
1,"buick,skylark,320",8,350.0,165.0,3693,11.5,70,1,15.0
2,"plymouth,satellite",8,318.0,150.0,3436,11.0,70,1,18.0
3,"amc,rebel,sst",8,304.0,150.0,3433,12.0,70,1,16.0
4,"ford,torino",8,302.0,140.0,3449,10.5,70,1,17.0


## Tien xu ly
Because the **horsepower** column has missing values, I need to count the number of missing rows. If it's less than **5%**, I will delete the missing values. 

In [52]:
num_missing = len(data[data["horsepower"].isnull() == True])
print(f"Number of rows with missing values: {num_missing} rows")

Number of rows with missing values: 6 rows


In [53]:
new_data = data.dropna()

print(f"Initial number of lines: {data.shape[0]}")
print(f"Number of lines after deletion: {new_data.shape[0]}")

Initial number of lines: 398
Number of lines after deletion: 392


In dataset, we can see the **car_name** column, which is a ID, so we can delete that column because the algorithm cannot learn anything from it.  

In [54]:
new_data = new_data.drop("car_name", axis=1)
new_data

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,8,307.0,130.0,3504,12.0,70,1,18.0
1,8,350.0,165.0,3693,11.5,70,1,15.0
2,8,318.0,150.0,3436,11.0,70,1,18.0
3,8,304.0,150.0,3433,12.0,70,1,16.0
4,8,302.0,140.0,3449,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1,27.0
394,4,97.0,52.0,2130,24.6,82,2,44.0
395,4,135.0,84.0,2295,11.6,82,1,32.0
396,4,120.0,79.0,2625,18.6,82,1,28.0


1. mpg:           continuous
2. cylinders:     multi-valued discrete
3. displacement:  continuous
4. horsepower:    continuous
5. weight:        continuous
6. acceleration:  continuous
7. model year:    multi-valued discrete
8. origin:        multi-valued discrete
9. car name:      string (unique for each instance)

The "cylinders" and "origin" column has 8 unique values, so we need to use one-hot encoding.

In [57]:
df = pd.get_dummies(
    new_data,
    columns=["origin", "cylinders"],
    drop_first=True,
    dtype=float
)
df

Unnamed: 0,displacement,horsepower,weight,acceleration,model_year,mpg,origin_2,origin_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8
0,307.0,130.0,3504,12.0,70,18.0,0.0,0.0,0.0,0.0,0.0,1.0
1,350.0,165.0,3693,11.5,70,15.0,0.0,0.0,0.0,0.0,0.0,1.0
2,318.0,150.0,3436,11.0,70,18.0,0.0,0.0,0.0,0.0,0.0,1.0
3,304.0,150.0,3433,12.0,70,16.0,0.0,0.0,0.0,0.0,0.0,1.0
4,302.0,140.0,3449,10.5,70,17.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
393,140.0,86.0,2790,15.6,82,27.0,0.0,0.0,1.0,0.0,0.0,0.0
394,97.0,52.0,2130,24.6,82,44.0,1.0,0.0,1.0,0.0,0.0,0.0
395,135.0,84.0,2295,11.6,82,32.0,0.0,0.0,1.0,0.0,0.0,0.0
396,120.0,79.0,2625,18.6,82,28.0,0.0,0.0,1.0,0.0,0.0,0.0


## Tinh toan

We will split the dataset into an 80/20 split, 80% for training and 20% for comparing the algorithm's results.

In [72]:
size = int(len(df) * 0.8)

train_set = df[:size]
test_set = df[size:]

X_train = train_set.drop("mpg", axis=1).to_numpy()
y_train = train_set["mpg"].to_numpy()

X_test = test_set.drop("mpg", axis=1).to_numpy()
y_test = test_set["mpg"].to_numpy()

Formula: $\hat{y} = \mathbf{X}\mathbf{w} + b$

$\mathbf{w} = (\mathbf{X}^T\mathbf{X})^{-1}\,\mathbf{X}^T\mathbf{y}$

In [76]:
one = np.ones((X_train.shape[0], 1))
Xbar = np.concatenate((one, X_train), axis=1)

# calculating weight of the fitting line
A = np.dot(Xbar.T, Xbar)
b = np.dot(Xbar.T, y_train)
w = np.dot(np.linalg.pinv(A), b)

In [84]:
one = np.ones((X_test.shape[0], 1))
Xbar_test = np.concatenate((one, X_test), axis=1)

y_pred = Xbar_test @ w
y_pred

array([25.72616823, 21.15297087, 30.47298582, 29.03687648, 29.94689055,
       30.21340185, 31.47713866, 31.97052628, 27.02584441, 31.77655466,
       31.10533769, 29.63162307, 24.97649069, 25.41796741, 33.76775913,
       31.82501404, 32.65802902, 23.85265881, 22.81082833, 28.97719298,
       31.19782652, 29.41069458, 28.77392457, 28.89787414, 24.50925243,
       30.30771275, 34.54141045, 32.46829272, 34.5804674 , 32.79619732,
       32.95278744, 33.03648449, 33.34021512, 31.27894246, 31.76715912,
       29.53418478, 31.46828532, 32.40489819, 31.40757445, 29.6633351 ,
       29.9876036 , 25.77284084, 22.71824975, 24.84985204, 24.18736198,
       21.32456153, 21.80866045, 23.30100256, 21.7873901 , 28.51304197,
       28.48831083, 29.73961295, 29.24277619, 29.763553  , 28.42377978,
       27.857249  , 32.87383298, 33.39972785, 33.74664369, 32.29772502,
       31.61800967, 32.86158663, 32.97652699, 32.62951408, 34.17415942,
       34.07485298, 33.86112198, 23.56135499, 24.78780562, 29.61

## So sach gia tri du doan voi gia tri that

| R²  | Meaning                 |
| --- | ----------------------- |
| 1.0 | Perfect fit             |
| 0.0 | Same as predicting mean |
| < 0 | Worse than mean         |


In [87]:
# R^2
ss_res = np.sum((y_test - y_pred)**2)
ss_tot = np.sum((y_test - np.mean(y_test))**2)
r2 = 1 - ss_res / ss_tot

comparison = pd.DataFrame({"Predicted values": y_pred, "True value": y_test, "R^2": r2})
comparison

Unnamed: 0,Predicted values,True value,R^2
0,25.726168,24.3,0.307698
1,21.152971,19.1,0.307698
2,30.472986,34.3,0.307698
3,29.036876,29.8,0.307698
4,29.946891,31.3,0.307698
...,...,...,...
74,28.536918,27.0,0.307698
75,31.531123,44.0,0.307698
76,31.482548,32.0,0.307698
77,28.949845,28.0,0.307698
