# 1. Tải bộ dữ liệu

In [1]:
!gdown -q 1qiUDDoYyRLBiKOoYWdFl_5WByHE8Cugu

# 2. Import các thư viện cần thiết

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 3. Cài đặt giá trị ngẫu nhiên cố định

In [3]:
random_state = 59
np.random.seed(random_state)
torch.manual_seed(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

# 4. Cài đặt thiết bị tính toán

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 5. Đọc bộ dữ liệu

In [5]:
dataset_path = '/content/Auto_MPG_data.csv'
dataset = pd.read_csv(dataset_path)
dataset

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
0,18.0,8,307.0,130.0,3504.0,12.0,70,0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,0,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,0,0,1
...,...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86.0,2790.0,15.6,82,0,0,1
388,44.0,4,97.0,52.0,2130.0,24.6,82,1,0,0
389,32.0,4,135.0,84.0,2295.0,11.6,82,0,0,1
390,28.0,4,120.0,79.0,2625.0,18.6,82,0,0,1


# 6. Tiền xử lí bộ dữ liệu

In [6]:
X = dataset.drop('MPG', axis=1).values
y = dataset['MPG'].values

In [7]:
val_size = .2
test_size = .125
is_shuffle = True

# Do bên kia làm thế chứ không thực sự muốn đảo lại thứ tự cho hợp logic
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, shuffle=is_shuffle,
                                                  random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=test_size, shuffle=is_shuffle,
                                                random_state=random_state)

In [8]:
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_val = normalizer.transform(X_val)
X_test = normalizer.transform(X_test)

X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# 7. Xây dựng DataLoader

In [9]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [10]:
batch_size = 32
train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 8. Xây dựng mạng MLP

In [11]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        out = self.output(x)
        return out.squeeze(1) # Loại bỏ các chiều có kích thước bằng 1, kiểu [[1, 2, 3]] sẽ thành [1, 2, 3]

In [12]:
input_dims = X.shape[1]
hidden_dims = 64
output_dims = 1

model = MLP(input_dims, hidden_dims, output_dims).to(device)

# 9. Khai báo hàm loss và optimizer

In [13]:
lr = 1e-2
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# 10. Xây dựng hàm tính điểm R2

In [14]:
def r_squared(y_pred, y_true):
    y_true = torch.tensor(y_true, dtype=torch.float32).to(device)
    y_pred = torch.tensor(y_pred, dtype=torch.float32).to(device)
    mean_true = torch.mean(y_true)
    ss_tot = torch.sum((y_true - mean_true) ** 2)
    ss_res = torch.sum((y_true - y_pred) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2

# 11. Huấn luyện mô hình

In [15]:
epochs = 100
train_losses = []
val_losses = []
train_r2 = []
val_r2 = []

for epoch in range(epochs):
    train_loss = 0
    train_target = []
    train_predict = []
    val_loss = 0
    val_target = []
    val_predict = []
    model.train()
    for X_samples, y_samples in train_loader:
        X_samples = X_samples.to(device)
        y_samples = y_samples.to(device)
        optimizer.zero_grad()
        output = model(X_samples)
        train_predict += output.tolist()
        train_target += y_samples.tolist()
        loss = criterion(output, y_samples)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    train_losses.append(train_loss)
    train_r2.append(r_squared(train_predict, train_target))

    model.eval()
    with torch.no_grad():
        for X_samples, y_samples in val_loader:
            X_samples = X_samples.to(device)
            y_samples = y_samples.to(device)
            output = model(X_samples)
            val_predict += output.tolist()
            val_target += y_samples.tolist()
            loss = criterion(output, y_samples)
            val_loss += loss.item()
    val_loss /= len(val_loader)
    val_losses.append(val_loss)
    val_r2.append(r_squared(val_predict, val_target))
    print(f'\nEPOCH {epoch + 1}:\tTraining loss: {train_loss:.3f}\tValidation loss: {val_loss:.3f}')


EPOCH 1:	Training loss: 370.184	Validation loss: 366.810

EPOCH 2:	Training loss: 108.023	Validation loss: 23.386

EPOCH 3:	Training loss: 117.756	Validation loss: 8.579

EPOCH 4:	Training loss: 15.851	Validation loss: 90.674

EPOCH 5:	Training loss: 21.286	Validation loss: 43.918

EPOCH 6:	Training loss: 26.367	Validation loss: 27.560

EPOCH 7:	Training loss: 22.069	Validation loss: 77.718

EPOCH 8:	Training loss: 19.093	Validation loss: 19.518

EPOCH 9:	Training loss: 18.030	Validation loss: 4.561

EPOCH 10:	Training loss: 13.548	Validation loss: 4.855

EPOCH 11:	Training loss: 14.068	Validation loss: 4.925

EPOCH 12:	Training loss: 9.779	Validation loss: 16.828

EPOCH 13:	Training loss: 15.061	Validation loss: 9.306

EPOCH 14:	Training loss: 16.595	Validation loss: 4.251

EPOCH 15:	Training loss: 7.914	Validation loss: 5.058

EPOCH 16:	Training loss: 12.166	Validation loss: 3.816

EPOCH 17:	Training loss: 8.639	Validation loss: 6.018

EPOCH 18:	Training loss: 10.666	Validation loss

# 12. Đánh giá mô hình

In [19]:
model.eval()
with torch.no_grad():
    y_hat = model(X_test.to(device))
    test_set_r2 = r_squared(y_test, y_hat)
    print('Evaluation on test set:')
    print(f'R2 score: {test_set_r2}')

Evaluation on test set:
R2 score: 0.78011554479599


  y_true = torch.tensor(y_true, dtype=torch.float32).to(device)
  y_pred = torch.tensor(y_pred, dtype=torch.float32).to(device)
