In [1]:
!pip install -q transformers pytorch-lightning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!gdown 1e7iP_EWYV63rOHd-SF-5I1RF7IT2g84t

Downloading...
From: https://drive.google.com/uc?id=1e7iP_EWYV63rOHd-SF-5I1RF7IT2g84t
To: /content/data.csv
  0% 0.00/3.98M [00:00<?, ?B/s]100% 3.98M/3.98M [00:00<00:00, 82.5MB/s]


In [3]:
import torch
import pytorch_lightning as pl
import pandas as pd
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import torch

In [4]:
data = pd.read_csv("/content/data.csv",
                   lineterminator='\n')
print(data.head())

texts = data.normalized_content
texts = texts.to_list()
labels = data.score.to_list()
labels = [float(i)/10.0 for i in labels]
len(data)

                                  normalized_content  score
0                 Very friendly staff. Nice welcome.    8.0
1  staff were very helpful in booking the train o...    8.0
2  It was a superior experience. Accomodation was...   10.0
3  Very helpful owner, any minor issues in the ro...   10.0
4  The staff! They were amazing and so friendly, ...   10.0


14845

In [5]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        assert input_ids.shape[0] == attention_mask.shape[0] == self.max_length

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.float)
        }



In [6]:
MODEL_NAME = "roberta-base"

class BERTClassifier(pl.LightningModule):
    def __init__(self, learning_rate=2e-5):
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.linear1 = torch.nn.Linear(self.bert.config.hidden_size, 50)
        self.linear2 = torch.nn.Linear(50, 10)
        self.linear3 = torch.nn.Linear(10, 1)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        #print(outputs.last_hidden_state.shape)
        linear1 = self.linear1(outputs.last_hidden_state[:, 0])
        linear2 = self.linear2(linear1)
        linear3 = self.linear3(linear2)
        return linear3

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        loss = F.mse_loss(predicted_value, labels)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        val_loss = F.mse_loss(predicted_value, labels)
        self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True)
        return val_loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        test_loss = F.mse_loss(predicted_value, labels)
        self.log("test_loss", test_loss, on_step=False, on_epoch=True, prog_bar=True)
        return test_loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer


class BERTDataModule(pl.LightningDataModule):
    def __init__(self, texts, labels, batch_size, max_length, num_workers):
        super().__init__()
        self.texts = texts
        self.labels = labels
        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.num_workers = num_workers

    def setup(self, stage=None):
        texts_train, texts_val, labels_train, labels_val = train_test_split(self.texts, self.labels, test_size=0.2, random_state=0)
        texts_test, texts_val, labels_test, labels_val = train_test_split(self.texts, self.labels, test_size=0.25, random_state=0)
        self.train_dataset = CustomDataset(texts_train, labels_train, self.tokenizer, self.max_length)
        self.val_dataset = CustomDataset(texts_val, labels_val, self.tokenizer, self.max_length)
        self.test_dataset = CustomDataset(texts_test, labels_test, self.tokenizer, self.max_length)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

In [7]:
batch_size = 48
max_length = 128
num_workers = 1

data_module = BERTDataModule(texts, labels, batch_size, max_length, num_workers)
model = BERTClassifier()

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model_checkpoint = ModelCheckpoint(dirpath='checkpoint/',
                                   monitor="val_loss",
                                   verbose=True,
                                   mode="min",
                                   save_top_k=1
                                   )
early_stopping = EarlyStopping(monitor="val_loss", mode="min", min_delta=1e-4, patience=5)
callbacks = [model_checkpoint, early_stopping]

trainer = pl.Trainer(max_epochs=5, detect_anomaly=True, callbacks=callbacks)
trainer.fit(model, data_module)

INFO:pytorch_lightning.utilities.rank_zero:You have turned on `Trainer(detect_anomaly=True)`. This will significantly slow down compute speed and is recommended only for model debugging.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type         | Params
-----------------------------------------
0 | bert    | RobertaModel | 124 M 
1 | linear1 | Linear       | 38.5 K
2 | linear2 | Linear       | 510   
3 | linear3 | Linear       | 11    
-----------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.738   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
df1 = print(trainer.test(ckpt_path="best", datamodule = data_module))
df1

#Bài tập


1. Dùng bert-base-cased để chạy lại model với input length là 256 trên và so sánh kết quả


In [None]:
from transformers import BertModel, BertTokenizer

In [None]:
MODEL_NAME = "bert-base-cased"

class BERTClassifier(pl.LightningModule):
    def __init__(self, learning_rate=2e-5):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.linear1 = torch.nn.Linear(self.bert.config.hidden_size, 50)
        self.linear2 = torch.nn.Linear(50, 10)
        self.linear3 = torch.nn.Linear(10, 1)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        #print(outputs.last_hidden_state.shape)
        linear1 = self.linear1(outputs.last_hidden_state[:, 0])
        linear2 = self.linear2(linear1)
        linear3 = self.linear3(linear2)
        return linear3

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        loss = F.mse_loss(predicted_value, labels)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        val_loss = F.mse_loss(predicted_value, labels)
        self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True)
        return val_loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        test_loss = F.mse_loss(predicted_value, labels)
        self.log("test_loss", test_loss, on_step=False, on_epoch=True, prog_bar=True)
        return test_loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer


class BERTDataModule(pl.LightningDataModule):
    def __init__(self, texts, labels, batch_size, max_length, num_workers):
        super().__init__()
        self.texts = texts
        self.labels = labels
        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
        self.num_workers = num_workers

    def setup(self, stage=None):
        texts_train, texts_val, labels_train, labels_val = train_test_split(self.texts, self.labels, test_size=0.2, random_state=0)
        texts_test, texts_val, labels_test, labels_val = train_test_split(self.texts, self.labels, test_size=0.25, random_state=0)
        self.train_dataset = CustomDataset(texts_train, labels_train, self.tokenizer, self.max_length)
        self.val_dataset = CustomDataset(texts_val, labels_val, self.tokenizer, self.max_length)
        self.test_dataset = CustomDataset(texts_test, labels_test, self.tokenizer, self.max_length)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

In [None]:
batch_size = 48
max_length = 256
num_workers = 1

data_module = BERTDataModule(texts, labels, batch_size, max_length, num_workers)
model = BERTClassifier()

In [None]:
model_checkpoint = ModelCheckpoint(dirpath='checkpoint/',
                                   monitor="val_loss",
                                   verbose=True,
                                   mode="min",
                                   save_top_k=1
                                   )
early_stopping = EarlyStopping(monitor="val_loss", mode="min", min_delta=1e-4, patience=5)
callbacks = [model_checkpoint, early_stopping]

trainer = pl.Trainer(max_epochs=5, detect_anomaly=True, callbacks=callbacks)
trainer.fit(model, data_module)

In [None]:
df2 = print(trainer.test(ckpt_path="best", datamodule = data_module))
df2

2. Thay đổi các lớp FC cuối cùng theo thứ tự sau 100, 50, 20, 10.

###Áp dụng trên model Bert-case

In [None]:
MODEL_NAME = "bert-base-cased"

class BERTClassifier(pl.LightningModule):
    def __init__(self, learning_rate=2e-5):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.linear1 = torch.nn.Linear(self.bert.config.hidden_size, 100)
        self.linear2 = torch.nn.Linear(100, 50)
        self.linear3 = torch.nn.Linear(50, 20)
        self.linear4 = torch.nn.Linear(20, 10)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        #print(outputs.last_hidden_state.shape)
        linear1 = self.linear1(outputs.last_hidden_state[:, 0])
        linear2 = self.linear2(linear1)
        linear3 = self.linear3(linear2)
        linear4 = self.linear4(linear3)
        return linear3

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        loss = F.mse_loss(predicted_value, labels)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        val_loss = F.mse_loss(predicted_value, labels)
        self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True)
        return val_loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = torch.reshape(batch['label'], (-1, 1))

        predicted_value = self(input_ids, attention_mask)
        test_loss = F.mse_loss(predicted_value, labels)
        self.log("test_loss", test_loss, on_step=False, on_epoch=True, prog_bar=True)
        return test_loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer


class BERTDataModule(pl.LightningDataModule):
    def __init__(self, texts, labels, batch_size, max_length, num_workers):
        super().__init__()
        self.texts = texts
        self.labels = labels
        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
        self.num_workers = num_workers

    def setup(self, stage=None):
        texts_train, texts_val, labels_train, labels_val = train_test_split(self.texts, self.labels, test_size=0.2, random_state=0)
        texts_test, texts_val, labels_test, labels_val = train_test_split(self.texts, self.labels, test_size=0.25, random_state=0)
        self.train_dataset = CustomDataset(texts_train, labels_train, self.tokenizer, self.max_length)
        self.val_dataset = CustomDataset(texts_val, labels_val, self.tokenizer, self.max_length)
        self.test_dataset = CustomDataset(texts_test, labels_test, self.tokenizer, self.max_length)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

In [None]:
batch_size = 32
max_length = 256
num_workers = 1

data_module = BERTDataModule(texts, labels, batch_size, max_length, num_workers)
model = BERTClassifier()

In [None]:
model_checkpoint = ModelCheckpoint(dirpath='checkpoint/',
                                   monitor="val_loss",
                                   verbose=True,
                                   mode="min",
                                   save_top_k=1
                                   )
early_stopping = EarlyStopping(monitor="val_loss", mode="min", min_delta=1e-4, patience=5)
callbacks = [model_checkpoint, early_stopping]

trainer = pl.Trainer(max_epochs=5, detect_anomaly=True, callbacks=callbacks)
trainer.fit(model, data_module)

In [None]:
df3 = print(trainer.test(ckpt_path="best", datamodule = data_module))
df3