In [1]:
import pandas as pd
from tqdm.auto import tqdm
import random
import torch
from collections import defaultdict
import os
import numpy as np
import random
import matplotlib.pyplot as plt
from transformers import BertTokenizer,BertModel

In [2]:
!pip install lightning
!pip install torchmetrics
!pip install huggingface_hub

Collecting lightning
  Downloading lightning-2.2.5-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading lightning-2.2.5-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lightning
Successfully installed lightning-2.2.5


In [53]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(42)

In [5]:

data = pd.read_csv('/kaggle/input/aspect-data/aspect_based_data.csv')
products = []
for idx,(product, group_data) in tqdm(enumerate(data.groupby("product"))):
    products.append(product)
random.shuffle(products)

train_ratio = 0.75
val_ratio = 0.15

size = len(products)

train = data.loc[data['product'].isin(products[:int(size*train_ratio)])]
val = data.loc[data['product'].isin(products[int(size*train_ratio):int(size*(train_ratio+val_ratio))])]
test = data.loc[data['product'].isin(products[int(size*(train_ratio+val_ratio)):])]

print(f"Number of train products: {len(train.groupby('product'))} --- Size: {len(train)}")
print(f"Number of validation products: {len(val.groupby('product'))} --- Size: {len(val)}")
print(f"Number of test products: {len(test.groupby('product'))} --- Size: {len(test)}")

0it [00:00, ?it/s]

Number of train products: 21 --- Size: 732
Number of validation products: 5 --- Size: 197
Number of test products: 3 --- Size: 71


In [28]:
import torch
from torch.utils.data import Dataset
import numpy as np
class ABSADataset(Dataset):
    def __init__(self, df):
        self.df = self.setup(df)
        
    
    def setup(self,df):
        new_df = []
        for i,row in df.iterrows():
            tokens, tags, pols = row[1:].values
            tokens = tokens.replace("'", "").strip("][").split(', ')
            tags = tags.strip('][').replace("'", "").split(', ')
            pols = pols.strip('][').replace("'", "").split(', ')
            tags = [int(i) for i in tags]
            if sum(tags) != 0 and '0' not in pols:
                new_df.append([tokens,tags,pols])
        return new_df

    def __getitem__(self, idx):
        tokens, tags, pols = self.df[idx]

      
       
        bert_att = []
        pols_labels = []
        start_ids = 0
        end_ids = -1
        for i in range(len(tokens)):
            if int(tags[i]) == 1:
                start_ids = i
                end_ids = i
            elif int(tags[i]) == 2:
                end_ids += 1
            elif int(tags[i]) == 0:
                if start_ids <= end_ids:
                    bert_att.append(tokens[start_ids:end_ids+1])
                    pols_labels.append(int(pols[start_ids]) - 1 )
                    end_ids = -1
        if start_ids <= end_ids:
            bert_att.append(tokens[start_ids:])
            pols_labels.append(int(pols[start_ids]) - 1 ) 
       
        id = np.random.randint(0,len(bert_att)) # in one sentence has one or more than one aspects
        aspect = " ".join(bert_att[id])
        pols_label = pols_labels[id]
       
        
        

        return " ".join(tokens),aspect,torch.tensor(pols_label)

    def __len__(self):
        return len(self.df)

## Build model

In [14]:
from transformers import BertModel
from torch import nn
import torch
from torch.utils.data import Dataset , DataLoader
from huggingface_hub import PyTorchModelHubMixin

class ABSABert(nn.Module,PyTorchModelHubMixin):
    def __init__(self,num_classes = 2):
        super(ABSABert,self).__init__()
        self.feature_extractor = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.head = nn.Linear(768*2,num_classes)
       
    def forward(self, inputs1,inputs2):
        outputs = self.feature_extractor(**inputs1)
        cls_token1 = outputs.last_hidden_state[:,0]
        
        outputs2 = self.feature_extractor(**inputs2)
        cls_token2 = outputs2.last_hidden_state[:,0]
        
        y = self.head(torch.cat([cls_token1,cls_token2],dim = -1)) # bs,d
        return y

In [46]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(samples,tokenizer):
    text = [s[0] for s in samples]
    aspect = [s[1] for s in samples]
    labels = torch.stack([s[2] for s in samples])
    text_inputs = tokenizer(text,padding = True,truncation = True,max_length = 256,return_tensors="pt")
    aspect_inputs = tokenizer(aspect,padding = True,truncation = True,max_length = 256,return_tensors="pt")
    return text_inputs,aspect_inputs,labels


In [47]:
import lightning.pytorch as L
from torchmetrics.functional import accuracy, f1_score
class ABSABert_Module(L.LightningModule):
    def __init__(self,train_data,val_data,test_data,lr,weight_decay,max_epochs,**kwargs):
        self.save_hyperparameters()
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data
        self.lr = lr
        self.weight_decay = weight_decay
        self.max_epochs = max_epochs
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.loss_func = nn.CrossEntropyLoss()
        self.model = ABSABert(num_classes = n_classes)
        self.num_classes = n_classes
    def setup(self, stage: str):
        if stage == "fit":
            self.train_dataset = ABSADataset(
                self.train_data
                )
            self.val_dataset = ABSADataset(
                self.val_data
                )
        if stage == "test":
            self.test_dataset = ABSADataset(
                self.test_data 
                )
    def forward(self,inputs1,inputs2):
        return self.model(inputs1,inputs2)

    def training_step(self, batch, batch_idx):
        inputs1,inputs2, label_ids = batch
        y = label_ids
        y_hat = self(inputs1,inputs2)
        loss = self.loss_func(y_hat, label_ids)       
        y_pred = torch.softmax(y_hat, dim=1)

        # Logging to TensorBoard by default
        self.log("train_loss", loss, prog_bar=True,on_step=True, on_epoch=True)
        self.log("train_acc", accuracy(y_pred, y, task="multiclass", num_classes=self.num_classes,ignore_index = 100), prog_bar=True,on_step=True, on_epoch=True)
        self.log("train_f1", f1_score(y_pred,y,average = 'macro', task="multiclass", num_classes=self.num_classes,ignore_index=100), prog_bar=True,on_step=True, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        inputs1,inputs2, label_ids = batch
      
        y = label_ids
        y_hat = self(inputs1,inputs2)
        loss = self.loss_func(y_hat, label_ids)       
        y_pred = torch.softmax(y_hat, dim=1)

        y_pred = torch.softmax(y_hat, dim=1)
       
        
        # Logging to TensorBoard by default
        self.log("val_loss", loss, prog_bar=True,on_step=False, on_epoch=True)
        self.log("val_acc", accuracy(y_pred, y, task="multiclass", num_classes=self.num_classes,ignore_index=100), prog_bar=True,on_step=False, on_epoch=True)
        self.log("val_f1", f1_score(y_pred, y,average = 'macro', task="multiclass", num_classes=self.num_classes,ignore_index=100), prog_bar=True,on_step=False, on_epoch=True)

        return loss
    def test_step(self, batch, batch_idx):
        # OPTIONAL
        inputs1,inputs2, label_ids = batch
        y = label_ids
        y_hat = self(inputs1,inputs2)
        loss = self.loss_func(y_hat, label_ids)       
        y_pred = torch.softmax(y_hat, dim=1)

        self.log("test_acc", accuracy(y_pred, y, task="multiclass", num_classes=self.num_classes,ignore_index=100), prog_bar=True,on_step=False, on_epoch=True)
        self.log("test_f1", f1_score(y_pred, y,average = 'macro', task="multiclass", num_classes=self.num_classes,ignore_index=100), prog_bar=True,on_step=False, on_epoch=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        if self.max_epochs is not None:
            lr_scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer=optimizer,step_size = 10, gamma=0.7
            )
            return [optimizer], [lr_scheduler]
        else:
            return optimizer
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=16,shuffle = True,collate_fn=lambda batch: collate_fn(batch, self.tokenizer))

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=16,collate_fn=lambda batch: collate_fn(batch, self.tokenizer))

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=16,collate_fn=lambda batch: collate_fn(batch, self.tokenizer))
        

In [48]:
import lightning.pytorch as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from  lightning.pytorch.callbacks import ModelCheckpoint

lr = 1e-5
weight_decay = 1e-4
max_epochs = 60
n_classes = 2 #Positive, Negative

trainer = L.Trainer(
    callbacks=[
        EarlyStopping(monitor="val_loss", mode="min",min_delta=0.00, patience=15,check_finite = True ),
        ModelCheckpoint(dirpath="/kaggle/working/", save_top_k=1, monitor="val_loss",save_last = True)
              ],
    max_epochs=max_epochs,
    accelerator="auto", devices='auto',
    gradient_clip_val=1,
    log_every_n_steps = 1
)
model = ABSABert_Module(train,val,test,lr,weight_decay,max_epochs)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [49]:
trainer.fit(model)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name      | Type             | Params
-----------------------------------------------
0 | loss_func | CrossEntropyLoss | 0     
1 | model     | ABSABert         | 177 M 
-----------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.426   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [50]:
trainer.test(model,ckpt_path = '/kaggle/working/epoch=4-step=185.ckpt')

INFO: Restoring states from the checkpoint path at /kaggle/working/epoch=4-step=185.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at /kaggle/working/epoch=4-step=185.ckpt
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_acc': 0.914893627166748, 'test_f1': 0.8806527256965637}]

In [54]:
model = ABSABert_Module.load_from_checkpoint("/kaggle/working/epoch=4-step=185.ckpt")
# save locally
model.model.save_pretrained("Aspect_Based_Sentiment_Analysis_for_Reviews")

# push to the hub
model.model.push_to_hub("Aspect_Based_Sentiment_Analysis_for_Reviews")

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NCTuanAnh/Aspect_Based_Sentiment_Analysis_for_Reviews/commit/6888255274fe2bdeb2022528e2e98f26e97d8f8e', commit_message='Push model using huggingface_hub.', commit_description='', oid='6888255274fe2bdeb2022528e2e98f26e97d8f8e', pr_url=None, pr_revision=None, pr_num=None)

In [48]:
def predict(model,sentence,aspect ,device = 'cpu'):
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    sentence = sentence.strip().lower()
    aspect = aspect.strip().lower()

    text_inputs = tokenizer(sentence,padding = True,truncation = True,max_length = 256,return_tensors="pt")
    aspect_inputs = tokenizer(aspect,padding = True,truncation = True,max_length = 256,return_tensors="pt")
    
    output = model(text_inputs,aspect_inputs).argmax(dim = -1)
    print(output)
    
    
    
    

In [26]:
model = ABSABert.from_pretrained('NCTuanAnh/Aspect_Based_Sentiment_Analysis_for_Reviews')

In [52]:
predict(model,'Sản phẩm này rất tốt','sản phẩm')
print("1")

tensor([0])
1
