In [1]:

import os
from tqdm import tqdm
from corpus import download_and_unzip, catgeories, read_text_files, corpus_root

download_and_unzip()

Already downloaded and extracted!


In [2]:
import numpy as np

reviews = []
labels = []

# we can't use the previous tokenizers here
# idx 0 -> neg, 1 -> pos
for idx, cat in enumerate(catgeories):
    path = os.path.join(corpus_root, cat)
    texts = read_text_files(path)

    for i in tqdm(range(len(texts)), desc="prepare_corpus"):
        text = texts[i]
        reviews.append(text)
        labels.append(idx)

prepare_corpus: 100%|██████████| 1000/1000 [00:00<00:00, 2584290.82it/s]
prepare_corpus: 100%|██████████| 1000/1000 [00:00<00:00, 2318576.01it/s]


In [3]:
import torch
from transformers import LongformerTokenizer, LongformerModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pretrained_model_name = "allenai/longformer-base-4096"

tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name)

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    reviews, labels, random_state=42, train_size=0.8
)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.8, random_state=42)

In [6]:
import torch
from torch.utils.data import Dataset

# custom dataset
class PolarityReviewDataset(Dataset):

    def __init__(self, reviews, labels, tokenizer):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        label = torch.tensor(label)
        label = torch.nn.functional.one_hot(label, num_classes=2)

        # encode review text
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=1600,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            "text": review,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding['attention_mask'].flatten(),
            "label": label.float()
        }

In [7]:
training_dataset = PolarityReviewDataset(x_train, y_train, tokenizer)
val_dataset = PolarityReviewDataset(x_val, y_val, tokenizer)

In [8]:
from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.optim import Adam

class SentiBERT(pl.LightningModule):
    def __init__(self, model_path=pretrained_model_name):
        super(SentiBERT, self).__init__()

        self.longformer = LongformerModel.from_pretrained(model_path)
        self.linear = nn.Linear(768, 2) 
        self.softmax = nn.Softmax(dim=-1)
        
        
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        out = self.longformer(input_ids=input_ids, attention_mask=attention_mask)
        out = out.pooler_output
        
        out = self.linear(out)
        out = self.softmax(out)

        return out
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=2e-5)
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attn_masks = batch["attention_mask"]
        labels = batch["label"]
        
        logits = self(input_ids, attn_masks)
        loss = self.criterion(logits, labels)
        
        logs = {"train_loss": loss}
        return {
            "loss": loss,
            "log": logs
        } 
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attn_masks = batch["attention_mask"]
        labels = batch["label"]
        
        logits = self(input_ids, attn_masks)
        loss = self.criterion(logits, labels)
        
        self.log("val_loss", loss, prog_bar=True)
    
    
senti_bert = SentiBERT()
# test with sample input
sample_inp = tokenizer.encode_plus("This is a sample text", return_tensors="pt")
logits = senti_bert(**sample_inp)

logits

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[0.5367, 0.4633]], grad_fn=<SoftmaxBackward0>)

In [9]:
from torch.utils.data import DataLoader

batch_size = 8
torch.backends.cudnn.benchmark = True

# loader from custom dataset
train_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=4,
    precision=16,
    log_every_n_steps=10)

trainer.fit(senti_bert, train_loader, val_loader)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | longformer | LongformerModel  | 148 M 
1 | linear     | Linear           | 1.5 K 
2 | softmax    | Softmax          | 0     
3 | criterion  | CrossEntropyLoss | 0     
------------------------------------------------
148 M     Trainable params
0         Non-trainable params
148 M     Total params
297.322   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 3: 100%|██████████| 200/200 [02:38<00:00,  1.26it/s, loss=0.409, v_num=1, val_loss=0.414]


In [36]:
# test data is a list of reviews as strings
def classify_sentiment(model, test_data, tokenizer):
    prediction = []
    # switch model mode
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()
    with torch.no_grad():

        for i in tqdm(range(len(test_data)), desc="inference"):
            review = test_data[i]
            
            # encode data
            encoded = tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length=1600,
                truncation=True,
                return_token_type_ids=False,
                padding="max_length",
                return_attention_mask=True,
                return_tensors="pt"
            )
            
            # unpack
            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)
            
            
            # forward pass
            out = model(input_ids, attention_mask)
            _, idx = torch.max(out, dim=-1)
            # dear pytorch team, find a easier wrapper please!
            pred = idx.detach().cpu().numpy()
            
            # add to list
            prediction.append(pred) 
            
    return np.array(prediction)

In [35]:
# x = torch.Tensor([[0.5367, 0.4633]])
# v, idx = torch.max(x, dim=-1)

# idx.numpy()

array([0])

In [37]:
y_pred = classify_sentiment(senti_bert, x_test, tokenizer)

inference: 100%|██████████| 400/400 [00:20<00:00, 19.84it/s]


In [38]:
y_pred[:10]

array([[1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1]])

In [17]:
# y_pred = y_pred.reshape(-1, 1)
# y_pred.shape

(400, 1)

In [39]:
y_test = np.array(y_test).reshape(-1, 1)
y_test[:10]

array([[1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1]])

In [40]:
from sklearn.metrics import classification_report

print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       199
           1       0.87      0.93      0.90       201

    accuracy                           0.90       400
   macro avg       0.90      0.89      0.89       400
weighted avg       0.90      0.90      0.89       400

