## Load corpus


In [1]:
import os
from tqdm import tqdm
from corpus import download_and_unzip, catgeories, read_text_files, corpus_root

download_and_unzip()

Already downloaded and extracted!


In [2]:
reviews = []
labels = []

# we can't use the previous tokenizers here
# idx 0 -> neg, 1 -> pos
for idx, cat in enumerate(catgeories):
    path = os.path.join(corpus_root, cat)
    texts = read_text_files(path)

    for i in tqdm(range(len(texts)), desc="prepare_corpus"):
        text = texts[i]
        reviews.append(text)
        labels.append(idx)

print()
print(len(reviews))
print(len(labels))

prepare_corpus: 100%|█████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1000549.62it/s]
prepare_corpus: 100%|██████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 996982.17it/s]


2000
2000





## BERT Tokenizer
https://huggingface.co/transformers/model_doc/bert.html#berttokenizer

https://huggingface.co/transformers/preprocessing.html

In [3]:
from transformers import AutoTokenizer
import torch

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name) 

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Sequence Properties

In [4]:
MAX_LEN=512

## Split corpus


In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    reviews, labels, random_state=42, train_size=0.8
)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.8, random_state=42)

## Dataloaders

In [6]:
import torch
from torch.utils.data import Dataset

# custom dataset
class PolarityReviewDataset(Dataset):

    def __init__(self, reviews, labels, tokenizer):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]

        # encode review text
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            "text": review,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding['attention_mask'].flatten(),
            "label": torch.tensor(label)
        }

training_dataset = PolarityReviewDataset(x_train, y_train, tokenizer)
val_dataset = PolarityReviewDataset(x_val, y_val, tokenizer)

In [7]:
from torch.utils.data import DataLoader
import multiprocessing

usable_cpu_cores = multiprocessing.cpu_count() - 1 # keep 1 free to keep system responsive

batch_size = 16

# loader from custom dataset
train_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)


## Classifier

In [8]:
from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F

class SentiBERT(nn.Module):
    def __init__(self, model_name=model_name):
        super(SentiBERT, self).__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        # freeze bert params
        for param in self.bert.parameters():
            param.requires_grad = False
        
        
        self.linear = nn.Linear(768, 1) # 768 for BERT, 1 for binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        out = out.pooler_output
        
        out = self.linear(out)
        out = self.sigmoid(out)

        return out

In [9]:
senti_bert = SentiBERT()

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

## Accelerate

In [10]:
from accelerate import Accelerator

accelerator = Accelerator()
device = accelerator.device

print(device)

cuda


## Hyperparams

In [11]:
learning_rate = 2e-5

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(senti_bert.parameters(), lr=learning_rate)

## Send to device

In [12]:
senti_bert, optimizer, train_loader, val_loader = accelerator.prepare(
    senti_bert, optimizer, train_loader, val_loader
)

## Train

In [13]:
import numpy as np

epochs = 2

def train(model, train_loader, val_loader, epochs, optimizer, loss_fn, accl=accelerator):
    print_counter = 0 # print loss for each 10th count

    for e in tqdm(range(epochs), desc=f"train_sentibert_for_{epochs}_epochs"):
        model.train()
        for td in train_loader:
            print_counter += 1

            # unpack data
            input_ids = td["input_ids"]
            attention_mask = td["attention_mask"]
            label = td["label"]

            # zero gradients
            model.zero_grad()

            # forward pass
            output = model(input_ids, attention_mask)
            # the max probability based class
            output, _ = torch.max(output, dim=1)

            # backprop
            loss = loss_fn(output, label.float())
            accl.backward(loss)
            
            #clip gradients
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()

            # log loss 
            if print_counter % 10 == 0:
                validation_losses = []
                
                model.eval() # switch mode
                with torch.no_grad():
                    for td in val_loader:
                        # unpack data
                        input_ids = td["input_ids"]
                        attention_mask = td["attention_mask"]
                        label = td["label"]
                        
                        # repeat same steps from forward pass
                        out = model(input_ids, attention_mask)
                        out, _ = torch.max(out, dim=1)
                        val_loss = loss_fn(out, label.float())
                        
                        # add loss to validation losses
                        validation_losses.append(val_loss.item())
                    print(f"\nEpoch: {e + 1}/{epochs}\tStep: {print_counter}\tTrain Loss: {loss.item()}\tValidation Loss: {np.mean(validation_losses)}")

                model.train()


%time train(model=senti_bert, train_loader=train_loader, val_loader=val_loader, epochs=epochs, optimizer=optimizer, loss_fn=loss_fn)




Epoch: 1/2	Step: 10	Train Loss: 0.6906414031982422	Validation Loss: 0.6931169837713241

Epoch: 1/2	Step: 20	Train Loss: 0.6965962648391724	Validation Loss: 0.6933853656053544

Epoch: 1/2	Step: 30	Train Loss: 0.6966907382011414	Validation Loss: 0.6934567987918854

Epoch: 1/2	Step: 40	Train Loss: 0.6954180002212524	Validation Loss: 0.6935926228761673

Epoch: 1/2	Step: 50	Train Loss: 0.6958193778991699	Validation Loss: 0.6935993313789368

Epoch: 1/2	Step: 60	Train Loss: 0.694819450378418	Validation Loss: 0.6936604231595993

Epoch: 1/2	Step: 70	Train Loss: 0.694745659828186	Validation Loss: 0.6939232647418976


train_sentibert_for_2_epochs:  50%|█████████████████████████████                             | 1/2 [00:31<00:31, 31.62s/it]


Epoch: 1/2	Step: 80	Train Loss: 0.6986194849014282	Validation Loss: 0.6941703796386719

Epoch: 2/2	Step: 90	Train Loss: 0.6883164644241333	Validation Loss: 0.693990159034729

Epoch: 2/2	Step: 100	Train Loss: 0.6921550035476685	Validation Loss: 0.6941829115152359

Epoch: 2/2	Step: 110	Train Loss: 0.6983454823493958	Validation Loss: 0.6943405270576477

Epoch: 2/2	Step: 120	Train Loss: 0.6966814994812012	Validation Loss: 0.6945073157548904

Epoch: 2/2	Step: 130	Train Loss: 0.6908498406410217	Validation Loss: 0.6944680720567703

Epoch: 2/2	Step: 140	Train Loss: 0.6905251145362854	Validation Loss: 0.6947894126176835

Epoch: 2/2	Step: 150	Train Loss: 0.7020124197006226	Validation Loss: 0.6948093801736832


train_sentibert_for_2_epochs: 100%|██████████████████████████████████████████████████████████| 2/2 [01:02<00:00, 31.29s/it]


Epoch: 2/2	Step: 160	Train Loss: 0.6929541826248169	Validation Loss: 0.6944562345743179
Wall time: 1min 2s





## Inference

In [14]:
# test data is a list of reviews as strings
def classify_sentiment(model, test_data, tokenizer):
    prediction = []
    # switch model mode
    model.eval()
    with torch.no_grad():

        for i in tqdm(range(len(test_data)), desc="inference"):
            review = test_data[i]
            
            # encode data
            encoded = tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length=MAX_LEN,
                truncation=True,
                return_token_type_ids=False,
                pad_to_max_length=True,
                return_attention_mask=True,
                return_tensors="pt"
            )
            
            # unpack
            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)
            
            
            # forward pass
            pred = model(input_ids, attention_mask)
            pred, _ = torch.max(pred, dim=1)
            # round to the nearest integer
            pred =  torch.round(pred.squeeze())
            
            
            # add to list
            prediction.append(pred.cpu().detach().numpy()) # dear pytorch team, find a easier wrapper please!
            
    return np.array(prediction)

In [15]:
y_pred = classify_sentiment(senti_bert, x_test, tokenizer)

inference: 100%|█████████████████████████████████████████████████████████████████████████| 400/400 [00:04<00:00, 83.97it/s]


In [16]:
y_pred = y_pred.reshape(-1, 1)

In [17]:
y_test = np.array(y_test).reshape(-1, 1)

## Evaluation

In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       199
           1       0.50      1.00      0.67       201

    accuracy                           0.50       400
   macro avg       0.25      0.50      0.33       400
weighted avg       0.25      0.50      0.34       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
