## Load corpus


In [1]:
import os
from tqdm import tqdm
from corpus import download_and_unzip, catgeories, read_text_files, corpus_root

# Make sure the corpus is available locally before any file is read.
# NOTE(review): on the recorded run the helper printed "Already downloaded and extracted!",
# so it presumably skips the download when the data exists - confirm in the `corpus` module.
download_and_unzip()

Already downloaded and extracted!


In [2]:
reviews, labels = [], []

# we can't use the previous tokenizers here
# the position of a category in `catgeories` is its label: idx 0 -> neg, 1 -> pos
for idx, cat in enumerate(catgeories):
    texts = read_text_files(os.path.join(corpus_root, cat))

    # keep reviews and labels aligned position by position
    for text in tqdm(texts, desc="prepare_corpus"):
        reviews.append(text)
        labels.append(idx)

# both counts must be identical
print()
print(len(reviews))
print(len(labels))

prepare_corpus: 100%|█████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2718278.68it/s]
prepare_corpus: 100%|█████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2772177.13it/s]


2000
2000





## BERT Tokenizer
https://huggingface.co/transformers/model_doc/bert.html#berttokenizer

https://huggingface.co/transformers/preprocessing.html

In [3]:
from transformers import AutoTokenizer
import torch

# Checkpoint identifier; also used as the default `model_name` of SentiBERT further down
model_name = "bert-base-cased"
# Load the tokenizer published under the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name) 

## Sequence Properties

In [4]:
# Fixed sequence length in tokens: handed to the tokenizer as max_length together with
# truncation=True and padding="max_length", so every encoded review ends up this long.
MAX_LEN=512

## Split corpus


In [5]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all reviews as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    train_size=0.8,
    random_state=42,
)

# Step 2: split the remaining 80% again 80/20 into training and validation data
# (overall roughly 64% train / 16% validation / 20% test).
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.8,
    random_state=42,
)

## Dataloaders

In [6]:
import torch
from torch.utils.data import Dataset

# custom dataset
class PolarityReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Args:
        reviews: Sequence of review texts.
        labels: Sequence of integer class ids, aligned with ``reviews``.
        tokenizer: Callable that accepts the Hugging Face tokenizer keyword
            arguments used in ``__getitem__`` and returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Sequence length every review is truncated/padded to. ``None``
            (default) falls back to the notebook-level ``MAX_LEN`` at item
            access time.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer, max_len=None):
        # Fail early: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader worker, or silently drop trailing reviews.
        if len(reviews) != len(labels):
            raise ValueError(
                "reviews and labels must have the same length, "
                f"got {len(reviews)} and {len(labels)}"
            )

        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and the label for ``idx``."""
        review = self.reviews[idx]
        label = self.labels[idx]

        # The global is only consulted when no explicit length was configured.
        max_len = MAX_LEN if self.max_len is None else self.max_len

        # encode review text; calling the tokenizer directly replaces the
        # deprecated encode_plus and takes the same keyword arguments
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": review,
            # the tokenizer returns a leading batch dimension; flatten it away
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            # class indices for CrossEntropyLoss must be integer (long) tensors
            "label": torch.tensor(label, dtype=torch.long),
        }

# Wrap the training and validation splits; reviews are tokenized when an item is accessed
training_dataset = PolarityReviewDataset(reviews=x_train, labels=y_train, tokenizer=tokenizer)
val_dataset = PolarityReviewDataset(reviews=x_val, labels=y_val, tokenizer=tokenizer)

In [7]:
from torch.utils.data import DataLoader

batch_size = 16

# loaders from the custom datasets
# training batches are drawn in a new random order every epoch
train_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
# validation order has no effect on learning, so keep it fixed for repeatable evaluation
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)


## Classifier

In [8]:
from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F

class SentiBERT(nn.Module):
    """Pretrained encoder with a linear classification head on the pooled output.

    Every encoder parameter whose name does not contain ``"pooler"`` is frozen,
    so only the pooler and the linear head stay trainable.

    Args:
        model_name: Checkpoint name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes; defaults to 2 (neg / pos).
    """

    def __init__(self, model_name=model_name, num_labels=2):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        # freeze all encoder params except the pooler, which is trained with the head
        for name, param in self.bert.named_parameters():
            if "pooler" not in name:
                param.requires_grad = False

        # Take the feature width from the checkpoint config (768 for bert-base)
        # so that other checkpoints work without editing this class.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (``num_labels`` logits per input sequence)."""
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is the encoder's pooled sequence representation
        return self.linear(out.pooler_output)

In [9]:
# Build the classifier with its default checkpoint (the module-level model_name)
senti_bert = SentiBERT()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Accelerate

In [10]:
from accelerate import Accelerator

# Accelerator selects the device for this run (the recorded output shows "cuda")
accelerator = Accelerator()
device = accelerator.device

print(device)

cuda


## Hyperparams

In [11]:
# Step size for Adam
learning_rate = 2e-5

# Cross-entropy over the two class logits produced by SentiBERT
loss_fn = nn.CrossEntropyLoss()
# All parameters are handed to Adam, including those SentiBERT marked requires_grad=False
optimizer = torch.optim.Adam(senti_bert.parameters(), lr=learning_rate)

## Send to device

In [12]:
# Let Accelerate wrap the model, the optimizer and both loaders. The training loop
# never moves batches to a device itself, so it depends on these prepared objects.
senti_bert, optimizer, train_loader, val_loader = accelerator.prepare(
    senti_bert, optimizer, train_loader, val_loader
)

## Train

In [13]:
import numpy as np

# Number of full passes over train_loader
epochs = 2

def _mean_validation_loss(model, val_loader, loss_fn):
    """Return the mean of the per-batch losses over ``val_loader`` (NaN if it is empty)."""
    batch_losses = []

    model.eval()  # switch mode; the caller is responsible for switching back
    with torch.no_grad():
        for val_batch in val_loader:
            out = model(val_batch["input_ids"], val_batch["attention_mask"])
            batch_losses.append(loss_fn(out, val_batch["label"]).item())

    # np.mean([]) would emit a RuntimeWarning; make the empty case explicit
    return float(np.mean(batch_losses)) if batch_losses else float("nan")


def train(model, train_loader, val_loader, epochs, optimizer, loss_fn, accl=accelerator, log_every=10):
    """Train ``model`` and periodically report training and validation loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches with the keys ``"input_ids"``,
            ``"attention_mask"`` and ``"label"``.
        val_loader: Iterable of batches with the same keys, used for validation.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer stepping the model parameters.
        loss_fn: Callable ``loss_fn(output, label)`` returning a scalar tensor.
        accl: Accelerator used for the backward pass and gradient clipping.
        log_every: Run validation and print both losses every ``log_every``
            optimizer steps; ``0`` or ``None`` disables logging.
    """
    step = 0  # counts optimizer steps across all epochs

    for e in tqdm(range(epochs), desc=f"train_sentibert_for_{epochs}_epochs"):
        model.train()
        for batch in train_loader:
            step += 1

            # zero gradients
            model.zero_grad()

            # forward pass
            output = model(batch["input_ids"], batch["attention_mask"])

            # backprop
            loss = loss_fn(output, batch["label"])
            accl.backward(loss)

            # clip gradients through the accelerator so that any gradient scaling
            # it applies is taken into account before the norm is computed
            accl.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # log loss
            if log_every and step % log_every == 0:
                val_loss = _mean_validation_loss(model, val_loader, loss_fn)
                print(f"\nEpoch: {e + 1}/{epochs}\tStep: {step}\tTrain Loss: {loss.item()}\tValidation Loss: {val_loss}")

                # validation left the model in eval mode
                model.train()


# Run the training loop once; the %time magic reports CPU and wall time afterwards
%time train(model=senti_bert, train_loader=train_loader, val_loader=val_loader, epochs=epochs, optimizer=optimizer, loss_fn=loss_fn)

train_sentibert_for_2_epochs:   0%|                                                          | 0/2 [00:00<?, ?it/s]


Epoch: 1/2	Step: 10	Train Loss: 0.7365266680717468	Validation Loss: 0.7560615211725235

Epoch: 1/2	Step: 20	Train Loss: 0.708031177520752	Validation Loss: 0.7108944326639175

Epoch: 1/2	Step: 30	Train Loss: 0.6862413287162781	Validation Loss: 0.6879808068275451

Epoch: 1/2	Step: 40	Train Loss: 0.6274486184120178	Validation Loss: 0.6848599135875701

Epoch: 1/2	Step: 50	Train Loss: 0.6936777830123901	Validation Loss: 0.6847570776939392

Epoch: 1/2	Step: 60	Train Loss: 0.6818102598190308	Validation Loss: 0.6940467536449433

Epoch: 1/2	Step: 70	Train Loss: 0.7365671396255493	Validation Loss: 0.7146527618169785


train_sentibert_for_2_epochs:  50%|█████████████████████████                         | 1/2 [00:34<00:34, 34.89s/it]


Epoch: 1/2	Step: 80	Train Loss: 0.7659740447998047	Validation Loss: 0.7086085021495819

Epoch: 2/2	Step: 90	Train Loss: 0.6519452333450317	Validation Loss: 0.696696075797081

Epoch: 2/2	Step: 100	Train Loss: 0.6862115263938904	Validation Loss: 0.6906106293201446

Epoch: 2/2	Step: 110	Train Loss: 0.6796305775642395	Validation Loss: 0.6829357236623764

Epoch: 2/2	Step: 120	Train Loss: 0.6783004403114319	Validation Loss: 0.6795591175556183

Epoch: 2/2	Step: 130	Train Loss: 0.6807633638381958	Validation Loss: 0.680466291308403

Epoch: 2/2	Step: 140	Train Loss: 0.6979687213897705	Validation Loss: 0.680511337518692

Epoch: 2/2	Step: 150	Train Loss: 0.6860297918319702	Validation Loss: 0.6777825206518173


train_sentibert_for_2_epochs: 100%|██████████████████████████████████████████████████| 2/2 [01:09<00:00, 34.69s/it]


Epoch: 2/2	Step: 160	Train Loss: 0.6737222075462341	Validation Loss: 0.6767363011837005
CPU times: user 1min 9s, sys: 250 ms, total: 1min 9s
Wall time: 1min 9s





## Inference

In [14]:
# test data is a list of reviews as strings
import torch.nn.functional as F

def classify_sentiment(model, test_data, tokenizer, max_len=None, device=None):
    """Predict a class id for every review in ``test_data``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class scores per input.
        test_data: Iterable of review strings.
        tokenizer: Hugging Face tokenizer used to encode each review.
        max_len: Sequence length to truncate/pad to; ``None`` falls back to the
            notebook-level ``MAX_LEN``.
        device: Device the encoded inputs are moved to; ``None`` uses the device
            of the model's first parameter (CPU if the model has none).

    Returns:
        ``np.ndarray`` with one length-1 row per review, i.e. shape ``(n, 1)``
        for ``n > 0`` reviews.
    """
    if max_len is None:
        max_len = MAX_LEN
    if device is None:
        # Follow the model instead of a notebook global so inputs and weights
        # can never end up on different devices.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    prediction = []
    # switch model mode
    model.eval()
    with torch.no_grad():

        for review in tqdm(test_data, desc="inference"):
            # encode data; calling the tokenizer directly replaces the
            # deprecated encode_plus and takes the same keyword arguments
            encoded = tokenizer(
                review,
                add_special_tokens=True,
                max_length=max_len,
                truncation=True,
                return_token_type_ids=False,
                padding="max_length",
                return_attention_mask=True,
                return_tensors="pt",
            )

            # unpack
            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)

            # forward pass; softmax is monotonic, so taking the argmax of the
            # raw scores selects the same class without the extra op
            logits = model(input_ids, attention_mask)
            pred_label = torch.argmax(logits, dim=1)

            # add to list (no detach needed inside no_grad)
            prediction.append(pred_label.cpu().numpy())

    return np.array(prediction)

In [15]:
# Predict a label for every held-out test review with the trained classifier
y_pred = classify_sentiment(senti_bert, x_test, tokenizer)

inference: 100%|█████████████████████████████████████████████████████████████████| 400/400 [00:04<00:00, 90.36it/s]


In [16]:
# Column-vector shape (n, 1), mirroring y_pred, which holds one length-1 array per review
y_test = np.array(y_test).reshape(-1, 1)

## Evaluation

In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions on the test split
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.57      0.79      0.66       199
           1       0.67      0.42      0.51       201

    accuracy                           0.60       400
   macro avg       0.62      0.60      0.59       400
weighted avg       0.62      0.60      0.59       400

