## Load corpus


In [1]:
import os
from tqdm import tqdm
from corpus import download_and_unzip, catgeories, read_text_files, corpus_root

# Fetch and extract the corpus through the project's helper; the recorded
# output shows it reports when the data is already present.
download_and_unzip()

Already downloaded and extracted!


In [2]:
# Raw review strings and their integer class ids, kept index-aligned.
reviews, labels = [], []

# the previous tokenizers can't be used here, so the raw text is collected as-is
# class id is the position in `catgeories`: 0 -> neg, 1 -> pos
for class_id, category in enumerate(catgeories):
    category_texts = read_text_files(os.path.join(corpus_root, category))

    for position in tqdm(range(len(category_texts)), desc="prepare_corpus"):
        reviews.append(category_texts[position])
        labels.append(class_id)

# Blank line to separate the progress bars from the counts, then both lengths.
print()
for collected in (reviews, labels):
    print(len(collected))

prepare_corpus: 100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 2555943.94it/s]
prepare_corpus: 100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 2557502.44it/s]


2000
2000





## Tokenizer

https://huggingface.co/nvidia/megatron-bert-cased-345m

See the instructions on that page for how to download and convert the checkpoints!

In [3]:
from transformers import BertTokenizer
import torch


# NOTE(review): the recorded output shows a 404 for config.json under this Hub
# repo id, while the model weights are later loaded from a local converted
# checkpoint. Presumably only the vocabulary is fetched here — confirm, or point
# this at the local checkpoint directory instead.
tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')

404 Client Error: Not Found for url: https://huggingface.co/nvidia/megatron-bert-cased-345m/resolve/main/config.json


## Split corpus


In [4]:
from sklearn.model_selection import train_test_split

# Both splits keep 80% on the "train" side and use the same seed.
split_options = {"train_size": 0.8, "random_state": 42}

# Step 1: hold out 20% of the corpus as the test set.
x_trainval, x_test, y_trainval, y_test = train_test_split(reviews, labels, **split_options)

# Step 2: divide the remaining 80% into training and validation data.
x_train, x_val, y_train, y_val = train_test_split(x_trainval, y_trainval, **split_options)

## Dataloaders

In [5]:
import torch
from torch.utils.data import Dataset

# custom dataset
class PolarityReviewDataset(Dataset):
    """Map-style dataset pairing raw review strings with their labels.

    Every item is tokenized on access and padded/truncated to a fixed length,
    so all items have equally sized ``input_ids`` and ``attention_mask``.

    Args:
        reviews: Sequence of review texts.
        labels: Sequence of labels, index-aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` with the keyword interface
            used in ``__getitem__``.
        max_length: Number of tokens each encoded review is padded or
            truncated to. Defaults to 512, the value that used to be
            hard-coded.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=512):
        # Fail early: a mismatch would otherwise silently drop reviews or
        # surface later as an IndexError in the middle of an epoch.
        if len(reviews) != len(labels):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(reviews)} and {len(labels)}"
            )

        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (review, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Encode the review at ``idx``.

        Returns:
            dict with the raw ``"text"``, flattened ``"input_ids"`` and
            ``"attention_mask"`` tensors, and the ``"label"`` as a tensor.
        """
        review = self.reviews[idx]
        label = self.labels[idx]

        # encode review text
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            "text": review,
            # return_tensors="pt" yields a leading batch axis; flatten drops it
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding['attention_mask'].flatten(),
            "label": torch.tensor(label)
        }

# Wrap the training and validation splits; tokenization happens per item on access.
training_dataset = PolarityReviewDataset(x_train, y_train, tokenizer)
val_dataset = PolarityReviewDataset(x_val, y_val, tokenizer)

In [6]:
from torch.utils.data import DataLoader
batch_size = 8

# Batch both custom datasets; only the training data is shuffled.
train_loader = DataLoader(
    dataset=training_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)


## Classifier

In [7]:
from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F

class SentiBERT(nn.Module):
    """Binary sentiment classifier: pretrained encoder + linear head + sigmoid.

    Args:
        model_path: Directory or model identifier handed to
            ``AutoModel.from_pretrained``. NOTE(review): the default is an
            absolute local path; pass your own checkpoint location when the
            notebook runs on a different machine.
    """

    def __init__(self, model_path="/mnt/datadrive/models/nvidia-megatron-bert-cased-345m"):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_path)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so checkpoints with a different hidden width load correctly.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Score each input sequence.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            Sigmoid-activated output of the single-unit linear head applied to
            the encoder's ``pooler_output`` (last dimension has size 1).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output

        logits = self.linear(pooled)
        return self.sigmoid(logits)

In [8]:
# Build the classifier from the default checkpoint path. The recorded warning
# about unused `cls.*` weights is emitted while the checkpoint is loaded.
senti_bert = SentiBERT()

Some weights of the model checkpoint at /mnt/datadrive/models/nvidia-megatron-bert-cased-345m were not used when initializing MegatronBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing MegatronBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MegatronBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Setup device

In [9]:
# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

## Hyperparams

In [10]:
# Step size handed to Adam below.
learning_rate = 2e-5

# Binary cross-entropy on probabilities: SentiBERT.forward already applies a sigmoid.
loss_fn = nn.BCELoss()
# Every parameter of senti_bert is optimized, not only the linear head.
optimizer = torch.optim.Adam(senti_bert.parameters(), lr=learning_rate)

## Send to device

In [11]:
# Move the model onto the device selected in the "Setup device" cell.
senti_bert = senti_bert.to(device)

## Train

In [12]:
import numpy as np

epochs = 5

def _batch_to_device(batch, device):
    """Move one batch's ids, attention mask and labels onto ``device``."""
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    # The loss is computed against floating-point targets.
    targets = batch["label"].long().to(device).float()
    return input_ids, attention_mask, targets


def _predict_probabilities(model, input_ids, attention_mask):
    """Run ``model`` and reduce its output over dim 1 to one value per sample."""
    output = model(input_ids, attention_mask)
    # With a single output unit this max only drops the trailing dimension.
    probabilities, _ = torch.max(output, dim=1)
    return probabilities


def _mean_validation_loss(model, val_loader, loss_fn, device):
    """Average the per-batch loss over ``val_loader``; leaves the model in train mode."""
    batch_losses = []

    model.eval()  # switch mode
    with torch.no_grad():
        for val_batch in val_loader:
            input_ids, attention_mask, targets = _batch_to_device(val_batch, device)
            probabilities = _predict_probabilities(model, input_ids, attention_mask)
            batch_losses.append(loss_fn(probabilities, targets).item())
    model.train()

    return np.mean(batch_losses)


def train(model, train_loader, val_loader, epochs, optimizer, loss_fn, device=None):
    """Train ``model`` and log train/validation loss every 10th optimizer step.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` entries.
        val_loader: Iterable of batches in the same format; it is traversed
            completely at every logging step.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable ``loss_fn(predictions, float_targets)``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter, so the function no longer depends on a
            notebook-global ``device`` variable.
    """
    if device is None:
        device = next(model.parameters()).device

    step = 0  # global step counter; losses are printed on every 10th step

    for epoch in tqdm(range(epochs), desc=f"train_sentibert_for_{epochs}_epochs"):
        model.train()
        for batch in train_loader:
            step += 1

            input_ids, attention_mask, targets = _batch_to_device(batch, device)

            # zero gradients
            model.zero_grad()

            # forward pass
            probabilities = _predict_probabilities(model, input_ids, attention_mask)

            # backprop
            loss = loss_fn(probabilities, targets)
            loss.backward()

            # clip gradients
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # log loss: the train loss is that of the current batch only,
            # the validation loss is the mean over the whole validation loader
            if step % 10 == 0:
                validation_loss = _mean_validation_loss(model, val_loader, loss_fn, device)
                print(f"\nEpoch: {epoch + 1}/{epochs}\tStep: {step}\tTrain Loss: {loss.item()}\tValidation Loss: {validation_loss}")


# Run the full training and report its CPU and wall time once it finishes.
%time train(model=senti_bert, train_loader=train_loader, val_loader=val_loader, epochs=epochs, optimizer=optimizer, loss_fn=loss_fn)

train_sentibert_for_5_epochs:   0%|                                          | 0/5 [00:00<?, ?it/s]


Epoch: 1/5	Step: 10	Train Loss: 0.6384578943252563	Validation Loss: 0.7654413089156151

Epoch: 1/5	Step: 20	Train Loss: 0.45526474714279175	Validation Loss: 0.4054253540933132

Epoch: 1/5	Step: 30	Train Loss: 0.3957778811454773	Validation Loss: 0.2839594041928649

Epoch: 1/5	Step: 40	Train Loss: 0.7888088226318359	Validation Loss: 0.2250348621979356

Epoch: 1/5	Step: 50	Train Loss: 0.07255827635526657	Validation Loss: 0.22101448248140515

Epoch: 1/5	Step: 60	Train Loss: 0.44878897070884705	Validation Loss: 0.22437809528782965

Epoch: 1/5	Step: 70	Train Loss: 0.02579518035054207	Validation Loss: 0.38056821955833586

Epoch: 1/5	Step: 80	Train Loss: 0.009000816382467747	Validation Loss: 0.30855599413625895

Epoch: 1/5	Step: 90	Train Loss: 0.03439226374030113	Validation Loss: 0.3982370660640299

Epoch: 1/5	Step: 100	Train Loss: 0.08050365746021271	Validation Loss: 0.28318022715393454

Epoch: 1/5	Step: 110	Train Loss: 0.4955618679523468	Validation Loss: 0.5586962974397466

Epoch: 1/5	Step:

train_sentibert_for_5_epochs:  20%|██████▌                          | 1/5 [03:22<13:28, 202.10s/it]


Epoch: 1/5	Step: 160	Train Loss: 0.4266587197780609	Validation Loss: 0.19044684803811834

Epoch: 2/5	Step: 170	Train Loss: 0.0037151332944631577	Validation Loss: 0.3470313615107443

Epoch: 2/5	Step: 180	Train Loss: 0.0017260480672121048	Validation Loss: 0.4416032553242985

Epoch: 2/5	Step: 190	Train Loss: 0.0012961787870153785	Validation Loss: 0.295679357307381

Epoch: 2/5	Step: 200	Train Loss: 0.007474170066416264	Validation Loss: 0.45756524102616825

Epoch: 2/5	Step: 210	Train Loss: 0.0026730888057500124	Validation Loss: 0.27063101968960834

Epoch: 2/5	Step: 220	Train Loss: 0.7478232979774475	Validation Loss: 0.4841305878595449

Epoch: 2/5	Step: 230	Train Loss: 0.7829523086547852	Validation Loss: 0.2591828444652492

Epoch: 2/5	Step: 240	Train Loss: 0.2536102831363678	Validation Loss: 0.3216936098295264

Epoch: 2/5	Step: 250	Train Loss: 0.012366145849227905	Validation Loss: 0.24355697403661908

Epoch: 2/5	Step: 260	Train Loss: 0.004059151746332645	Validation Loss: 0.39275085234548895

train_sentibert_for_5_epochs:  40%|█████████████▏                   | 2/5 [06:45<10:07, 202.58s/it]


Epoch: 2/5	Step: 320	Train Loss: 0.24109143018722534	Validation Loss: 0.29874733040342105

Epoch: 3/5	Step: 330	Train Loss: 0.0036745062097907066	Validation Loss: 0.422539401659742

Epoch: 3/5	Step: 340	Train Loss: 0.030459748581051826	Validation Loss: 0.38027080764586574

Epoch: 3/5	Step: 350	Train Loss: 0.001508947229012847	Validation Loss: 0.4959784849605057

Epoch: 3/5	Step: 360	Train Loss: 0.0018257915508002043	Validation Loss: 0.4975769446929917

Epoch: 3/5	Step: 370	Train Loss: 0.0013415286084637046	Validation Loss: 0.43621154439315435

Epoch: 3/5	Step: 380	Train Loss: 0.0017496357904747128	Validation Loss: 0.5001234336115885

Epoch: 3/5	Step: 390	Train Loss: 0.0014330430421978235	Validation Loss: 0.4047204991846229

Epoch: 3/5	Step: 400	Train Loss: 0.0013201485853642225	Validation Loss: 0.3851472534617642

Epoch: 3/5	Step: 410	Train Loss: 0.7979649305343628	Validation Loss: 0.4358283072928316

Epoch: 3/5	Step: 420	Train Loss: 0.002614053897559643	Validation Loss: 0.36098414018

train_sentibert_for_5_epochs:  60%|███████████████████▊             | 3/5 [10:08<06:46, 203.00s/it]


Epoch: 3/5	Step: 480	Train Loss: 0.0067180306650698185	Validation Loss: 0.3903618025186006

Epoch: 4/5	Step: 490	Train Loss: 0.00332189304754138	Validation Loss: 0.3872531276952941

Epoch: 4/5	Step: 500	Train Loss: 0.0025536047760397196	Validation Loss: 0.6031438451609574

Epoch: 4/5	Step: 510	Train Loss: 0.0009486520430073142	Validation Loss: 0.5643175135090133

Epoch: 4/5	Step: 520	Train Loss: 0.0012630544370040298	Validation Loss: 0.4095821157388855

Epoch: 4/5	Step: 530	Train Loss: 0.0009600179619155824	Validation Loss: 0.4796217361668823

Epoch: 4/5	Step: 540	Train Loss: 0.0007993355393409729	Validation Loss: 0.45021703820239056

Epoch: 4/5	Step: 550	Train Loss: 0.0008667829097248614	Validation Loss: 0.45144744407880355

Epoch: 4/5	Step: 560	Train Loss: 0.0008801810909062624	Validation Loss: 0.4749252903588058

Epoch: 4/5	Step: 570	Train Loss: 0.000495489570312202	Validation Loss: 0.5150844374838925

Epoch: 4/5	Step: 580	Train Loss: 0.00030359491938725114	Validation Loss: 0.59842

train_sentibert_for_5_epochs:  80%|██████████████████████████▍      | 4/5 [13:32<03:23, 203.27s/it]


Epoch: 4/5	Step: 640	Train Loss: 0.0009174280567094684	Validation Loss: 0.4812663455733855

Epoch: 5/5	Step: 650	Train Loss: 0.0004180353425908834	Validation Loss: 0.561079595265619

Epoch: 5/5	Step: 660	Train Loss: 0.0005467314040288329	Validation Loss: 0.5731632010480098

Epoch: 5/5	Step: 670	Train Loss: 0.0007849950343370438	Validation Loss: 0.5681535285330028

Epoch: 5/5	Step: 680	Train Loss: 0.00048179522855207324	Validation Loss: 0.5626493672869401

Epoch: 5/5	Step: 690	Train Loss: 0.00016222888370975852	Validation Loss: 0.5510739298035332

Epoch: 5/5	Step: 700	Train Loss: 0.00039711419958621264	Validation Loss: 0.5364415091473347

Epoch: 5/5	Step: 710	Train Loss: 0.00024905893951654434	Validation Loss: 0.4513539612420573

Epoch: 5/5	Step: 720	Train Loss: 0.0003441920271143317	Validation Loss: 0.5132548491452326

Epoch: 5/5	Step: 730	Train Loss: 0.0002616089186631143	Validation Loss: 0.6571962364894717

Epoch: 5/5	Step: 740	Train Loss: 0.00024290703004226089	Validation Loss: 0.5

train_sentibert_for_5_epochs: 100%|█████████████████████████████████| 5/5 [16:56<00:00, 203.23s/it]


Epoch: 5/5	Step: 800	Train Loss: 0.00028998751076869667	Validation Loss: 0.6104785618183086
CPU times: user 15min 5s, sys: 1min 49s, total: 16min 54s
Wall time: 16min 56s





## Inference

In [13]:
# test data is a list of reviews as strings
def classify_sentiment(model, test_data, tokenizer, device=None):
    """Predict a rounded score for every review in ``test_data``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode and left that way.
        test_data: Iterable of review strings.
        tokenizer: Object exposing ``encode_plus`` with the keyword interface
            used below.
        device: Device the encoded inputs are moved to. Defaults to the device
            of the model's first parameter, so the function no longer depends
            on a notebook-global ``device`` variable.

    Returns:
        ``np.ndarray`` holding one rounded prediction per review (0.0 or 1.0
        when the model outputs probabilities).
    """
    if device is None:
        device = next(model.parameters()).device

    predictions = []

    # switch model mode
    model.eval()
    with torch.no_grad():
        # Iterate the reviews directly instead of indexing by position.
        for review in tqdm(test_data, desc="inference"):
            # encode data
            encoded = tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length=512,
                truncation=True,
                return_token_type_ids=False,
                padding="max_length",
                return_attention_mask=True,
                return_tensors="pt"
            )

            # unpack
            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)

            # forward pass, reduced over dim 1 exactly as during training
            pred = model(input_ids, attention_mask)
            pred, _ = torch.max(pred, dim=1)
            # round to the nearest integer
            pred = torch.round(pred.squeeze())

            # no .detach() needed: nothing is tracked inside torch.no_grad()
            predictions.append(pred.cpu().numpy())

    return np.array(predictions)

In [14]:
# Predict labels for the held-out test reviews with the fine-tuned model.
y_pred = classify_sentiment(senti_bert, x_test, tokenizer)

inference: 100%|█████████████████████████████████████████████████| 400/400 [00:10<00:00, 37.46it/s]


In [15]:
# Reshape the predictions into a single column.
y_pred = y_pred.reshape(-1, 1)

In [16]:
# Convert the test labels to an array with the same single-column layout as y_pred.
y_test = np.array(y_test).reshape(-1, 1)

## Evaluation

In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the test labels.
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       199
           1       0.94      0.93      0.93       201

    accuracy                           0.93       400
   macro avg       0.93      0.93      0.93       400
weighted avg       0.93      0.93      0.93       400

