## Load corpus


In [1]:
import os
from tqdm import tqdm
from corpus import download_and_unzip, catgeories, read_text_files, corpus_root

# Make sure the review corpus is available locally before reading it.
# NOTE(review): judging by the recorded output, the helper skips the work when
# the data is already present — confirm in corpus.py.
download_and_unzip()

Already downloaded and extracted!


In [2]:
# Raw review strings and their integer class ids, kept index-aligned.
reviews, labels = [], []

# The label of a review is the position of its category in `catgeories`
# (0 -> neg, 1 -> pos). The texts are stored untokenized because the
# previously used tokenizers can't be applied here.
for label_id, category in enumerate(catgeories):
    category_texts = read_text_files(os.path.join(corpus_root, category))

    for position in tqdm(range(len(category_texts)), desc="prepare_corpus"):
        reviews.append(category_texts[position])
        labels.append(label_id)

# Sanity check: both lists must report the same length.
print()
print(len(reviews))
print(len(labels))

prepare_corpus: 100%|██████████| 1000/1000 [00:00<00:00, 1950839.07it/s]
prepare_corpus: 100%|██████████| 1000/1000 [00:00<00:00, 2468689.82it/s]


2000
2000





## Tokenizer
Pretrained checkpoint used for both the tokenizer and the classifier backbone: https://huggingface.co/microsoft/xtremedistil-l6-h384-uncased

In [3]:
from transformers import AutoTokenizer
import torch

# Hugging Face checkpoint id; also used further down as the default backbone
# name of the SentiBERT classifier.
model_name = "microsoft/xtremedistil-l6-h384-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name) 

  from .autonotebook import tqdm as notebook_tqdm


## Split corpus


In [4]:
from sklearn.model_selection import train_test_split

# First hold out 20% of the corpus as the test set ...
x_train, x_test, y_train, y_test = train_test_split(
    reviews, labels, train_size=0.8, random_state=42
)

# ... then split the remaining 80% again with the same ratio, which gives
# 64% train / 16% validation / 20% test overall. The fixed seed keeps both
# splits reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, train_size=0.8, random_state=42
)

## Dataloaders

In [5]:
import torch
from torch.utils.data import Dataset

# custom dataset
class PolarityReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review on access.

    Args:
        reviews: Indexable collection of raw review strings.
        labels: Indexable collection of class ids, aligned with ``reviews``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every review is truncated/padded to. Defaults to
            512, the value that was previously hard-coded.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=512):
        if len(reviews) != len(labels):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(reviews)} and {len(labels)}"
            )
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (review, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the raw text, encoded inputs and label tensor of item ``idx``."""
        review = self.reviews[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # Transformers in favour of `__call__`, which takes the same options.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "text": review,
            # return_tensors="pt" adds a leading batch axis of size 1;
            # flatten it away so the DataLoader can stack the items itself.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label),
        }

# Wrap the training and validation splits; both share the same tokenizer.
# The test split is not wrapped here — it is tokenized inside the inference function.
training_dataset = PolarityReviewDataset(x_train, y_train, tokenizer)
val_dataset = PolarityReviewDataset(x_val, y_val, tokenizer)

In [6]:
from torch.utils.data import DataLoader
# Number of reviews per batch, used by both loaders.
batch_size = 32

# Build the loaders from the custom datasets. Only the training data is
# shuffled; the validation order stays fixed.
train_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)


## Classifier

In [7]:
from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F

class SentiBERT(nn.Module):
    """Pretrained transformer encoder with a single linear classification head.

    Args:
        model_name: Checkpoint id/path passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output classes of the linear head (default 2).

    ``forward`` returns raw, unnormalized logits (no softmax is applied), which
    is what ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self, model_name=model_name, num_classes=2):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        # Take the head's input width from the backbone config rather than
        # hard-coding 384, so a different checkpoint does not break the model.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and classify its pooled representation.

        Args:
            input_ids: Token ids — assumed (batch, seq_len), TODO confirm.
            attention_mask: Padding mask with the same shape as ``input_ids``.

        Returns:
            Logits produced by the linear head from the encoder's
            ``pooler_output`` — presumably (batch, num_classes).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.linear(encoded.pooler_output)

In [8]:
# Instantiate the classifier with its default backbone (`model_name`).
senti_bert = SentiBERT()

## Setup device

In [9]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# To force CPU execution (e.g. for debugging), use this line instead:
# device = torch.device("cpu")
device

device(type='cuda', index=0)

## Hyperparams

In [10]:
# Step size for the Adam optimizer below.
learning_rate = 2e-5

# Cross-entropy over the raw two-class logits returned by the classifier.
loss_fn = nn.CrossEntropyLoss()
# NOTE(review): the optimizer is constructed before the model is moved to
# `device` (that happens in the next cell). PyTorch's docs recommend moving the
# model first — confirm this ordering is intended.
optimizer = torch.optim.Adam(senti_bert.parameters(), lr=learning_rate)

## Send to device

In [11]:
# Move all model parameters and buffers to the selected device.
senti_bert = senti_bert.to(device)

## Train

In [12]:
import numpy as np

# Number of full passes over the training loader.
epochs = 5

def _batch_to_device(batch, device):
    """Extract model inputs and targets from a collated batch and move them to ``device``."""
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    label = batch["label"].long().to(device)
    return input_ids, attention_mask, label


def _mean_validation_loss(model, val_loader, loss_fn, device):
    """Return the mean per-batch loss over ``val_loader`` and put ``model`` back in train mode."""
    batch_losses = []

    model.eval()  # disable dropout etc. while measuring
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, label = _batch_to_device(batch, device)
            logits = model(input_ids, attention_mask)
            batch_losses.append(loss_fn(logits, label).item())
    model.train()

    # An empty loader has no mean; report NaN without numpy's RuntimeWarning.
    return np.mean(batch_losses) if batch_losses else float("nan")


def train(model, train_loader, val_loader, epochs, optimizer, loss_fn, log_every=10, device=None):
    """Fine-tune ``model`` and periodically print training and validation loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits accepted by ``loss_fn``.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        val_loader: Iterable of batches in the same format, evaluated in full
            at every logging step.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        log_every: Print the losses every ``log_every`` optimizer steps
            (counted across epochs). ``0``/``None`` disables logging.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter, so no notebook-global is required.

    Returns:
        None. Progress is reported on stdout.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    step = 0  # global optimizer-step counter, not reset between epochs

    for e in tqdm(range(epochs), desc=f"train_sentibert_for_{epochs}_epochs"):
        model.train()
        for batch in train_loader:
            step += 1
            input_ids, attention_mask, label = _batch_to_device(batch, device)

            # Zero the gradients of *all* model parameters (not only the ones
            # the optimizer owns), because the clipping below also spans
            # model.parameters().
            model.zero_grad()

            # forward pass and backprop
            output = model(input_ids, attention_mask)
            loss = loss_fn(output, label)
            loss.backward()

            # clip gradients to keep the update size bounded
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # log the current batch loss together with the full validation loss
            if log_every and step % log_every == 0:
                val_loss = _mean_validation_loss(model, val_loader, loss_fn, device)
                print(f"\nEpoch: {e + 1}/{epochs}\tStep: {step}\tTrain Loss: {loss.item()}\tValidation Loss: {val_loss}")


# Fine-tune the classifier; train and validation loss are printed every 10 steps.
train(model=senti_bert, train_loader=train_loader, val_loader=val_loader, epochs=epochs, optimizer=optimizer, loss_fn=loss_fn)

train_sentibert_for_5_epochs:   0%|          | 0/5 [00:00<?, ?it/s]


Epoch: 1/5	Step: 10	Train Loss: 0.6805471181869507	Validation Loss: 0.6869428873062133

Epoch: 1/5	Step: 20	Train Loss: 0.6732557415962219	Validation Loss: 0.6442213535308838

Epoch: 1/5	Step: 30	Train Loss: 0.5607035160064697	Validation Loss: 0.5725669503211975


train_sentibert_for_5_epochs:  20%|██        | 1/5 [00:14<00:57, 14.48s/it]


Epoch: 1/5	Step: 40	Train Loss: 0.4713623523712158	Validation Loss: 0.5134229212999344

Epoch: 2/5	Step: 50	Train Loss: 0.34507638216018677	Validation Loss: 0.4737884044647217

Epoch: 2/5	Step: 60	Train Loss: 0.44302549958229065	Validation Loss: 0.44965456426143646

Epoch: 2/5	Step: 70	Train Loss: 0.45069530606269836	Validation Loss: 0.4235058814287186


train_sentibert_for_5_epochs:  40%|████      | 2/5 [00:28<00:42, 14.19s/it]


Epoch: 2/5	Step: 80	Train Loss: 0.22579744458198547	Validation Loss: 0.4012142479419708

Epoch: 3/5	Step: 90	Train Loss: 0.36880138516426086	Validation Loss: 0.3908797696232796

Epoch: 3/5	Step: 100	Train Loss: 0.33508363366127014	Validation Loss: 0.3885417491197586

Epoch: 3/5	Step: 110	Train Loss: 0.2599605321884155	Validation Loss: 0.3921687662601471


train_sentibert_for_5_epochs:  60%|██████    | 3/5 [00:42<00:28, 14.08s/it]


Epoch: 3/5	Step: 120	Train Loss: 0.18885493278503418	Validation Loss: 0.4060101956129074

Epoch: 4/5	Step: 130	Train Loss: 0.3238339424133301	Validation Loss: 0.4041069567203522

Epoch: 4/5	Step: 140	Train Loss: 0.416932612657547	Validation Loss: 0.39546954780817034

Epoch: 4/5	Step: 150	Train Loss: 0.23466792702674866	Validation Loss: 0.37453789114952085


train_sentibert_for_5_epochs:  80%|████████  | 4/5 [00:56<00:14, 14.02s/it]


Epoch: 4/5	Step: 160	Train Loss: 0.17803679406642914	Validation Loss: 0.37731116116046903

Epoch: 5/5	Step: 170	Train Loss: 0.26318785548210144	Validation Loss: 0.3693386033177376

Epoch: 5/5	Step: 180	Train Loss: 0.11945859342813492	Validation Loss: 0.36980053782463074

Epoch: 5/5	Step: 190	Train Loss: 0.19885681569576263	Validation Loss: 0.36510064750909804


train_sentibert_for_5_epochs: 100%|██████████| 5/5 [01:10<00:00, 14.04s/it]


Epoch: 5/5	Step: 200	Train Loss: 0.20471414923667908	Validation Loss: 0.40471142828464507





## Inference

In [15]:
# test data is a list of reviews as strings
def classify_sentiment(model, test_data, tokenizer, batch_size=1, max_length=512, device=None):
    """Predict a class id for every review in ``test_data``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores with the classes on the last axis.
        test_data: Indexable, sized collection of review strings.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts a list of strings and returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        batch_size: Number of reviews per forward pass. The default of 1
            reproduces the previous one-review-at-a-time behaviour.
        max_length: Length every review is truncated/padded to (default 512).
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter, so no notebook-global is required.

    Returns:
        ``np.ndarray`` of shape ``(len(test_data), 1)`` holding the index of the
        highest-scoring class per review (``(0, 1)`` for empty input).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total = len(test_data)
    predicted = []

    # switch model mode; gradients are not needed for inference
    model.eval()
    with torch.no_grad():
        for start in tqdm(range(0, total, batch_size), desc="inference"):
            stop = min(start + batch_size, total)
            # Index item by item so any indexable container keeps working.
            batch_reviews = [test_data[i] for i in range(start, stop)]

            # Call the tokenizer directly: `encode_plus` is deprecated in
            # Transformers in favour of `__call__`.
            encoded = tokenizer(
                batch_reviews,
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,
                return_token_type_ids=False,
                padding="max_length",
                return_attention_mask=True,
                return_tensors="pt",
            )

            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)

            # forward pass; the predicted class is the arg-max over the scores
            scores = model(input_ids, attention_mask)
            predicted.append(scores.argmax(dim=-1).cpu().numpy())

    if not predicted:
        return np.empty((0, 1), dtype=np.int64)
    return np.concatenate(predicted).reshape(-1, 1)

In [16]:
# Predict a class id for every review of the held-out test split.
y_pred = classify_sentiment(senti_bert, x_test, tokenizer)

inference: 100%|██████████| 400/400 [00:01<00:00, 219.29it/s]


In [17]:
# Make the column layout explicit: one row per review, a single column.
y_pred = y_pred.reshape(-1, 1)

In [18]:
# Bring the true labels into the same (n, 1) layout as the predictions.
y_test = np.array(y_test).reshape(-1, 1)

## Evaluation

In [19]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 plus overall accuracy on the test split.
# print() is needed because the report is a pre-formatted multi-line string.
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84       199
           1       0.84      0.85      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

