## Bert multi-class

In [1]:
# using Hugging face library. https://github.com/huggingface/transformers
import transformers
import torch
import pandas as pd
from tqdm.notebook import tqdm
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils import data
import numpy as np
from transformers import AdamW # optimizer
from transformers import get_linear_schedule_with_warmup # for learning rate
from transformers import BertModel

### loading pre-trained bert wor embedding for text processing

In [2]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
dataset = pd.read_csv("pre-processed data/new_label_dataset.csv")[['content', 'priority']]
dataset = dataset[dataset['priority'] != 'Unknown'].reset_index(drop=True)

for i in tqdm(range(len(dataset))):
    if dataset.loc[i, 'priority'] == 'Critical': dataset.loc[i, 'priority'] = 0
    elif dataset.loc[i, 'priority'] == 'High': dataset.loc[i, 'priority'] = 1
    elif dataset.loc[i, 'priority'] == 'Medium': dataset.loc[i, 'priority'] = 2
    elif dataset.loc[i, 'priority'] == 'Low': dataset.loc[i, 'priority'] = 3
    else:
        pass
            
dataset[:3]

HBox(children=(FloatProgress(value=0.0, max=37292.0), HTML(value='')))




Unnamed: 0,content,priority
0,philippine flood worsen death toll hit wake ge...,3
1,philippine flood fatality hit,3
2,luzon dam release water flood warn up manila p...,3


In [4]:
# dataset = dataset[:4000]
len(dataset)

37292

### tokenizer.encode_plus()

In [5]:
content = dataset.loc[0, 'content']
encoding = tokenizer.encode_plus(
            content,
            max_length=20,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True,
            return_tensors='pt'
        )

In [6]:
print("index list: ", encoding['input_ids'])
print("attention mask: ", encoding['attention_mask']) # zero is padding.

index list:  tensor([[ 101, 7802, 7186, 4788, 2078, 2331, 9565, 2718, 5256, 4962, 2099,  102,
            0,    0,    0,    0,    0,    0,    0,    0]])
attention mask:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])


### __getitem__:

create iterable object.

In [7]:
class GPRreviewDataset(data.Dataset):
    def __init__(self, content, target, tokenizer, max_len):
        self.content = content
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.content)
    
    def __getitem__(self, item):
        content = str(self.content[item])
        encoding = tokenizer.encode_plus(
            content,
            max_length=self.max_len,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True,
            return_tensors='pt'
        )
        return {
            'text': content,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(self.target[item], dtype=torch.long)
        }

In [8]:
MAX_LEN = 20
BATCH_SIZE = 64
EPOCHS = 20

df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=2020)

### Create data loader

In [9]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPRreviewDataset(
        content = df.content.to_numpy(),
        target = df.priority.to_numpy(),
        tokenizer = tokenizer,
        max_len = max_len,
    )
    return data.DataLoader(
        ds, 
        batch_size = batch_size,
    )

In [10]:
train_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [11]:
# for df in train_loader:
#     print(df['input_ids'].shape) # batch size, max length
#     print(df['targets'].shape)

In [12]:
class BertClassifier(nn.Module):
    def __init__(self, n_class):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_class)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask
        )
        output = self.drop(pooled_output)
        output = self.out(output)
        return self.softmax(output)
    

In [13]:
model = BertClassifier(4)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [15]:
total_steps = len(train_loader) * EPOCHS

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

### train and evaluation:

In [16]:
def train(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d['input_ids']
        attention_mask = d['attention_mask']
        targets = d['targets']
        
        output = model(
            input_ids = input_ids,
            attention_mask = attention_mask
        )
        
        _, preds = torch.max(output, dim=1)
        loss = loss_fn(output, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double()/n_examples, np.mean(losses)

In [17]:
def val(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids']
            attention_mask = d['attention_mask']
            targets = d['targets']

            output = model(input_ids = input_ids, attention_mask = attention_mask)

            _, preds = torch.max(output, dim=1)
            loss = loss_fn(output, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            
    return correct_predictions.double()/n_examples, np.mean(losses)

In [18]:
for epoch in tqdm(range(EPOCHS)):
    train_acc, train_loss = train(
        model,
        train_loader,
        loss_fn,
        optimizer,
        None,
        scheduler,
        len(df_train)
    )
    
    val_acc, val_loss = val(
        model,
        test_loader,
        loss_fn,
        None,
        len(df_test)
    )
    print("train loss: ", train_loss, "val loss:", val_loss)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

train loss:  1.022018564079252 val loss: 1.01117525752793



KeyboardInterrupt: 

tutorial: 

https://www.curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

https://www.youtube.com/watch?v=8N-nM3QW7O0

https://www.youtube.com/watch?v=Osj0Z6rwJB4



https://github.com/kaushaltrivedi/fast-bert