# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertForSequenceClassification
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

import pytorch_lightning as pl

import warnings
warnings.simplefilter('ignore')



# Config

In [2]:
tokenizer_path = 'cointegrated/rubert-tiny'
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
model_path = 'cointegrated/rubert-tiny'
model = BertForSequenceClassification.from_pretrained(model_path)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

In [3]:
out_features = model.bert.encoder.layer[1].output.dense.out_features
out_features

312

In [55]:
class Config:
    train_path = 'train.csv'
    test_path = 'test.csv'
    lr = 1e-5
    max_len = 512
    train_bs = 64
    valid_bs = 32
    train_pcent = 0.99
    num_workers = 8
    bert_model = model#'bert-base-uncased'
    tokenizer = tokenizer#transformers.BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

In [56]:
class AgroData(Dataset):
    def __init__(self, review, target):
        self.review = review
        self.encoder = {i: k for k, i in enumerate(np.unique(target))}
        self.target = [self.encoder[j] for j in target]
        self.tokenizer = Config.tokenizer
        self.max_len = Config.max_len

    def __len__(self):
        return len(self.review)
    
    def __getitem__(self, idx):
        text = str(self.review[idx])
        target = self.target[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [57]:
train_ds = pd.read_csv('train.csv')
val_ds = pd.read_csv('val.csv')
test_ds = pd.read_csv('test.csv')

In [58]:
ds = AgroData(train_ds['merged'], train_ds['reference_id'])
train_loader = DataLoader(ds, batch_size=2, shuffle=True)

In [59]:
ds_train = AgroData(train_ds['merged'], train_ds['reference_id'])
ds_val = AgroData(val_ds['merged'], val_ds['reference_id'])
ds_test = AgroData(test_ds['merged'], test_ds['reference_id'])

In [60]:
dl_train = DataLoader(ds_train, batch_size=2, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=2, shuffle=False)
dl_test = DataLoader(ds_test, batch_size=2, shuffle=False)

In [74]:
class BERTModel(pl.LightningModule):
    def __init__(self) -> None:
        super(BERTModel, self).__init__()
        self.bert = model
        self.bert.classifier = nn.Linear(312, 471)
        self.all_targets = []
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, ids, mask) -> torch.Tensor:
        output = self.bert(ids, attention_mask=mask)
        return output
    
    def training_step(self, batch, batch_idx):
        ids = batch['input_ids']
        mask = batch['attention_mask']
        targets = batch['targets']

        outputs = self(ids=ids, mask=mask)

        train_loss = self.loss_fn(outputs.logits, targets)
        return {'loss': train_loss}
    
    def validation_step(self, batch, batch_idx):
        ids = batch['input_ids']
        mask = batch['attention_mask']
        targets = batch['targets']

        outputs = self(ids=ids, mask=mask)
        
        valid_loss = self.loss_fn(outputs.logits, targets)
        return {'val_loss': valid_loss}
    
    def configure_optimizers(self):
        optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
        return optimizer
    
    def predict(self, text):
        encoding = Config.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        out = {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        
        input_ids = out["input_ids"].to(self.device)
        attention_mask = out["attention_mask"].to(self.device)
        
        outputs = self.model(
            input_ids=input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0)
        )
        
        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction

In [71]:
modelN = BERTModel()
trainer = pl.Trainer(max_epochs=1)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores


In [72]:
trainer.fit(modelN, dl_train, dl_val)


  | Name    | Type                          | Params
----------------------------------------------------------
0 | bert    | BertForSequenceClassification | 11.9 M
1 | loss_fn | CrossEntropyLoss              | 0     
----------------------------------------------------------
11.9 M    Trainable params
0         Non-trainable params
11.9 M    Total params
47.726    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 1114/1114 [17:29<00:00,  1.06it/s, loss=6, v_num=19]  


1

In [80]:
def predict(model, text):
        encoding = Config.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=Config.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        out = {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        
        input_ids = out["input_ids"].to(model.device)
        attention_mask = out["attention_mask"].to(model.device)
        
        outputs = model.bert(
            input_ids=input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0)
        )
        
        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction

In [None]:
from tqdm import tqdm

In [83]:
correct = 0
all = len(dl_test)

for el in tqdm(range(test_ds.shape[0])):
    correct = ds_test.encoder[test_ds['reference_id'][el]]
    prediction = predict(modelN, test_ds['merged'][el])

    correct += (prediction==correct)

print(f"Accuracy: {correct/all * 100}%")


  0%|          | 0/554 [00:00<?, ?it/s]

Accuracy: 67.14801444043322%


In [84]:
torch.save(modelN.state_dict(), "bert1_0.pth")