In [1]:
# import fitz
from PyPDF2 import PdfReader
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader,Dataset
from collections import Counter


In [2]:
# Load the training and validation frames from their parquet files.
train_data, val_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/home/ubuntu/working_directory/Bert_experimentation/80_train.parquet',
        '/home/ubuntu/working_directory/Bert_experimentation/80_val.parquet',
    )
)


In [3]:
# Load the pretrained BigBird tokenizer and a 3-class sequence-classification model
# from the same checkpoint. The load warning printed below reports that the
# checkpoint's `cls.*` pretraining-head weights are not used by this model class.
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base', 
                                                         num_labels=3)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

In [4]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes text rows and serves one-hot float labels.

    Args:
        data: DataFrame with a ``texts`` column and a ``labels`` column whose
            values are keys of ``label2id``.
        label2id: Mapping from label name to integer class id.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        sample_size: Maximum number of rows drawn at random from ``data``.
            Capped at ``len(data)`` so small frames no longer raise; ``None``
            keeps every row (still shuffled by ``sample``).
        max_length: Length every example is truncated / padded to.
        random_state: Forwarded to ``DataFrame.sample`` for a reproducible draw.

    Raises:
        KeyError: If a value in ``data["labels"]`` is not a key of ``label2id``.
    """

    def __init__(self, data, label2id, tokenizer, sample_size=4000, max_length=512, random_state=None):
        n_rows = len(data) if sample_size is None else min(sample_size, len(data))
        self.data = data.sample(n_rows, random_state=random_state)
        self.label2id = label2id
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = [label2id[name] for name in self.data["labels"]]

        # Fix the one-hot width from the label map rather than from the classes
        # that happen to be present in the sample, so a missing class cannot
        # shrink the label vector below the model's number of outputs.
        num_classes = max(label2id.values()) + 1
        self.one_hot_labels = torch.nn.functional.one_hot(
            torch.tensor(self.labels, dtype=torch.long),
            num_classes=num_classes,
        ).to(torch.float)

        print("Data:", self.data.labels.value_counts())
        print("counter:", Counter(self.labels))

    def __len__(self):
        """Return the number of sampled rows."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized text and one-hot label for the row at position ``index``."""
        text = self.data['texts'].iloc[index]
        encoded_text = self.tokenizer.encode_plus(
            str(text),
            max_length=self.max_length,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension of size 1; drop only
        # that dimension so a length-1 sequence is not collapsed to a scalar.
        return {'input_ids': encoded_text['input_ids'].squeeze(0),
                'attention_mask': encoded_text['attention_mask'].squeeze(0),
                'label': self.one_hot_labels[index]}
                 
                 


In [5]:
# Map each label name to its integer class id, in ascending order.
label2id = dict(zip(("Level 1", "Level 2", "Level 3"), range(3)))


In [6]:
# Build the training dataset first, then the evaluation dataset, from the two
# loaded frames; both share the same label map and tokenizer.
train_dataset, test_dataset = (
    CustomDataset(frame, label2id, tokenizer)
    for frame in (train_data, val_df)
)

Data: Level 2    1683
Level 1    1248
Level 3    1069
Name: labels, dtype: int64
counter: Counter({1: 1683, 0: 1248, 2: 1069})
Data: Level 2    1389
Level 3    1347
Level 1    1264
Name: labels, dtype: int64
counter: Counter({1: 1389, 2: 1347, 0: 1264})


In [7]:
# Training batches are reshuffled every epoch so the model does not see the
# examples in the same order each time.
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    pin_memory=True,
    shuffle=True,
)
# Evaluation keeps the dataset order so predictions can be aligned with the
# stored labels afterwards.
# NOTE(review): the BATCH_SIZE constant defined in a later cell is 6 and is not
# used here; confirm which batch size is intended.
test_loader = DataLoader(
    test_dataset,
    batch_size=4,
    pin_memory=True,
    shuffle=False,
)

In [8]:

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Training hyper-parameters.
EPOCHS = 5
LEARNING_RATE = 2.5e-6
BATCH_SIZE = 6  # NOTE(review): the DataLoaders are created with batch_size=4 — confirm which is intended

# Plain SGD with weight decay is the active optimizer. The AdamW variant that
# was tried before is kept here for reference:
# optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.04)
optimizer = torch.optim.SGD(
    model.parameters(),
    weight_decay=0.04,
    lr=LEARNING_RATE,
)

# Linear warm-up over the first 50 optimizer steps, then linear decay across
# every training step of every epoch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=EPOCHS * len(train_loader),
    num_warmup_steps=50,
)

In [9]:
# Prefer GPU 3 as before, but only when that many GPUs actually exist; fall
# back to the first GPU, and to the CPU when CUDA is unavailable.
PREFERRED_GPU_INDEX = 3
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > PREFERRED_GPU_INDEX:
    device = torch.device(f"cuda:{PREFERRED_GPU_INDEX}")
else:
    device = torch.device("cuda:0")


In [10]:
from torch.nn.utils import clip_grad_norm_
# from tqdm import tqdm
from tqdm.notebook import tqdm

import numpy as np
import math

# Mean loss per epoch; both lists are read by the plotting cell further down.
train_loss_per_epoch = []
val_loss_per_epoch = []

model = model.to(device)
for epoch_num in range(EPOCHS):
    print('Epoch: ', epoch_num + 1)

    # ---- Training ----
    model.train()
    train_loss = 0.0
    train_steps = 0
    for batch_data in tqdm(train_loader, desc='Training'):
        input_ids = batch_data["input_ids"].to(device)
        att_mask = batch_data["attention_mask"].to(device)
        # Labels are the float one-hot vectors produced by CustomDataset.
        # NOTE(review): with float labels the Hugging Face classification head is
        # understood to pick a multi-label (BCE-with-logits) loss — confirm that
        # this is intended for a single-label, 3-class task.
        labels = batch_data["label"].to(device)

        optimizer.zero_grad()
        output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
        loss = output.loss
        loss.backward()

        # Clip before stepping so one bad batch cannot blow up the weights.
        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        train_steps += 1

    # max(..., 1) keeps the mean defined even if the loader yielded no batches.
    mean_train_loss = train_loss / max(train_steps, 1)
    train_loss_per_epoch.append(mean_train_loss)

    # ---- Validation ----
    model.eval()
    valid_loss = 0.0
    val_steps = 0
    valid_pred = []
    with torch.no_grad():
        for batch_data in tqdm(test_loader, desc='Validation'):
            input_ids = batch_data["input_ids"].to(device)
            att_mask = batch_data["attention_mask"].to(device)
            labels = batch_data["label"].to(device)

            output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)

            valid_loss += output.loss.item()
            val_steps += 1
            # Predicted class = index of the largest logit for each example.
            valid_pred.append(output.logits.argmax(dim=-1).cpu().numpy())

    mean_val_loss = valid_loss / max(val_steps, 1)
    val_loss_per_epoch.append(mean_val_loss)
    # Predictions of the most recent epoch; used by the evaluation cells below.
    valid_pred = np.concatenate(valid_pred) if valid_pred else np.array([], dtype=np.int64)

    # Report processed batches against each loader's own batch count.
    print("{0}/{1} train loss: {2} ".format(train_steps, len(train_loader), mean_train_loss))
    print("{0}/{1} val loss: {2} ".format(val_steps, len(test_loader), mean_val_loss))


Epoch:  1


Training:   0%|          | 0/1000 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 3; 14.75 GiB total capacity; 1.45 GiB already allocated; 4.81 MiB free; 1.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import matplotlib.pyplot as plt

# Plot only the epochs that actually finished, so an interrupted run still
# produces a figure instead of a length-mismatch error.
completed_epochs = min(len(train_loss_per_epoch), len(val_loss_per_epoch))
epochs = range(1, completed_epochs + 1)

plt.figure(figsize=(8, 8))
plt.plot(epochs, train_loss_per_epoch[:completed_epochs], label='training loss')
plt.plot(epochs, val_loss_per_epoch[:completed_epochs], label='validation loss')
plt.title("Training and Validation Loss")
plt.xlabel('Epochs')
plt.ylabel('Loss')
# Without a legend the two curve labels are never drawn.
plt.legend()

# Save the plot as a PNG file
plt.savefig('/home/ubuntu/working_directory/Bert_experimentation/loss_plot111.png')


In [None]:
# Read the true class ids straight from the dataset's stored one-hot labels.
# test_loader iterates the dataset in order (no shuffling), so this matches the
# order of valid_pred without tokenizing every document a second time.
valid_true = test_dataset.one_hot_labels.argmax(dim=1).cpu().numpy()

In [None]:
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall. `labels` pins the class ids so the three
# target names always line up, even if a class is absent from the predictions.
print(classification_report(valid_true, valid_pred,
                            labels=[0, 1, 2],
                            target_names=["level 1", "level 2", "level 3"]))

In [None]:
import pandas as pd
import torch
from transformers import BigBirdTokenizer,BigBirdModel,BigBirdForSequenceClassification
import PyPDF2

In [None]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated in page order."""
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open; the reader pulls pages from it.
        page_texts = [page.extract_text() for page in reader.pages]
    return ''.join(page_texts)

In [None]:
# Function to split a document into smaller chunks
def split_document(text, chunk_size):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The final piece holds whatever remains and may be shorter.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

In [None]:
# Load the BigBird tokenizer for the document-chunking cells below.
# NOTE(review): this rebinds `tokenizer`, which the first part of the notebook
# already loads from the same checkpoint.
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')

In [None]:
# Tokenize and encode the text of each document
encoded_data = []
text = extract_text_from_pdf("./15031-4983-FullBook.pdf")
encoded_text = tokenizer(text, padding='max_length', truncation=True, max_length=512)
encoded_data.append({
    'input_ids': encoded_text['input_ids'],
    'attention_mask': encoded_text['attention_mask'],
    'label': "1"  # Replace 'i' with the corresponding index or label for the document
})

In [None]:
# Tokenize the whole document without truncation, returning PyTorch tensors;
# presumably so the full token sequence can be split into chunks by the helpers below.
tokens = tokenizer(text, add_special_tokens=True, truncation=False, return_tensors="pt")
tokens

In [None]:
from torch import Tensor
def split_overlapping(tensor: Tensor, chunk_size: int, stride: int, minimal_chunk_length: int) -> list[Tensor]:
    """Slice a 1-dimensional tensor into windows of ``chunk_size`` starting every ``stride`` items.

    When more than one window is produced, windows shorter than
    ``minimal_chunk_length`` are dropped; a lone window is always kept.
    """
    windows = []
    for start in range(0, len(tensor), stride):
        windows.append(tensor[start : start + chunk_size])
    if len(windows) <= 1:
        return windows
    return [window for window in windows if len(window) >= minimal_chunk_length]
# Take the first row of the tokenizer output's input_ids and display it.
example_tensor = tokens["input_ids"][0]
example_tensor

In [None]:
def split_tokens_into_smaller_chunks(input_id: Tensor,att_mask: Tensor, chunk_size: int, stride: int, minimal_chunk_length: int):
    """Window the token ids and the attention mask with the same size and stride.

    Returns ``(input_id_chunks, mask_chunks)``. When the ids yield more than one
    window, windows shorter than ``minimal_chunk_length`` are removed from both
    lists; a single window is returned untouched.
    """

    def _windows(sequence):
        # Overlapping slices of at most chunk_size items, one per stride step.
        return [sequence[start : start + chunk_size] for start in range(0, len(sequence), stride)]

    def _long_enough(chunks):
        return [chunk for chunk in chunks if len(chunk) >= minimal_chunk_length]

    input_id_chunks = _windows(input_id)
    mask_chunks = _windows(att_mask)
    if len(input_id_chunks) <= 1:
        return input_id_chunks, mask_chunks
    return _long_enough(input_id_chunks), _long_enough(mask_chunks)


def add_special_tokens_at_beginning_and_end(
    input_id_chunks: list[Tensor],
    mask_chunks: list[Tensor],
    cls_token_id: int = 101,
    sep_token_id: int = 102,
) -> None:
    """
    Wrap every chunk in a CLS token at the start and a SEP token at the end.

    Both lists are modified in place. Each attention mask gets a 1 for each of
    the two added tokens. The added values reuse the dtype and device of the
    chunk they are attached to, so integer token ids stay integers.

    Args:
        input_id_chunks: 1-dimensional token-id tensors, one per chunk.
        mask_chunks: Attention-mask tensors matching ``input_id_chunks``.
        cls_token_id: Id placed at the start of each chunk. The default 101 is
            the BERT-style value this function originally hard-coded; other
            tokenizers may use a different id, so prefer ``tokenizer.cls_token_id``.
        sep_token_id: Id placed at the end of each chunk (default 102); prefer
            ``tokenizer.sep_token_id``.
    """
    for i in range(len(input_id_chunks)):
        ids = input_id_chunks[i]
        mask = mask_chunks[i]
        # new_tensor copies dtype/device from the chunk, avoiding float promotion.
        input_id_chunks[i] = torch.cat([ids.new_tensor([cls_token_id]), ids, ids.new_tensor([sep_token_id])])
        # The two special tokens must be attended to, hence mask value 1.
        mask_chunks[i] = torch.cat([mask.new_tensor([1]), mask, mask.new_tensor([1])])

def add_padding_tokens(
    input_id_chunks: list[Tensor],
    mask_chunks: list[Tensor],
    max_length: int = 512,
    pad_token_id: int = 0,
) -> None:
    """Right-pad every chunk, in place, so that it has ``max_length`` tokens.

    Token ids are padded with ``pad_token_id`` and attention masks with 0. The
    padding reuses each chunk's dtype and device, so integer ids stay integers.
    Chunks that already have ``max_length`` or more tokens are left unchanged.

    Args:
        input_id_chunks: 1-dimensional token-id tensors, one per chunk.
        mask_chunks: Attention-mask tensors matching ``input_id_chunks``.
        max_length: Target length of every chunk (default 512).
        pad_token_id: Id used for padding positions (default 0).
    """
    for i in range(len(input_id_chunks)):
        ids = input_id_chunks[i]
        mask = mask_chunks[i]
        pad_len = max_length - ids.shape[0]
        if pad_len > 0:
            input_id_chunks[i] = torch.cat([ids, ids.new_full((pad_len,), pad_token_id)])
            # Padding positions must not be attended to.
            mask_chunks[i] = torch.cat([mask, mask.new_zeros(pad_len)])
def stack_tokens_from_all_chunks(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
    """Stack the per-chunk tensors into two 2-D batches.

    Returns the token ids cast to ``long`` and the attention masks cast to ``int``.
    All chunks in a list must have the same length for stacking to succeed.
    """
    stacked_ids = torch.stack(input_id_chunks).long()
    stacked_masks = torch.stack(mask_chunks).int()
    return stacked_ids, stacked_masks