In [10]:
# !pip install datasets
# !pip install transformers
# !pip install --quiet sentencepiece
# !pip install git+https://github.com/huggingface/transformers.git
# !pip install PyPDF2

In [11]:
import pandas as pd
import torch
from transformers import BigBirdTokenizer,BigBirdModel,BigBirdForSequenceClassification
import PyPDF2

In [12]:
train_data = pd.read_parquet('/home/ubuntu/working_directory/Bert_experimentation/80_train.parquet')
val_df = pd.read_parquet('/home/ubuntu/working_directory/Bert_experimentation/80_val.parquet')


In [13]:
def extract_text_from_pdf(file_path):
    text = ''
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

In [14]:
# Function to split a document into smaller chunks
def split_document(text, chunk_size):
    # Split the text into chunks of size 'chunk_size'
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

In [15]:
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')

In [16]:
# Tokenize and encode the text of each document
encoded_data = []
text = extract_text_from_pdf("./15031-4983-FullBook.pdf")
encoded_text = tokenizer(text, padding='max_length', truncation=True, max_length=512)
encoded_data.append({
    'input_ids': encoded_text['input_ids'],
    'attention_mask': encoded_text['attention_mask'],
    'label': "1"  # Replace 'i' with the corresponding index or label for the document
})

In [17]:
len(text)

444130

In [18]:
tokens = tokenizer(text, add_special_tokens=True, truncation=False, return_tensors="pt")
tokens

Token indices sequence length is longer than the specified maximum sequence length for this model (90516 > 4096). Running this sequence through the model will result in indexing errors


{'input_ids': tensor([[   65,  6289, 41085,  ...,   886,   115,    66]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [19]:
from torch import Tensor
def split_overlapping(tensor: Tensor, chunk_size: int, stride: int, minimal_chunk_length: int) -> list[Tensor]:
    """Helper function for dividing 1-dimensional tensors into overlapping chunks."""
    result = [tensor[i : i + chunk_size] for i in range(0, len(tensor), stride)]
    if len(result) > 1:
        # ignore chunks with less than minimal_length number of tokens
        result = [x for x in result if len(x) >= minimal_chunk_length]
    return result
example_tensor = tokens["input_ids"][0]
example_tensor

tensor([   65,  6289, 41085,  ...,   886,   115,    66])

In [20]:
def split_tokens_into_smaller_chunks(input_id: Tensor,att_mask: Tensor, chunk_size: int, stride: int, minimal_chunk_length: int):
    input_id_chunks = [input_id[i : i + chunk_size] for i in range(0, len(input_id), stride)]
    mask_chunks = [att_mask[i : i + chunk_size] for i in range(0, len(att_mask), stride)]
    if len(input_id_chunks) > 1:
        # ignore chunks with less than minimal_length number of tokens
        input_id_chunks = [x for x in input_id_chunks if len(x) >= minimal_chunk_length]
        mask_chunks = [x for x in mask_chunks if len(x) >= minimal_chunk_length]
    return input_id_chunks, mask_chunks


def add_special_tokens_at_beginning_and_end(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> None:
    """
    Adds special CLS token (token id = 101) at the beginning.
    Adds SEP token (token id = 102) at the end of each chunk.
    Adds corresponding attention masks equal to 1 (attention mask is boolean).
    """
    for i in range(len(input_id_chunks)):
        # adding CLS (token id 101) and SEP (token id 102) tokens
        input_id_chunks[i] = torch.cat([Tensor([101]), input_id_chunks[i], Tensor([102])])
        # adding attention masks  corresponding to special tokens
        mask_chunks[i] = torch.cat([Tensor([1]), mask_chunks[i], Tensor([1])])

def add_padding_tokens(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> None:
    """Adds padding tokens (token id = 0) at the end to make sure that all chunks have exactly 512 tokens."""
    for i in range(len(input_id_chunks)):
        # get required padding length
        pad_len = 512 - input_id_chunks[i].shape[0]
        # check if tensor length satisfies required chunk size
        if pad_len > 0:
            # if padding length is more than 0, we must add padding
            input_id_chunks[i] = torch.cat([input_id_chunks[i], Tensor([0] * pad_len)])
            mask_chunks[i] = torch.cat([mask_chunks[i], Tensor([0] * pad_len)])
def stack_tokens_from_all_chunks(input_id_chunks: list[Tensor], mask_chunks: list[Tensor]) -> tuple[Tensor, Tensor]:
    input_ids = torch.stack(input_id_chunks)
    attention_mask = torch.stack(mask_chunks)
    return input_ids.long(), attention_mask.int()

In [21]:
def transform_single_text(
    text: str,
    tokenizer,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
    maximal_text_length,
) -> tuple[Tensor, Tensor]:
    """Transforms (the entire) text to model input of BERT model."""
    tokens = tokenizer(text, add_special_tokens=True, truncation=False, return_tensors="pt")
    # splitted = split_overlapping(tokens["input_ids"][0], chunk_size=5, stride=5, minimal_chunk_length=5)
    input_id_chunks, mask_chunks = split_tokens_into_smaller_chunks(tokens["input_ids"][0],tokens["attention_mask"][0], chunk_size, stride, minimal_chunk_length)
    add_special_tokens_at_beginning_and_end(input_id_chunks, mask_chunks)
    add_padding_tokens(input_id_chunks, mask_chunks)
    input_ids, attention_mask = stack_tokens_from_all_chunks(input_id_chunks, mask_chunks)
    return input_ids, attention_mask

In [22]:
input_ids, attention_mask = transform_single_text(text, tokenizer, 510, 510, 1, None)

In [24]:
input_ids.shape

torch.Size([178, 512])

In [25]:
#Classify each chunk
# tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base')
predicted_classes = []
for chunk, attention_mask in zip(input_ids, attention_mask):
    outputs = model(chunk, attention_mask=attention_mask)
    logits = outputs.logits
    print("Logits",logits)
    probabilities = logits.softmax(dim=1)
    predicted_class = probabilities.argmax(dim=1).item()
    predicted_classes.append(predicted_class)

print("Predicted Classes:", predicted_classes)


Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

ValueError: not enough values to unpack (expected 2, got 1)

In [26]:
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base")

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
model_output = model(input_ids,attention_mask)

Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


ValueError: Wrong shape for input_ids (shape torch.Size([178, 512])) or attention_mask (shape torch.Size([512]))

In [21]:
probs = torch.nn.functional.softmax(model_output[0], dim=-1)
probs

tensor([[[4.1921e-05, 3.7781e-05, 3.1347e-05,  ..., 3.5050e-05,
          3.2223e-05, 3.5450e-05],
         [1.0023e-03, 8.5958e-04, 8.7710e-04,  ..., 8.4208e-04,
          8.0777e-04, 9.0074e-04],
         [9.2169e-04, 9.2377e-04, 8.2463e-04,  ..., 7.4543e-04,
          8.2347e-04, 8.6752e-04],
         ...,
         [1.3720e-03, 1.0795e-03, 1.2611e-03,  ..., 9.4047e-04,
          1.2315e-03, 1.0444e-03],
         [1.5098e-03, 1.0372e-03, 1.2854e-03,  ..., 1.7171e-03,
          1.0804e-03, 9.4912e-04],
         [7.3206e-04, 4.5095e-04, 5.6538e-04,  ..., 4.7553e-04,
          5.8402e-04, 4.2062e-04]],

        [[2.4167e-06, 3.0579e-06, 2.6821e-06,  ..., 2.7302e-06,
          2.1023e-06, 2.1845e-06],
         [1.4922e-03, 1.5710e-03, 1.3152e-03,  ..., 1.0660e-03,
          1.1586e-03, 1.0088e-03],
         [1.0359e-03, 1.2411e-03, 1.1505e-03,  ..., 1.1374e-03,
          1.1750e-03, 8.9946e-04],
         ...,
         [8.0762e-04, 6.6058e-04, 7.5363e-04,  ..., 1.1364e-03,
          9.274

In [22]:
probabilities = probs[:,1]
probabilities

tensor([[0.0010, 0.0009, 0.0009,  ..., 0.0008, 0.0008, 0.0009],
        [0.0015, 0.0016, 0.0013,  ..., 0.0011, 0.0012, 0.0010],
        [0.0011, 0.0012, 0.0010,  ..., 0.0018, 0.0017, 0.0017],
        ...,
        [0.0008, 0.0011, 0.0006,  ..., 0.0012, 0.0006, 0.0008],
        [0.0006, 0.0004, 0.0006,  ..., 0.0007, 0.0006, 0.0003],
        [0.0002, 0.0002, 0.0001,  ..., 0.0002, 0.0001, 0.0001]],
       grad_fn=<SelectBackward0>)

In [23]:
probabilities.mean()

tensor(0.0013, grad_fn=<MeanBackward0>)

In [24]:
probabilities.max()

tensor(0.9998, grad_fn=<MaxBackward1>)

In [None]:

# Load the pre-trained BigBird tokenizer and model
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base')

# Load and preprocess the PDF document
def preprocess_pdf_document(document_path):
    text = ""
    with open(document_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text
document_path = "./15031-4983-FullBook.pdf"
text = preprocess_pdf_document(document_path)

# Tokenize and encode the document
encoding = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt'
)

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

# Split the document into chunks of 512 tokens
chunk_size = 512
chunks = [input_ids[:, i:i+chunk_size] for i in range(0, input_ids.size(1), chunk_size)]
chunk_attention_masks = [attention_mask[:, i:i+chunk_size] for i in range(0, attention_mask.size(1), chunk_size)]

# Classify each chunk
predicted_classes = []
for chunk, attention_mask in zip(chunks, chunk_attention_masks):
    outputs = model(chunk, attention_mask=attention_mask)
    logits = outputs.logits
    print("Logits",logits)
    probabilities = logits.softmax(dim=1)
    predicted_class = probabilities.argmax(dim=1).item()
    predicted_classes.append(predicted_class)

print("Predicted Classes:", predicted_classes)
