In [4]:
! pip install pypdf
! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


In [5]:
from pypdf import PdfReader
import nltk
import torch
from transformers import BertTokenizer, BertModel
import re
import uuid
import torch.nn as nn
from sentence_transformers import SentenceTransformer


In [6]:
# Download NLTK's 'punkt' tokenizer data. split_into_sentences() further down calls
# nltk.tokenize.sent_tokenize, which presumably relies on this package
# (NOTE(review): confirm the resource name for the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:

# Path of the PDF to process, located on the mounted Drive.
# Update this with your actual file path before running the cells below.
file_path = '/content/drive/MyDrive/FYP/data/uml-annual-report-2020-2021.pdf'


## Sentence Class

In [30]:
# sentence class
class Sentence:
    """A sentence together with its identifier and its embedding."""

    def __init__(self, id, text, embeddings):
        """
        Store the three values unchanged on the instance.

        id: identifier of the sentence
        text: the sentence text
        embeddings: embedding associated with the sentence
        """
        self.id, self.text, self.embeddings = id, text, embeddings

    def get_id(self):
        """Return the stored identifier."""
        return self.id

    def get_text(self):
        """Return the stored sentence text."""
        return self.text

    def get_embedding(self):
        """Return the stored embeddings."""
        return self.embeddings

## Preprocessing

In [11]:

# Function to remove email addresses from text
def remove_emails(text):
    """Return `text` with every substring that looks like an email address deleted."""
    email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    return email_regex.sub('', text)


# Function to remove telephone numbers from text
def remove_phone_numbers(text):
    """
    Remove 10-digit phone numbers written in 3-3-4 grouping from `text`.

    The groups may be separated by a hyphen, a dot, whitespace, or nothing, and
    the first group may be wrapped in parentheses, e.g. "555-123-4567",
    "555.123.4567", "5551234567" or "(555) 123-4567".

    text: string to clean
    Returns: `text` with every matching number replaced by an empty string.
    """
    # The optional r'\)?' closes the optional opening parenthesis; without it a
    # parenthesised area code such as "(555) 123-4567" could never match.
    phone_pattern = r'\(?\b[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b'
    return re.sub(phone_pattern, '', text)


# Function to split text into sentences while avoiding certain splits
def split_into_sentences(text):
    """
    Flatten `text` onto a single line and split it into sentences.

    text: raw text, possibly containing newlines and repeated whitespace
    Returns: the result of NLTK's sentence tokenizer on the flattened text.
    """
    # Newlines become spaces, every whitespace run shrinks to one space, and the
    # ends are trimmed before the text is handed to the tokenizer.
    single_line = re.sub(r'\s+', ' ', text.replace("\n", " ")).strip()
    return nltk.tokenize.sent_tokenize(single_line)

def get_all_sentences_array(file_path):
    """
    Read a PDF and collect its sentences together with their page numbers.

    file_path: path of the PDF file to read
    Returns: list of (sentence, page_number) tuples in page order, where
             page_number starts at 1. Also prints the total page count.
    """
    pdf = PdfReader(file_path)

    number_of_pages = len(pdf.pages)
    print(f"Total Number of Pages: {number_of_pages}")

    collected = []
    for page_index in range(number_of_pages):
        raw_text = pdf.pages[page_index].extract_text()

        # Drop emails first, then phone numbers, before sentence splitting.
        cleaned_text = remove_phone_numbers(remove_emails(raw_text))

        # Page numbers reported to the caller are 1-based.
        page_label = page_index + 1
        for sentence in split_into_sentences(cleaned_text):
            collected.append((sentence, page_label))

    return collected



## Embeddings

In [12]:
class DocumentLevelPositionalEncoding(nn.Module):
    """Fixed sinusoidal encoding indexed by a sentence's position in the document."""

    def __init__(self, d_model, max_sentences=5000):
        """
        Document-Level Positional Encoding.
        d_model: dimension of the BERT embeddings (usually 768 for base BERT)
        max_sentences: maximum number of sentences in a document
        """
        super(DocumentLevelPositionalEncoding, self).__init__()
        self.d_model = d_model
        # Registered as a non-persistent buffer so the table follows the module
        # through .to(device) / .cuda() while staying out of state_dict().
        # It is still reachable as `self.pe`.
        self.register_buffer(
            "pe",
            self._create_positional_encoding(max_sentences, d_model),
            persistent=False,
        )

    def _create_positional_encoding(self, max_sentences, d_model):
        """
        Helper function to create the positional encoding matrix.

        Returns a tensor of shape (1, max_sentences, d_model): even columns hold
        sines and odd columns hold cosines of position * frequency.
        """
        position = torch.arange(0, max_sentences, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe = torch.zeros(max_sentences, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns, so
        # trim the frequencies to fit (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        return pe.unsqueeze(0)  # Add batch dimension

    def forward(self, sentence_position):
        """
        sentence_position: Index of the sentence in the document

        Returns the encoding row(s) for that index, keeping the leading batch
        dimension; an integer index gives shape (1, d_model). The index must be
        smaller than max_sentences.
        """
        return self.pe[:, sentence_position, :]

In [13]:
def load_bert_model_and_tokenizer():
    """
    Load the pre-trained 'bert-base-uncased' tokenizer and model.

    Returns: (tokenizer, bert_model), in that order.
    """
    checkpoint = 'bert-base-uncased'
    # Tuple elements are evaluated left to right: tokenizer first, then the model.
    return BertTokenizer.from_pretrained(checkpoint), BertModel.from_pretrained(checkpoint)

def tokenize_sentences(sentences, tokenizer):
    """
    Run `tokenizer` on each sentence separately.

    sentences: iterable of sentence strings
    tokenizer: callable invoked once per sentence with return_tensors='pt',
               padding=True and truncation=True
    Returns: list with one tokenizer result per sentence, in input order.
    """
    encoded_inputs = []
    for sentence in sentences:
        encoded_inputs.append(
            tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
        )
    return encoded_inputs

def add_positional_encoding_to_embeddings(bert_model, doc_pos_encoder, inputs):
    """
    Processes sentences by getting BERT embeddings, adding document-level positional encoding.

    bert_model: callable taking (input_ids, attention_mask=...) whose result
                exposes `.last_hidden_state`
    doc_pos_encoder: callable mapping a sentence index to its positional encoding
    inputs: sequence of dicts with 'input_ids' and 'attention_mask', one per sentence

    Returns sentence embeddings with and without positional encoding, as two
    lists (without, with) holding one first-token ([CLS]) embedding per input.
    The embeddings are computed under torch.no_grad(), so they carry no
    autograd graph.
    """
    sentence_embeddings_without_pos_encoding = []
    sentence_embeddings_with_pos_encoding = []

    # Inference only: without no_grad every stored embedding would keep the whole
    # forward graph of its sentence alive in memory.
    with torch.no_grad():
        for idx, input_dict in enumerate(inputs):
            # Pass the sentence input through BERT to get token embeddings, including attention_mask
            token_embeddings = bert_model(
                input_dict['input_ids'], attention_mask=input_dict['attention_mask']
            ).last_hidden_state

            # Sentence embedding without positional encoding (first token, i.e. [CLS])
            cls_embedding = token_embeddings[:, 0, :]
            sentence_embeddings_without_pos_encoding.append(cls_embedding)

            # Only the first token is kept, so the encoding is added to it directly
            # rather than to every token; the resulting values are the same.
            doc_pos_encoding = doc_pos_encoder(idx).to(cls_embedding.device)
            sentence_embeddings_with_pos_encoding.append(cls_embedding + doc_pos_encoding)

    return sentence_embeddings_without_pos_encoding, sentence_embeddings_with_pos_encoding


def process_sentences_with_positional_encoding(sentences):
    """
    Main function to process an array of sentences by:
    - Loading BERT model and tokenizer
    - Tokenizing sentences
    - Applying document-level positional encoding
    - Wrapping each sentence and its positionally encoded embedding in a Sentence

    Parameters:
    sentences: List of sentences to process.

    Returns:
    List of Sentence objects, one per input sentence and in input order. Each
    has id = index of the sentence in `sentences`, text = the sentence, and
    embeddings = its [CLS] embedding with positional encoding added.
    """
    # Load BERT model and tokenizer
    tokenizer, bert_model = load_bert_model_and_tokenizer()

    # Size the encoder from the loaded model and the actual document length, so
    # neither the hidden size nor a fixed sentence cap is hard-coded here.
    doc_pos_encoder = DocumentLevelPositionalEncoding(
        d_model=bert_model.config.hidden_size,
        max_sentences=max(len(sentences), 1),
    )

    # Tokenize sentences
    inputs = tokenize_sentences(sentences, tokenizer)

    # Only the positionally encoded embeddings are used below.
    _, sentence_embeddings_with_pos_encoding = add_positional_encoding_to_embeddings(
        bert_model, doc_pos_encoder, inputs
    )

    # Sentence.__init__ names its second parameter `text`.
    return [
        Sentence(id=idx, text=sentence, embeddings=sentence_embeddings_with_pos_encoding[idx])
        for idx, sentence in enumerate(sentences)
    ]

In [34]:
def process_sentences_with_positional_encoding_updated(sentences):
    """
    Main function to process an array of sentences by:
    - Loading SentenceTransformer model
    - Encoding the sentences into sentence embeddings
    - Applying document-level positional encoding
    - Returning sentence embeddings with positional encoding

    Parameters:
    sentences: iterable of sentence strings, in document order.

    Returns:
    List of Sentence objects, one per input sentence and in input order. Each
    has id = index of the sentence, text = the sentence, and embeddings = its
    sentence embedding plus the positional encoding for that index. Empty
    input gives an empty list.
    """
    # Materialise the input once so it can be both encoded and indexed.
    sentence_texts = list(sentences)
    if not sentence_texts:
        # Nothing to encode; also avoids loading the model and reading
        # shape[1] of an embedding tensor that has no second dimension.
        return []

    # Load SentenceTransformer model
    model = SentenceTransformer('sentence-transformers/xlm-r-bert-base-nli-mean-tokens')

    # Generate sentence embeddings
    sentence_embeddings = model.encode(sentence_texts, convert_to_tensor=True)

    # Size the encoder to the embedding width and the actual number of
    # sentences, so long documents do not run past a fixed table length.
    doc_pos_encoder = DocumentLevelPositionalEncoding(
        d_model=sentence_embeddings.shape[1],
        max_sentences=len(sentence_texts),
    )

    # Apply positional encoding and create Sentence objects
    sentence_objects = []
    for idx, sentence_embedding in enumerate(sentence_embeddings):
        # The embeddings may be on another device than the encoding table
        # (e.g. a GPU), so move the encoding before adding.
        pos_encoding = doc_pos_encoder(idx).squeeze(0).to(sentence_embedding.device)
        modified_embedding = sentence_embedding + pos_encoding

        # Create Sentence object with sentence ID, text, and positional encoded embeddings
        sentence_objects.append(
            Sentence(id=idx, text=sentence_texts[idx], embeddings=modified_embedding)
        )

    return sentence_objects

In [15]:
# Load the dataset: extract every sentence from the PDF at file_path.
# Each entry of the result is a (sentence, page number) tuple.
sentences_with_page = get_all_sentences_array(file_path)

Total Number of Pages: 208


In [16]:
# Preview the first ten (sentence, page number) entries. Slicing, unlike
# indexing a fixed range, does not fail when fewer than ten entries exist.
for entry in sentences_with_page[:10]:
    print(entry)

('ANNUAL REPORT 202 0/21', 1)
('While the pandemic held the brakes for the economy, bringing about a new normal in which everyone had to adapt or fail, we were thrust into the same dilemma.', 3)
('As many external forces threatened to impede our progress, our forward thinking plans, envisioned even before this massive change, were imperative to see us through to the other side.', 3)
('Helping us excel were also our people, whose attitude and can-do spirit helped propel us forward, maintaining our progress to achieve what many could not.', 3)
('In a year that tested our resilience and our ability to turn challenge into opportunity, a pivotal part of our success was a matter of being united.', 3)
('Table of Contents OVERVIEW OF THE BUSINESSMANAGEMENT REVIEWGOVERNANCE About us 04 Group Structure 05 Milestones 06 Operational and Financial Highlights 08 Chairman’s Message 12 Group CEO’s Review of Operations 15 Board of Directors 18 Senior Management Team 24Management Discussion and Analysis

In [17]:
# Keep just the text of each (sentence, page number) pair; the page number is dropped.
sentences = [text for text, _page in sentences_with_page]


In [35]:
# Embed every sentence, add its document-level positional encoding, and wrap
# each result in a Sentence object.
sentence_objects = process_sentences_with_positional_encoding_updated(sentences)

# Report how many Sentence objects were produced.
print(f"Processed {len(sentence_objects)} sentences.")

Processed 2443 sentences.


In [36]:
# Show text, embedding and id of the first four Sentence objects; each record
# is followed by blank lines.
for i in range(4):
    print(
        f"Sentence : {sentence_objects[i].get_text()}",
        f"Embedding : {sentence_objects[i].get_embedding()}",
        f"ID : {sentence_objects[i].get_id()}",
        "\n\n",
        sep="\n",
    )

Sentence : ANNUAL REPORT 202 0/21
Embedding : tensor([-3.3467e-01,  5.4390e-01,  1.6948e+00,  8.8355e-01,  7.4920e-01,
         1.9898e-01, -2.8941e-01,  1.5197e+00, -1.0282e+00,  8.2999e-01,
        -2.3570e-01,  6.8486e-01,  3.8455e-01,  1.1878e+00,  6.8374e-01,
         1.2116e+00, -2.8753e-01,  1.4464e+00,  3.6412e-01,  4.1308e-01,
        -3.1625e-01,  1.7244e+00,  1.9737e-01,  2.6347e-01, -7.4439e-01,
         3.3599e-01,  3.6404e-01, -9.6764e-01, -5.2974e-01,  1.2720e+00,
        -4.6453e-01,  7.7728e-01,  1.4525e-01,  1.3351e+00,  1.0061e+00,
         1.2103e+00, -4.3093e-01,  1.2309e+00,  2.3743e-02,  3.1738e-01,
         1.0152e+00,  3.0563e-01,  2.4389e-01,  6.3767e-01, -9.6686e-01,
         1.9155e-01,  1.6582e-01,  2.2959e-01,  1.8157e-01, -7.2331e-01,
        -7.0406e-01,  1.5096e+00, -4.9888e-02,  1.1706e+00, -1.0758e+00,
         1.5432e+00,  7.7650e-01,  8.1784e-01, -1.0909e-01,  1.7212e+00,
         8.5285e-01,  5.2219e-01, -7.4162e-01,  2.0615e+00, -2.5066e-01,
     