In [None]:
# 1. Environment Setup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from transformers import BertTokenizer, BertModel
from transformers import logging as transformers_logging
transformers_logging.set_verbosity_error()  # Suppress warnings from transformers

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.nn import Parameter
import pandas as pd
import numpy as np
import string
from collections import Counter
from tqdm import tqdm

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

The TextPreprocessor class is crafted to transform raw textual data into a format that is both clean and compatible with advanced NLP models like BERT.

Each method within the class serves a distinct purpose, collectively ensuring that the data is well-prepared for effective model training, validation, and testing.

In [None]:
# 2. Data Preprocessing
class TextPreprocessor:
    """
    Prepares a labelled text DataFrame for a BERT-based classifier.

    The pipeline cleans the text, maps string labels to integer ids, makes a
    stratified train/validation/test split, and tokenizes each split.

    Attributes:
        tokenizer (BertTokenizer): Tokenizer loaded from `pretrained_model_name`.
        max_len (int): Length every sequence is padded/truncated to.
        label_mapping (dict): String label -> integer class id.
    """

    def __init__(self, pretrained_model_name='bert-base-uncased', max_len=50):
        """
        Initializes the TextPreprocessor with BERT's tokenizer.

        Args:
            pretrained_model_name (str): Name of the pre-trained BERT model.
            max_len (int): Maximum length for padding/truncating sequences.
        """
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.max_len = max_len
        self.label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}

    def clean_text(self, text):
        """
        Cleans the input text by removing HTML tags and punctuation.

        Args:
            text (str): The raw text data to clean.

        Returns:
            str: The lower-cased text without HTML markup or ASCII punctuation.
        """
        text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
        text = text.lower()  # Convert to lowercase
        text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
        return text

    @staticmethod
    def _check_ratios(train_ratio, val_ratio, test_ratio, tol=1e-8):
        """Asserts that the three split ratios sum to 1 within a float tolerance."""
        # Exact `== 1.0` rejects valid inputs such as 0.7 + 0.2 + 0.1, which
        # evaluates to 0.9999999999999999 in binary floating point.
        total = train_ratio + val_ratio + test_ratio
        assert abs(total - 1.0) <= tol, "Ratios must sum to 1."

    def split_data(self, df, text_column, label_column, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2):
        """
        Splits the dataset into stratified train, validation, and test sets.

        Args:
            df (pd.DataFrame): The input DataFrame with text and labels.
            text_column (str): The name of the column containing text data
                (accepted for interface symmetry; not used by the split itself).
            label_column (str): The name of the column containing label data;
                both splits are stratified on it.
            train_ratio (float): Proportion of the dataset for training.
            val_ratio (float): Proportion of the dataset for validation.
            test_ratio (float): Proportion of the dataset for testing.

        Returns:
            tuple: Train, validation, and test DataFrames.

        Raises:
            AssertionError: If the ratios do not sum to 1 (within tolerance).
        """
        self._check_ratios(train_ratio, val_ratio, test_ratio)

        # First carve off the held-out portion (validation + test) ...
        train_data, temp_data = train_test_split(
            df, test_size=val_ratio + test_ratio, random_state=42, stratify=df[label_column]
        )
        # ... then divide the held-out portion between validation and test.
        val_data, test_data = train_test_split(
            temp_data, test_size=test_ratio / (val_ratio + test_ratio), random_state=42,
            stratify=temp_data[label_column]
        )
        return train_data, val_data, test_data

    def encode_labels(self, df, label_column):
        """
        Encodes string labels to numeric values using `self.label_mapping`.

        The label column of `df` is overwritten in place and `df` is returned.
        Nothing is modified when validation fails.

        Args:
            df (pd.DataFrame): Input DataFrame with string labels.
            label_column (str): Column containing the labels.

        Returns:
            pd.DataFrame: The same DataFrame with integer labels.

        Raises:
            ValueError: If any label (including a missing value or an
                already-encoded number) is not a key of `self.label_mapping`.
        """
        mapped = df[label_column].map(self.label_mapping)
        unknown = df.loc[mapped.isna(), label_column].unique()
        if len(unknown) > 0:
            # Without this check unmapped labels silently become NaN and only
            # fail much later (float label tensors, broken stratification).
            raise ValueError(
                f"Column '{label_column}' contains labels missing from label_mapping: "
                f"{list(unknown[:5])}"
            )
        df[label_column] = mapped.astype('int64')
        return df

    def tokenize_and_encode(self, texts):
        """
        Tokenizes and encodes the texts using BERT's tokenizer.

        Args:
            texts (list of str): List of text samples.

        Returns:
            dict: Tokenizer output containing `input_ids` and `attention_mask`
                tensors padded/truncated to `self.max_len`.
        """
        return self.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

    def _encode_split(self, frame, text_column, label_column):
        """Tokenizes one split and returns (input_ids, attention_mask, labels)."""
        encodings = self.tokenize_and_encode(frame[text_column].tolist())
        labels = torch.tensor(frame[label_column].values)
        return encodings['input_ids'], encodings['attention_mask'], labels

    def preprocess_dataset(self, df, text_column, label_column):
        """
        Cleans text data, encodes labels, splits the data, and tokenizes each split.

        The caller's DataFrame is left untouched, so the call can be repeated
        (e.g. when a notebook cell is re-run) with the same result.

        Args:
            df (pd.DataFrame): Input DataFrame with text and labels.
            text_column (str): Column with text data.
            label_column (str): Column with labels.

        Returns:
            tuple: Nine tensors in the order
                (train_input_ids, train_attention_mask, train_labels,
                 val_input_ids, val_attention_mask, val_labels,
                 test_input_ids, test_attention_mask, test_labels).

        Raises:
            ValueError: If the label column contains values not in `label_mapping`.
        """
        # Work on a copy: cleaning and label encoding below overwrite columns.
        df = df.copy()

        print("Cleaning text data...")
        df[text_column] = df[text_column].apply(self.clean_text)

        print("Encoding labels...")
        df = self.encode_labels(df, label_column)

        print("Splitting dataset into train, validation, and test sets...")
        train_data, val_data, test_data = self.split_data(df, text_column, label_column)

        print("Tokenizing and encoding text data...")
        train_tensors = self._encode_split(train_data, text_column, label_column)
        val_tensors = self._encode_split(val_data, text_column, label_column)
        test_tensors = self._encode_split(test_data, text_column, label_column)

        print(f"Train set: {len(train_data)} samples")
        print(f"Validation set: {len(val_data)} samples")
        print(f"Test set: {len(test_data)} samples")

        return train_tensors + val_tensors + test_tensors

**How To Call the method**

preprocessor = TextPreprocessor(max_len=50)
(train_ids, train_mask, train_labels,
 val_ids, val_mask, val_labels,
 test_ids, test_mask, test_labels) = preprocessor.preprocess_dataset(
    org_data, text_column='Sentence', label_column='Sentiment'
)

print(f"\nVocabulary size: {preprocessor.tokenizer.vocab_size}")
print(f"First 5 training sequences: {train_ids[:5]}")
print(f"First 5 training labels: {train_labels[:5]}")


In [None]:
# 3. Quaternion Classes

class Quaternion:
    """
    A class to represent and operate on quaternions.
    Supports initialization, Hamilton product, addition, normalization, and conjugation.

    The four components are stored as equal-width slices (r, i, j, k) of the
    last dimension of a single tensor.
    """

    def __init__(self, tensor):
        """
        Initialize with a tensor that will be split into quaternion components.
        The tensor's last dimension is assumed to represent concatenated quaternion components
        (r, i, j, k), with its size divisible by 4.

        Args:
            tensor (torch.Tensor): A tensor with quaternion components concatenated along the last dimension.

        Raises:
            ValueError: If the last dimension is empty or not divisible by 4.
        """
        width = tensor.shape[-1]
        if width == 0 or width % 4 != 0:
            # Fail with a clear message instead of an opaque unpack/split error.
            raise ValueError(
                f"Last dimension must be a positive multiple of 4 to hold (r, i, j, k), got {width}."
            )
        self.tensor = tensor
        self.r, self.i, self.j, self.k = torch.split(self.tensor, width // 4, dim=-1)

    @classmethod
    def from_components(cls, r, i, j, k):
        """
        Create a Quaternion object from separate r, i, j, k component tensors.

        Args:
            r (torch.Tensor): Real component.
            i (torch.Tensor): First imaginary component.
            j (torch.Tensor): Second imaginary component.
            k (torch.Tensor): Third imaginary component.

        Returns:
            Quaternion: A quaternion object created from the input components.
        """
        tensor = torch.cat([r, i, j, k], dim=-1)
        return cls(tensor)

    def _hamilton_components(self, other):
        """Returns the (r, i, j, k) component tensors of the Hamilton product self * other."""
        r2, i2, j2, k2 = other.r, other.i, other.j, other.k

        # Two 2D operands whose inner dimensions line up are treated as a
        # matrix product (input components @ weight components); everything
        # else is multiplied element-wise. Note that two square 2D operands of
        # equal size therefore take the matrix path.
        use_matmul = self.r.dim() == 2 and r2.dim() == 2 and self.r.shape[-1] == r2.shape[-2]
        mul = torch.matmul if use_matmul else torch.mul

        r = mul(self.r, r2) - mul(self.i, i2) - mul(self.j, j2) - mul(self.k, k2)
        i = mul(self.r, i2) + mul(self.i, r2) + mul(self.j, k2) - mul(self.k, j2)
        j = mul(self.r, j2) - mul(self.i, k2) + mul(self.j, r2) + mul(self.k, i2)
        k = mul(self.r, k2) + mul(self.i, j2) - mul(self.j, i2) + mul(self.k, r2)
        return r, i, j, k

    def hamilton_product(self, other):
        """
        Perform Hamilton product (quaternion multiplication) between this quaternion
        and another quaternion using either element-wise or matrix multiplication.

        Args:
            other (Quaternion): Another quaternion.

        Returns:
            torch.Tensor: A tensor representing the resulting quaternion components (r, i, j, k).
                As with `as_tensor`, a 2D result gains a leading dimension of size 1.
        """
        return self.hamilton_product_quaternion(other).as_tensor()

    def hamilton_product_quaternion(self, other):
        """
        Perform Hamilton product and return the result as a Quaternion object.

        Args:
            other (Quaternion): Another quaternion.

        Returns:
            Quaternion: A new quaternion representing the product.
        """
        return Quaternion(torch.cat(self._hamilton_components(other), dim=-1))

    def add(self, other):
        """
        Perform component-wise addition between two quaternions.

        Args:
            other (Quaternion): Another quaternion.

        Returns:
            Quaternion: A quaternion representing the sum.
        """
        r = self.r + other.r
        i = self.i + other.i
        j = self.j + other.j
        k = self.k + other.k

        return Quaternion(torch.cat([r, i, j, k], dim=-1))

    def normalize(self):
        """
        Normalize the quaternion to have unit norm.

        A zero quaternion is returned unchanged (all zeros) instead of NaN.

        Returns:
            Quaternion: A normalized quaternion.
        """
        norm = torch.sqrt(self.r ** 2 + self.i ** 2 + self.j ** 2 + self.k ** 2)
        # Clamp to the smallest positive normal number of the dtype so that a
        # zero norm yields 0 / tiny = 0 rather than 0 / 0 = NaN.
        norm = norm.clamp_min(torch.finfo(norm.dtype).tiny)
        return Quaternion(torch.cat([self.r / norm, self.i / norm, self.j / norm, self.k / norm], dim=-1))

    def conjugate(self):
        """
        Return the conjugate of the quaternion: (r, -i, -j, -k).

        Returns:
            Quaternion: The conjugated quaternion.
        """
        return Quaternion(torch.cat([self.r, -self.i, -self.j, -self.k], dim=-1))

    def as_tensor(self):
        """
        Return the quaternion as a single concatenated tensor (r, i, j, k components).
        If the tensor is 2D, unsqueeze it to add a batch dimension of 1.

        Returns:
            torch.Tensor: The concatenated quaternion tensor.
        """
        tensor = torch.cat([self.r, self.i, self.j, self.k], dim=-1)
        if tensor.ndim == 2:
            tensor = tensor.unsqueeze(0)
        return tensor


class QuaternionTransformation(nn.Module):
    """
    A PyTorch module for applying a quaternion transformation to input data using
    learnable quaternion weights. Supports optional activation functions.

    The input is read as four equal-width component slices (r, i, j, k) and
    multiplied with the quaternion weight via the Hamilton product.
    """

    def __init__(self, input_dim, output_dim, activation=None, init=None):
        """
        Initialize the QuaternionTransformation module.

        Args:
            input_dim (int): Dimensionality of the input tensor (must be divisible by 4).
            output_dim (int): Width of each output component; the output tensor
                has `output_dim * 4` features.
            activation (callable, optional): Activation function to apply after transformation.
            init (callable, optional): Initialization function for quaternion weights. It receives
                each weight tensor and must return the initialized tensor. Defaults to Xavier uniform.

        Raises:
            ValueError: If `input_dim` is not a positive multiple of 4.
        """
        super().__init__()

        if input_dim <= 0 or input_dim % 4 != 0:
            # Floor division below would otherwise silently drop features.
            raise ValueError(f"input_dim must be a positive multiple of 4, got {input_dim}.")

        # Quaternion input dimension is divided by 4 (for r, i, j, k components)
        self.input_dim = input_dim // 4
        self.output_dim = output_dim
        self.activation = activation

        # Learnable quaternion weight matrices for r, i, j, k components
        weight_shape = (self.input_dim, self.output_dim)
        self.r_weight = nn.Parameter(torch.empty(weight_shape))
        self.i_weight = nn.Parameter(torch.empty(weight_shape))
        self.j_weight = nn.Parameter(torch.empty(weight_shape))
        self.k_weight = nn.Parameter(torch.empty(weight_shape))

        # Initialize the weights (same r, i, j, k order for reproducible seeding)
        for weight in (self.r_weight, self.i_weight, self.j_weight, self.k_weight):
            if init is None:
                nn.init.xavier_uniform_(weight)
            else:
                # Apply custom initialization if provided
                weight.data = init(weight.data)

    def forward(self, x):
        """
        Perform the quaternion transformation on the input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, seq_len, input_dim] (3D)
                              or [batch_size, input_dim] (2D). The last dimension must
                              be divisible by 4 for quaternion processing.

        Returns:
            torch.Tensor: Output tensor of shape [batch_size, seq_len, output_dim * 4] (for 3D input)
                          or [batch_size, output_dim * 4] (for 2D input), representing
                          the transformed quaternion components (r, i, j, k).
        """
        is_3d_input = x.ndim == 3  # Check if the input is 3D

        if is_3d_input:
            # Flatten 3D input to 2D for processing; reshape also copes with
            # non-contiguous inputs that `view` would reject.
            batch_size, seq_len, input_dim = x.shape
            x = x.reshape(batch_size * seq_len, input_dim)

        # Convert input tensor into quaternion components (r, i, j, k)
        q_x = Quaternion(x)

        # Create a quaternion from the learnable weights
        q_kernel = Quaternion.from_components(self.r_weight, self.i_weight, self.j_weight, self.k_weight)

        # Hamilton product. The raw `.tensor` is used rather than
        # `hamilton_product(...)`, which prepends a size-1 dimension to 2D
        # results and would make 2D inputs come back as 3D.
        output = q_x.hamilton_product_quaternion(q_kernel).tensor

        # Apply activation function if provided
        if self.activation is not None:
            output = self.activation(output)

        if is_3d_input:
            # Reshape back to 3D if input was originally 3D
            return output.reshape(batch_size, seq_len, self.output_dim * 4)

        return output

class QuaternionSelfAttention(nn.Module):
    """
    Single-head self-attention over quaternion-valued features.

    Queries, keys and values are produced by quaternion transformations; the
    attention scores are the four components of a Hamilton-product style
    interaction between queries and keys, each normalized with its own softmax.

    Attributes:
        input_dim (int): Stored constructor argument. NOTE: the Q/K/V layers
            are built from `output_dim`, not from this value, so as written
            the last dimension of the input must equal `output_dim`.
        output_dim (int): Feature size of the output (divisible by 4).
        dropout_rate (float): Dropout probability applied to the attention weights.
        dk (int): `output_dim // 4`; its square root scales the scores.
        q_transform (QuaternionTransformation): Projection producing queries.
        k_transform (QuaternionTransformation): Projection producing keys.
        v_transform (QuaternionTransformation): Projection producing values.
    """
    def __init__(self, input_dim, output_dim, dropout_rate=0.0):
        """
        Builds the three quaternion projections.

        Args:
            input_dim (int): The dimension of the input tensor (stored only).
            output_dim (int): The dimension of the output tensor; must be divisible by 4.
            dropout_rate (float): Dropout rate applied to attention weights. Default is 0.0.
        """
        super().__init__()

        assert output_dim % 4 == 0, "output_dim must be divisible by 4 for quaternion representation."

        self.output_dim = output_dim
        self.input_dim = input_dim
        self.dk = output_dim // 4
        self.dropout_rate = dropout_rate

        # Each projection maps output_dim features to 4 * (output_dim // 4) features.
        component_width = output_dim // 4
        self.q_transform = QuaternionTransformation(output_dim, component_width)
        self.k_transform = QuaternionTransformation(output_dim, component_width)
        self.v_transform = QuaternionTransformation(output_dim, component_width)

    def quaternion_attention(self, a, b):
        """
        Computes the quaternion-valued score matrices between two sequences.

        Args:
            a (torch.Tensor): First tensor, [batch_size, seq_len_a, dim], dim divisible by 4.
            b (torch.Tensor): Second tensor, [batch_size, seq_len_b, dim], dim divisible by 4.

        Returns:
            Quaternion: Holds one score matrix per component (r, i, j, k).

        Note:
            Components follow the Hamilton product layout:
            (rr' - xx' - yy' - zz')  +
            (rx' + xr' + yz' - zy')i +
            (ry' - xz' + yr' + zx')j +
            (rz' + xy' - yx' + zr')k
        """
        for operand in (a, b):
            assert operand.size(-1) % 4 == 0, "Last dimension of input tensor must be divisible by 4 for quaternion representation."

        ar, ai, aj, ak = torch.chunk(a, 4, dim=-1)
        # Transpose the key components once; every term below multiplies by one of them.
        br_t, bi_t, bj_t, bk_t = (part.transpose(-1, -2) for part in torch.chunk(b, 4, dim=-1))

        score_r = torch.matmul(ar, br_t) - torch.matmul(ai, bi_t) - torch.matmul(aj, bj_t) - torch.matmul(ak, bk_t)
        score_i = torch.matmul(ar, bi_t) + torch.matmul(ai, br_t) + torch.matmul(aj, bk_t) - torch.matmul(ak, bj_t)
        score_j = torch.matmul(ar, bj_t) - torch.matmul(ai, bk_t) + torch.matmul(aj, br_t) + torch.matmul(ak, bi_t)
        score_k = torch.matmul(ar, bk_t) + torch.matmul(ai, bj_t) - torch.matmul(aj, bi_t) + torch.matmul(ak, br_t)

        return Quaternion.from_components(score_r, score_i, score_j, score_k)

    def forward(self, X):
        """
        Applies quaternion self-attention to `X`.

        Args:
            X (torch.Tensor): Input tensor of shape [batch_size, seq_len, input_dim].

        Returns:
            torch.Tensor: Attended features of shape [batch_size, seq_len, output_dim].
        """
        queries = self.q_transform(X)
        keys = self.k_transform(X)
        values = self.v_transform(X)

        value_parts = torch.chunk(values, 4, dim=-1)
        scores = self.quaternion_attention(queries, keys)
        score_parts = (scores.r, scores.i, scores.j, scores.k)

        # The same scaling factor is shared by all four components.
        scale = torch.sqrt(torch.tensor(self.dk, dtype=torch.float32))

        # Softmax per component, then dropout per component (r, i, j, k order).
        weights = [F.softmax(part / scale, dim=-1) for part in score_parts]
        weights = [F.dropout(w, p=self.dropout_rate, training=self.training) for w in weights]

        # Each component's weights attend over the matching value component.
        attended = [w @ v for w, v in zip(weights, value_parts)]
        attention_output = torch.cat(attended, dim=-1)

        if X.ndim == 3:
            batch_size, seq_len = X.shape[0], X.shape[1]
            attention_output = attention_output.view(batch_size, seq_len, -1)
        return attention_output

class MultiHeadQuaternionSelfAttention(nn.Module):
    """
    Multi-head wrapper around `QuaternionSelfAttention`.

    The input features are cut into `num_heads` equal slices, each slice is
    processed by its own attention head, and the concatenated head outputs are
    mixed by a final quaternion transformation.

    Attributes:
        input_dim (int): The dimension of the input tensor.
        output_dim (int): The total dimension of the output tensor.
        num_heads (int): The number of attention heads.
        output_dim_per_head (int): `output_dim // num_heads`.
        dropout_rate (float): The dropout rate applied to attention weights.
    """
    def __init__(self, input_dim, output_dim, num_heads=4, dropout_rate=0.0):
        """
        Creates the attention heads and the output mixing layer.

        Args:
            input_dim (int): The dimension of the input tensor.
            output_dim (int): The total dimension of the output tensor. Must be divisible by 4 * num_heads.
            num_heads (int): The number of attention heads. Default is 4.
            dropout_rate (float): Dropout rate applied to attention weights. Default is 0.0.
        """
        super().__init__()

        self.output_dim = output_dim
        self.num_heads = num_heads
        self.input_dim = input_dim

        # Every head must itself be splittable into four quaternion components.
        assert self.output_dim % (4 * num_heads) == 0, "output_dim must be divisible by 4 * num_heads."

        self.output_dim_per_head = self.output_dim // num_heads
        self.dropout_rate = dropout_rate

        heads = []
        for _ in range(num_heads):
            heads.append(
                QuaternionSelfAttention(input_dim, self.output_dim_per_head, dropout_rate=self.dropout_rate)
            )
        self.attention_heads = nn.ModuleList(heads)

        # Mixes the concatenated head outputs back into output_dim features.
        concat_dim = self.output_dim_per_head * num_heads
        self.output_transform = QuaternionTransformation(concat_dim, output_dim // 4)

    def forward(self, X):
        """
        Runs every head on its slice of the features and merges the results.

        Args:
            X (torch.Tensor): Input tensor of shape [batch_size, seq_len, input_dim].

        Returns:
            torch.Tensor: Tensor of shape [batch_size, seq_len, output_dim].
        """
        # Unpacking doubles as a check that X is three-dimensional.
        batch_size, seq_len, input_dim = X.shape

        # One feature slice per head, taken along the last dimension.
        slices = torch.chunk(X, self.num_heads, dim=-1)

        per_head = [
            self.attention_heads[idx](slices[idx])
            for idx in range(len(self.attention_heads))
        ]

        merged = torch.cat(per_head, dim=-1)
        return self.output_transform(merged)


I don't implement the full quaternion transformer here because, as we have noticed, incorporating quaternion structures **will significantly increase computing time.**

To implement the full version, go to `TransformerBlockQuaternions` and replace the `nn.Linear` layers with quaternion transformations (`QuaternionTransformation`).

In [None]:
# 4. Model Definition

class LearnablePositionalEncoding(nn.Module):
    """
    Trainable positional table that is added to token embeddings.

    The table has shape [1, max_len, dmodel] and is drawn from N(0, 0.1^2).
    Dropout is applied to the sum of embeddings and positions.
    """
    def __init__(self, max_len, dmodel, dropout, padding_idx=None):
        """
        Args:
            max_len (int): Number of positions stored in the table.
            dmodel (int): Embedding size.
            dropout (float): Dropout probability applied in `forward`.
            padding_idx (int, optional): Index along the position axis whose
                row is set to zero at initialization (the row stays trainable).
        """
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        table = torch.zeros(1, max_len, dmodel)
        table.normal_(mean=0, std=0.1)
        if padding_idx is not None:
            table[:, padding_idx, :] = 0.0
        self.pos_encoding = nn.Parameter(table)

    def forward(self, embedd):
        """Adds the first `embedd.size(1)` position rows to `embedd`, then applies dropout."""
        positions = self.pos_encoding[:, :embedd.size(1), :]
        return self.dropout(embedd + positions)

class TransformerBlockQuaternions(nn.Module):
    """
    Post-norm Transformer block with multi-head quaternion self-attention.

    Only the attention sub-layer is quaternion-based; the feed-forward
    sub-layer is a regular two-layer `nn.Linear` network. Both sub-layers are
    wrapped in dropout, a residual connection, and layer normalization.
    """
    def __init__(self, dmodel, ffnn_hidden_size, num_heads, dropout):
        """
        Args:
            dmodel (int): Dimensionality of the model (must be divisible by 4 * num_heads).
            ffnn_hidden_size (int): Hidden size of the feed-forward network.
            num_heads (int): Number of quaternion attention heads.
            dropout (float): Dropout rate.
        """
        super().__init__()
        self.attention = MultiHeadQuaternionSelfAttention(
            input_dim=dmodel,
            output_dim=dmodel,
            num_heads=num_heads,
            dropout_rate=dropout,
        )
        self.layer_norm1 = nn.LayerNorm(dmodel)

        expand = nn.Linear(dmodel, ffnn_hidden_size)
        contract = nn.Linear(ffnn_hidden_size, dmodel)
        self.ffnn = nn.Sequential(expand, nn.ReLU(), nn.Dropout(dropout), contract)

        self.layer_norm2 = nn.LayerNorm(dmodel)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Applies attention and feed-forward sub-layers to `x`.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, seq_len, dmodel].

        Returns:
            torch.Tensor: Output tensor of shape [batch_size, seq_len, dmodel].
        """
        # Sub-layer 1: quaternion self-attention + residual + norm.
        attended = self.dropout(self.attention(x))
        x = self.layer_norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward + residual + norm.
        transformed = self.dropout(self.ffnn(x))
        return self.layer_norm2(x + transformed)

class QuaternionsTransformerModel(nn.Module):
    """
    A Quaternions-based Transformer model that stacks quaternion self-attention
    blocks on top of BERT's last hidden state and classifies the pooled result.
    """
    def __init__(self, pretrained_model_name='bert-base-uncased', output_size=3,
                 n_layers=6, ffnn_hidden_size=None, num_heads=4, dropout=0.1, pooling='max'):
        """
        Initializes the QuaternionsTransformerModel.

        Args:
            pretrained_model_name (str): Name of the pre-trained BERT model.
            output_size (int): Number of output classes.
            n_layers (int): Number of Transformer blocks.
            ffnn_hidden_size (int, optional): Hidden size for the Feed-Forward Network. Defaults to dmodel * 4.
            num_heads (int): Number of quaternion attention heads.
            dropout (float): Dropout rate.
            pooling (str): 'max' selects max pooling over the sequence; any
                other value selects (mask-aware) average pooling.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout)
        self.pooling = pooling

        dmodel = self.bert.config.hidden_size  # Typically 768 for bert-base

        if ffnn_hidden_size is None:
            ffnn_hidden_size = dmodel * 4

        # Initialize Positional Encoding
        self.pos_encoding = LearnablePositionalEncoding(max_len=512, dmodel=dmodel, dropout=dropout, padding_idx=0)

        # Initialize Transformer Blocks with Quaternion Self-Attention
        self.blocks = nn.ModuleList([
            TransformerBlockQuaternions(dmodel=dmodel, ffnn_hidden_size=ffnn_hidden_size, num_heads=num_heads, dropout=dropout)
            for _ in range(n_layers)
        ])

        # Initialize Layer Normalization
        self.layer_norm = nn.LayerNorm(dmodel)

        # Final Classification Layer
        self.fc = nn.Linear(dmodel, output_size)

    @staticmethod
    def _masked_mean_pool(x, attention_mask):
        """
        Averages `x` over the sequence axis, counting only unmasked positions.

        Args:
            x (torch.Tensor): Features of shape [batch_size, seq_len, hidden_size].
            attention_mask (torch.Tensor): [batch_size, seq_len], non-zero for real tokens.

        Returns:
            torch.Tensor: [batch_size, hidden_size]; rows with no real token are all zeros.
        """
        mask = attention_mask.unsqueeze(-1).to(x.dtype)  # [batch_size, seq_len, 1]
        # Zero out padded positions so numerator and denominator cover the same tokens.
        summed = (x * mask).sum(dim=1)
        # Guard against division by zero for fully padded rows.
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, input_ids, attention_mask):
        """
        Forward pass of the QuaternionsTransformerModel.

        Args:
            input_ids (torch.Tensor): Input IDs from BERT tokenizer.
            attention_mask (torch.Tensor): Attention mask from BERT tokenizer.

        Returns:
            torch.Tensor: Logits for each class, shape [batch_size, output_size].
        """
        # Get BERT embeddings
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]

        # Apply Positional Encoding (includes its own dropout), then dropout
        x = self.pos_encoding(x)
        x = self.dropout(x)

        # Apply Transformer Blocks; shape remains [batch_size, seq_len, hidden_size]
        for block in self.blocks:
            x = block(x)

        # Apply Layer Normalization
        x = self.layer_norm(x)

        # Pooling
        if self.pooling == 'max':
            # NOTE(review): max pooling still considers padded positions.
            x = F.adaptive_max_pool1d(x.permute(0, 2, 1), 1).view(x.size(0), -1)
        else:
            # Average over real tokens only; the previous version summed the
            # padded positions too while dividing by the real-token count.
            x = self._masked_mean_pool(x, attention_mask)

        # Final Classification Layer
        logits = self.fc(x)
        return logits


In [None]:
# 5. Data Loading

def create_dataloader(input_ids, attention_mask, labels, batch_size, shuffle=True):
    """
    Creates a PyTorch DataLoader from input IDs, attention masks, and labels.

    The three tensors are zipped into a ``TensorDataset``, so each batch is a
    ``(input_ids, attention_mask, labels)`` triple.

    Args:
        input_ids (torch.Tensor): Tensor of input IDs.
        attention_mask (torch.Tensor): Tensor of attention masks.
        labels (torch.Tensor): Tensor of labels.
        batch_size (int): The batch size for the DataLoader.
        shuffle (bool): Whether to reshuffle the samples every epoch. Defaults
            to True (the previous hard-coded behaviour); pass False for
            validation/test loaders to get a deterministic sample order.

    Returns:
        DataLoader: A DataLoader for the given data.
    """
    dataset = TensorDataset(input_ids, attention_mask, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
#  6. Training & Evaluation

def train_epoch(model, dataloader, criterion, optimizer, device):
    """
    Runs a single optimisation pass over the training data.

    Args:
        model (nn.Module): The model to train.
        dataloader (DataLoader): Yields (input_ids, attention_mask, labels) batches.
        criterion (nn.Module): Loss function applied to (model output, labels).
        optimizer (torch.optim.Optimizer): Optimizer stepped once per batch.
        device (torch.device): Device the batch tensors are moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    progress = tqdm(dataloader, desc="Training", leave=False)
    for ids, mask, targets in progress:
        # Move the whole batch to the target device before touching the model.
        ids, mask, targets = ids.to(device), mask.to(device), targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

def eval_model(model, dataloader, criterion, device):
    """
    Evaluates the model on a validation or test set.

    Args:
        model (nn.Module): The model to evaluate.
        dataloader (DataLoader): DataLoader for validation/test data, yielding
            (input_ids, attention_mask, labels) batches.
        criterion (nn.Module): Loss function.
        device (torch.device): Device to run the evaluation on.

    Returns:
        tuple: ``(avg_loss, accuracy, all_preds, all_labels)`` where
            ``avg_loss`` is the mean of the per-batch losses (float),
            ``accuracy`` is the fraction of correctly classified samples (float),
            ``all_preds`` is the list of predicted class indices, and
            ``all_labels`` is the list of true class indices, both in the
            order the samples were seen.

    Raises:
        ValueError: If the dataloader contains no batches.
    """
    if len(dataloader) == 0:
        raise ValueError("eval_model received an empty dataloader; nothing to evaluate.")

    model.eval()
    total_loss = 0.0
    # Plain Python counters: avoids mixing int and tensor accumulators.
    correct_preds = 0
    total_preds = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask)
            total_loss += criterion(outputs, labels).item()

            # Predicted class = index of the largest logit.
            preds = outputs.argmax(dim=1)
            correct_preds += (preds == labels).sum().item()
            total_preds += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    # Guard against batches that contain zero samples.
    accuracy = correct_preds / total_preds if total_preds else 0.0
    return avg_loss, accuracy, all_preds, all_labels


In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [None]:
# 7. Main Workflow

import time

# Replace 'your_dataset.csv' with your actual dataset file path.
# The dataset should have at least two columns: 'Sentence' and 'Sentiment'.
# 'Sentiment' should contain labels like 'neutral', 'positive', 'negative'.

# Example:
# org_data = pd.read_csv('your_dataset.csv')

# Alternative data source (parquet file with integer labels), kept for reference:
# org_data= pd.read_parquet('train-00000-of-00001.parquet', engine='pyarrow')
# mapping = {0: 'neutral', 1: 'positive', 2: 'negative'}
# org_data.rename(columns={'label': 'Sentiment', 'text': 'Sentence'}, inplace=True)
# org_data['Sentiment'] = org_data['Sentiment'].map(mapping)

# Seed the global RNGs so that weight initialisation, dropout and batch
# shuffling are repeatable across Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Single source of truth for the checkpoint written during training and
# re-loaded before the test-set evaluation.
CHECKPOINT_PATH = 'best_quaternions_transformer.pt'

# import data
org_data = pd.read_csv('data.csv')

# Initialize the TextPreprocessor and build the train/validation/test tensors
preprocessor = TextPreprocessor(max_len=50)
(train_input_ids, train_attention_mask, train_labels,
  val_input_ids, val_attention_mask, val_labels,
  test_input_ids, test_attention_mask, test_labels) = preprocessor.preprocess_dataset(
    org_data, text_column='Sentence', label_column='Sentiment'
)

# Create DataLoader
batch_size = 64  # Adjust based on your hardware capabilities

train_loader = create_dataloader(train_input_ids, train_attention_mask, train_labels, batch_size)
val_loader = create_dataloader(val_input_ids, val_attention_mask, val_labels, batch_size)
test_loader = create_dataloader(test_input_ids, test_attention_mask, test_labels, batch_size)

# Initialize Model, Loss, Optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model parameters
pretrained_model_name = 'bert-base-uncased'
output_size = 3  # Number of classes: neutral, positive, negative
n_layers = 2     # Number of Transformer blocks; increase for better performance
ffnn_hidden_size = None  # If None, defaults to dmodel * 4
num_heads = 4    # Number of attention heads
dropout = 0.1
pooling = 'max'  # 'max' or 'avg'

# Initialize the model
model = QuaternionsTransformerModel(
    pretrained_model_name=pretrained_model_name,
    output_size=output_size,
    n_layers=n_layers,
    ffnn_hidden_size=ffnn_hidden_size,
    num_heads=num_heads,
    dropout=dropout,
    pooling=pooling
)
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# ----------------------------
# Training Loop: Warning: Quaternion operations will significantly increase operating times

num_epochs = 10
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; otherwise the load below could fail or pick up a stale file
# left behind by an earlier run.
best_val_accuracy = float('-inf')

start_time = time.time()
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_accuracy, val_preds, val_labels_true = eval_model(model, val_loader, criterion, device)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")

    # Save the model if validation accuracy improves
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        print("Model saved.")
end_time = time.time()
print(f"Total training time: {end_time - start_time:.2f} seconds")
# ----------------------------
# Evaluation on Test Set

# Load the best model.
# map_location keeps the load working when the checkpoint was written on a
# different device; weights_only=True restricts unpickling to tensor data
# (which is all a state_dict contains) and silences torch's FutureWarning.
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device, weights_only=True))

test_loss, test_accuracy, test_preds, test_labels_true = eval_model(model, test_loader, criterion, device)
print(f"\nTest Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(test_labels_true, test_preds, target_names=['neutral', 'positive', 'negative']))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(test_labels_true, test_preds))

# Number of trainable parameters in the model
print(count_parameters(model))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Cleaning text data...
Encoding labels...
Splitting dataset into train, validation, and test sets...
Tokenizing and encoding text data...
Train set: 3505 samples
Validation set: 1168 samples
Test set: 1169 samples
Using device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Epoch 1/10




Train Loss: 0.9614
Validation Loss: 0.9904 | Validation Accuracy: 0.5993
Model saved.

Epoch 2/10




Train Loss: 0.6961
Validation Loss: 0.7212 | Validation Accuracy: 0.7080
Model saved.

Epoch 3/10




Train Loss: 0.4886
Validation Loss: 0.5289 | Validation Accuracy: 0.7603
Model saved.

Epoch 4/10




Train Loss: 0.3778
Validation Loss: 0.6074 | Validation Accuracy: 0.7783
Model saved.

Epoch 5/10




Train Loss: 0.2939
Validation Loss: 0.6313 | Validation Accuracy: 0.7765

Epoch 6/10




Train Loss: 0.2491
Validation Loss: 0.6080 | Validation Accuracy: 0.7851
Model saved.

Epoch 7/10




Train Loss: 0.1941
Validation Loss: 0.7208 | Validation Accuracy: 0.7954
Model saved.

Epoch 8/10




Train Loss: 0.1723
Validation Loss: 0.8705 | Validation Accuracy: 0.7731

Epoch 9/10




Train Loss: 0.1592
Validation Loss: 0.7122 | Validation Accuracy: 0.7688

Epoch 10/10


  model.load_state_dict(torch.load('best_quaternions_transformer.pt'))


Train Loss: 0.1680
Validation Loss: 0.7441 | Validation Accuracy: 0.7808
Total training time: 376.34 seconds


                                                           


Test Loss: 0.7829 | Test Accuracy: 0.7887

Classification Report:
              precision    recall  f1-score   support

     neutral       0.78      0.92      0.84       626
    positive       0.82      0.83      0.82       371
    negative       0.74      0.22      0.33       172

    accuracy                           0.79      1169
   macro avg       0.78      0.66      0.67      1169
weighted avg       0.78      0.79      0.76      1169

Confusion Matrix:
[[578  43   5]
 [ 56 307   8]
 [109  26  37]]
119846403




In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Hand-entered confusion-matrix counts, in the label order given to
# display_labels below. Stored under its own name so the sklearn
# `confusion_matrix` function imported above is not overwritten (shadowing it
# breaks any later call to confusion_matrix(...) in the same kernel).
# NOTE(review): these counts differ from the matrix printed by the test-set
# evaluation cell; confirm which run they belong to, or build the matrix from
# test_labels_true / test_preds instead.
cm_counts = np.array([[482, 56, 88], [38, 318, 15], [45, 23, 104]])
disp = ConfusionMatrixDisplay(confusion_matrix=cm_counts, display_labels=["Neutral", "Positive", "Negative"])

# Create the figure explicitly and hand its axes to disp.plot(); a bare
# plt.figure(...) would be left empty because plot() otherwise opens its own
# figure, so the requested size/dpi would never apply to the matrix.
fig, ax = plt.subplots(figsize=(10, 10), dpi=200)
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title("Quaternion Transformer Model Confusion Matrix")
plt.show()