In [None]:
pip install transformers sentencepiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.6 MB/s[0m

In [None]:
# Start by importing necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as functional
from transformers import AutoTokenizer, AutoModel

In [None]:
from datasets import load_dataset
import random
import numpy as np
import torch.optim as optim
from sklearn.metrics import accuracy_score
from transformers import  DataCollatorWithPadding
from torch.utils.data import DataLoader

# ABCNN

ABCNN (Attention-Based Convolutional Neural Network) is a neural network architecture that uses attention mechanisms to model the relationship between pairs of text sequences. It is designed for tasks in which the input consists of two sequences of text, such as a question and a candidate answer in a question-answering task.

The ABCNN architecture consists of two main components: the embedding layer and the convolutional layer. The embedding layer maps the input text to a dense vector representation, while the convolutional layer applies a set of filters to capture different features of the input sequences. The output of the convolutional layer is then passed through an attention mechanism that calculates the similarity between the two sequences and produces a weighted representation of the two sequences based on this similarity.


The ABCNN architecture can be implemented in several variants: ABCNN1, ABCNN2 and ABCNN3.


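*   In ABCNN-1, attention is applied to the **input** of the convolutional layer: an attention matrix computed between the two sentences' representations produces an additional attention feature map, which is fed to the convolution together with the original representation.
*   In ABCNN-2, attention is applied to the **output** of the convolutional layer: attention weights computed between the two convolved sentences re-weight the convolution output during pooling.
*   ABCNN-3 combines both variants, applying attention to the input of the convolution (as in ABCNN-1) and to its output (as in ABCNN-2).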

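A simplified, ABCNN-3-inspired model is shown in the code below. Note that this implementation does not use convolutional layers: it encodes the text with a pretrained transformer, summarises it with a bidirectional GRU, and applies a learned weighting to the summary before classification.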



In [None]:
class ABCNN3(nn.Module):
    """Two-branch text classifier: pretrained encoder -> BiGRU -> gated summary -> linear head.

    Despite the name there is no convolution in this module. Each branch encodes
    a tokenized input with a pretrained transformer, reads the token states with a
    bidirectional GRU, and scales the GRU's final state by a learned scalar. The
    two branch vectors are concatenated and classified.

    Args:
        model_name: Hugging Face model identifier used for both the tokenizer and
            the encoder.
        num_classes: Number of output logits.
        rnn_hidden_size: Hidden size of each GRU direction. The default (128)
            reproduces the original layer sizes (256-d branch vector, 512-d
            classifier input).
    """

    def __init__(self, model_name, num_classes, rnn_hidden_size=128):
        super().__init__()
        # Not used by forward(); kept as an attribute so existing callers that read it keep working.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.embedding = AutoModel.from_pretrained(model_name)
        self.rnn = nn.GRU(input_size=self.embedding.config.hidden_size, hidden_size=rnn_hidden_size,
                          num_layers=1, bidirectional=True, batch_first=True)
        branch_size = 2 * rnn_hidden_size  # forward + backward final states
        self.attention = nn.Linear(branch_size, 1)
        self.fc = nn.Linear(2 * branch_size, num_classes)  # two branches concatenated

    def _encode(self, inputs):
        """Run the pretrained encoder and return its per-token hidden states."""
        # Forward only the keys that are present: not every tokenizer emits
        # `token_type_ids`, and not every encoder accepts it.
        encoder_kwargs = {
            key: inputs[key]
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
            if key in inputs and inputs[key] is not None
        }
        # No squeeze here: the (batch, seq, hidden) layout must survive even
        # when the sequence length is 1, because the GRU is batch_first.
        return self.embedding(**encoder_kwargs).last_hidden_state

    def _pool(self, encoded):
        """Collapse per-token states into one gated vector per example."""
        # NOTE(review): the GRU reads every position, padding included; packing the
        # sequences with the attention mask may be worth evaluating.
        _, h_n = self.rnn(encoded)
        summary = torch.cat([h_n[0], h_n[1]], dim=1)  # (batch, 2 * rnn_hidden_size)
        gate = self.attention(summary)                # (batch, 1)
        # Broadcasting keeps the batch axis intact for any batch size; the previous
        # bmm(...).squeeze() also removed the batch axis when batch size was 1.
        return gate * summary

    def forward(self, inputs, inputs_pair=None):
        """Compute class logits.

        Args:
            inputs: Mapping with 'input_ids' and, optionally, 'attention_mask'
                and 'token_type_ids' tensors for the first sequence.
            inputs_pair: Optional mapping of the same form for a second
                sequence. When omitted, the second branch reuses the first
                branch's result instead of re-encoding the same tensors.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        x1_encoded = self._encode(inputs)
        x1_weighted = self._pool(x1_encoded)

        if inputs_pair is None:
            # Same input on both branches: one encoder pass is enough.
            x2_weighted = x1_weighted
        else:
            x2_encoded = self._encode(inputs_pair)
            x2_weighted = self._pool(x2_encoded)

        x = torch.cat([x1_weighted, x2_weighted], dim=1)
        logits = self.fc(x)

        return logits

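**NOTE**: In the above model, the `forward` method builds two branches (`x1`, `x2`) instead of one, because the architecture is designed for classification tasks that compare two input sentences and predict whether they are related. As used in this notebook, both branches are fed from the same `inputs` dictionary, so with a single-sentence dataset such as SST-2 the two branches encode the same sentence.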


*  In the given code, x1_encoded and x2_encoded are the encoded representations of the two input sentences obtained using the pre-trained transformer-based embedding model. The GRU layer is then applied on these encoded representations to capture the contextual information.

*  The attention mechanism is applied on the output of the GRU layer to obtain the weighted representations of the input sentences. These weighted representations are concatenated and passed through a fully connected layer to obtain the final output logits.


*  Hence, taking two input sentences is necessary in this model as it is designed for sentence pair classification tasks, and it needs to compare and analyze the relationships between two sentences.

In [None]:
## Load SST-2 (the 'sst2' configuration of the GLUE benchmark)
dataset = load_dataset('glue', 'sst2')

## The dataset already ships with train / validation / test splits; shuffle each with a fixed seed
## NOTE(review): GLUE test splits are normally published without gold labels -- confirm before scoring on test_dataset
train_dataset, val_dataset, test_dataset = (
    dataset[split_name].shuffle(seed=42)
    for split_name in ('train', 'validation', 'test')
)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
## Seed every RNG that influences weight initialisation and batch order so re-runs are comparable
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model_name = 'bert-base-uncased'
num_classes = len(dataset['train'].features['label'].names)

## Set up the device (GPU if available) and move the model there before building the optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assigning the result (instead of ending the cell with `model.to(device)`) avoids
# dumping the full module repr into the cell output.
model = ABCNN3(model_name, num_classes).to(device)

## Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ABCNN3(
  (embedding): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:
## Define a function to tokenize and encode the text sequences
def encode_text(batch):
    """Tokenize the 'sentence' column of a batch of examples.

    Looks up the notebook-level `tokenizer` at call time, so that name must be
    defined before this function is first called (e.g. via `Dataset.map`).

    Args:
        batch: Mapping with a 'sentence' entry holding the raw text(s).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    # Truncate to the tokenizer's maximum length so an unusually long sentence
    # cannot exceed the encoder's positional limit.
    return tokenizer(batch['sentence'], truncation=True)

## Accuracy helper: turns per-class scores into labels and scores them against the targets
def compute_accuracy(preds, targets):
    """Return the accuracy of arg-max predictions.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        targets: Ground-truth class indices, one per example.

    Returns:
        The value of `accuracy_score(targets, predicted_labels)`.
    """
    predicted_labels = np.argmax(preds, axis=1)  # highest-scoring class per row
    return accuracy_score(targets, predicted_labels)

In [None]:
## Training hyper-parameters
num_epochs, batch_size = 5, 32

## Tokenizer matching `model_name`, and the collator that pads the examples of each batch
tokenizer = AutoTokenizer.from_pretrained(model_name)
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

## Tokenize the training and validation splits in batches
train_dataset, val_dataset = (
    split.map(encode_text, batched=True)
    for split in (train_dataset, val_dataset)
)

def clean_dataset(ds):
    """Prepare a tokenized split for the DataLoader.

    Drops the 'sentence' and 'idx' columns, renames 'label' to 'labels',
    and switches the resulting dataset to the 'torch' format.

    Args:
        ds: Dataset exposing `remove_columns`, `rename_columns` and `set_format`.

    Returns:
        The trimmed and renamed dataset, formatted as 'torch'.
    """
    prepared = ds.remove_columns(['sentence', 'idx']).rename_columns({'label': 'labels'})
    # set_format mutates `prepared` in place, so it cannot join the chain above.
    prepared.set_format('torch')
    return prepared

## Drop the raw columns and switch both splits to the 'torch' format
train_dataset = clean_dataset(train_dataset)
val_dataset = clean_dataset(val_dataset)

## Only the training loader shuffles; validation keeps a fixed order so that
## every epoch is evaluated on identical batches.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [None]:
def _run_epoch(loader, training):
    """Run one pass over `loader` and return (mean loss, mean accuracy).

    Uses the notebook-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: Iterable of batches; each batch maps 'labels' plus the model
            input names to tensors.
        training: When True, gradients are computed and the optimizer is
            stepped; when False, the model is put in eval mode and gradients
            are disabled.

    Returns:
        Tuple (loss, accuracy), each averaged per example over the whole pass.
    """
    model.train(training)
    loss_sum, acc_sum, n_seen = 0.0, 0.0, 0
    with torch.set_grad_enabled(training):
        for batch in loader:
            labels = batch['labels'].to(device)
            # Everything except the labels is a model input.
            inputs = {key: value.to(device) for key, value in batch.items() if key != 'labels'}
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            acc = compute_accuracy(outputs.detach().cpu().numpy(), labels.cpu().numpy())
            # Weight by batch size: the last batch is usually smaller, and a plain
            # mean of per-batch means would over-weight its examples.
            batch_n = labels.size(0)
            loss_sum += loss.item() * batch_n
            acc_sum += acc * batch_n
            n_seen += batch_n
    n_seen = max(n_seen, 1)  # guard against an empty loader
    return loss_sum / n_seen, acc_sum / n_seen


for epoch in range(num_epochs):
    train_loss, train_acc = _run_epoch(train_loader, training=True)

    ## Validate
    val_loss, val_acc = _run_epoch(val_loader, training=False)

    ## Print the progress
    print(f'Epoch:{epoch+1} / {num_epochs}:')
    print(f'Train loss:{train_loss:.4f} | Train acc: {train_acc:.4f}')
    print(f'Val loss:{val_loss:.4f} | Val acc: {val_acc:.4f}')

Epoch:1 / 5:
Train loss:0.0982 | Train acc: 0.9669
Val loss:0.1975 | Val acc: 0.9185
Epoch:2 / 5:
Train loss:0.0629 | Train acc: 0.9781
Val loss:0.2903 | Val acc: 0.9196
Epoch:3 / 5:
Train loss:0.0448 | Train acc: 0.9843
Val loss:0.2756 | Val acc: 0.9263
Epoch:4 / 5:
Train loss:0.0339 | Train acc: 0.9876
Val loss:0.3799 | Val acc: 0.9196
Epoch:5 / 5:
Train loss:0.0285 | Train acc: 0.9897
Val loss:0.3100 | Val acc: 0.9118
