In [4]:
import json
import math
from collections import OrderedDict
import torch
from torch import nn, Tensor
from typing import Union, Tuple, List, Iterable, Dict
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.optim import AdamW
from torch.utils.data import DataLoader
from scipy.stats import pearsonr, spearmanr
import numpy as np
import gzip, csv
import pandas as pd
from tqdm.auto import tqdm

torch.manual_seed(0)
np.random.seed(0)

In [None]:
! pip3 install datasets

In [None]:
%pip install transformers
from transformers import AutoTokenizer
# If you can not find all the bugs, use the line below for AutoModel
#from transformers import AutoModel


In [None]:
! pip3 install avalanche-lib

In [None]:
! pip3 install torch transformers

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from collections import OrderedDict

def gelu(x):
    """Implementation of the gelu activation function."""
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

class Config(object):
    """Configuration class to store the configuration of a `BertModel`."""
    def __init__(self,
                 vocab_size,
                 hidden_size=512,
                 num_hidden_layers=6,
                 num_attention_heads=8,
                 intermediate_size=2048,
                 dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02):
        """Constructs Config for BertModel."""
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = dropout_prob
        self.attention_probs_dropout_prob = dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range

    @classmethod
    def from_dict(cls, dict_object):
        """Constructs Config from a Python dictionary of parameters."""
        config = Config(vocab_size=None)
        for (key, value) in dict_object.items():
            config.__dict__[key] = value
        return config

class LayerNorm(nn.Module):
    """Layer normalization module."""
    def __init__(self, hidden_size, variance_epsilon=1e-12):
        """Constructs LayerNorm object for Transformer layer in BERT model."""
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        """Forward pass of the LayerNorm layer."""
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.gamma * x + self.beta

class MLP(nn.Module):
    """Feed forward network with gelu activation."""
    def __init__(self, hidden_size, intermediate_size):
        """Constructs MLP object for Transformer layer in BERT model."""
        super(MLP, self).__init__()
        self.dense_expansion = nn.Linear(hidden_size, intermediate_size)
        self.dense_contraction = nn.Linear(intermediate_size, hidden_size)

    def forward(self, x):
        """Forward pass of the MLP layer."""
        x = self.dense_expansion(x)
        x = self.dense_contraction(gelu(x))
        return x

class Layer(nn.Module):
    """The Transformer layer."""
    def __init__(self, config):
        """Constructs Layer object for Transformer layer in BERT model based on config."""
        super(Layer, self).__init__()

        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.attn_out = nn.Linear(config.hidden_size, config.hidden_size)
        self.ln1 = LayerNorm(config.hidden_size)

        self.mlp = MLP(config.hidden_size, config.intermediate_size)
        self.ln2 = LayerNorm(config.hidden_size)

    def split_heads(self, tensor, num_heads, attention_head_size):
        """Split hidden_size into num_heads * attention_head_size and transpose into shape [batch, num_heads, seq_len, attention_head_size]."""
        new_shape = tensor.size()[:-1] + (num_heads, attention_head_size)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3).contiguous()

    def merge_heads(self, tensor, num_heads, attention_head_size):
        """Transpose and then reshape into shape [batch, seq_len, hidden_size]."""
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attention_head_size,)
        return tensor.view(new_shape)

    def attn(self, q, k, v, attention_mask):
        """Attention mechanism for the Transformer layer."""
        mask = attention_mask == 1
        mask = mask.unsqueeze(1).unsqueeze(2)

        s = torch.matmul(q, k.transpose(-1, -2))
        s = s / math.sqrt(self.attention_head_size)

        s = torch.where(mask, s, torch.tensor(float('-inf')))

        p = F.softmax(s, dim=-1)
        p = self.dropout(p)

        a = torch.matmul(p, v)
        return a

    def forward(self, x, attention_mask):
        """Forward pass of the Transformer layer in BERT."""
        q, k, v = self.query(x), self.key(x), self.value(x)

        q = self.split_heads(q, self.num_attention_heads, self.attention_head_size)
        k = self.split_heads(k, self.num_attention_heads, self.attention_head_size)
        v = self.split_heads(v, self.num_attention_heads, self.attention_head_size)

        a = self.attn(q, k, v, attention_mask)
        a = self.merge_heads(a, self.num_attention_heads, self.attention_head_size)
        a = self.attn_out(a)
        a = self.dropout(a)
        a = self.ln1(a + x)

        m = self.mlp(a)
        m = self.dropout(m)
        m = self.ln2(m + a)

        return m

class Bert(nn.Module):
    def __init__(self, config_dict):
        super(Bert, self).__init__()
        self.config = Config.from_dict(config_dict)  # Create an instance of Config

        self.embeddings = nn.ModuleDict({
            'token': nn.Embedding(self.config.vocab_size, self.config.hidden_size, padding_idx=0),
            'position': nn.Embedding(self.config.max_position_embeddings, self.config.hidden_size),
            'token_type': nn.Embedding(self.config.type_vocab_size, self.config.hidden_size),
        })

        self.ln = LayerNorm(self.config.hidden_size)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)

        self.layers = nn.ModuleList([
            Layer(self.config) for _ in range(self.config.num_hidden_layers)
        ])

        self.pooler = nn.Sequential(OrderedDict([
            ('dense', nn.Linear(self.config.hidden_size, self.config.hidden_size)),
            ('activation', nn.Tanh()),
        ]))

        # Add a classifier layer for classification
        self.classifier = nn.Linear(self.config.hidden_size, 20)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        position_ids = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        x = (self.embeddings['token'](input_ids) +
             self.embeddings['position'](position_ids) +
             self.embeddings['token_type'](token_type_ids))
        x = self.dropout(self.ln(x))

        for layer in self.layers:
            x = layer(x, attention_mask)

        o = self.pooler(x[:, 0])

        if labels is not None:
          print("HELLO WORLD")
          # Use the classifier layer for classification
          logits = F.softmax(self.classifier(o), dim=1)
          #loss = nn.CrossEntropyLoss()(logits, labels)
          return logits

        return x, o

    def save_model(self, path):
        """Save model to a file.

        Args:
            path (str): Path to the file where the model will be saved.
        """
        torch.save(self.state_dict(), path)

    @classmethod
    def load_model(cls, config_dict, path):
        """Load model from a file.

        Args:
            config_dict (dict): Dictionary containing the configuration of the model.
            path (str): Path to the model checkpoint.

        Returns:
            Bert: Bert model loaded from the checkpoint.
        """
        model = cls(config_dict)
        model.load_state_dict(torch.load(path))
        return model


In [10]:
from transformers import AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch
from transformers import BertTokenizer

# Define your custom BERT configuration
config_dict = {
    'vocab_size': 310000,
    'hidden_size': 512,
    'num_attention_heads': 2,
    'num_hidden_layers': 4,
    'intermediate_size': 512,
    'dropout_prob': 0.1,
    'max_position_embeddings': 512,
    'type_vocab_size': 2,
    'initializer_range': 0.02
}

# Create an instance of your custom BERT model
model = Bert(config_dict)

# Tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Replace with your desired tokenizer

# Define a DataLoader for the dataset
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.tokenizer(item['text'], truncation=True, padding='max_length', return_tensors='pt', max_length=512)
        encoding['label'] = torch.tensor(item['label'])
        return encoding

# Use DynamicPaddingCollate for DataLoader
class DynamicPaddingCollate:
    def __call__(self, batch):
        return {
            'input_ids': torch.stack([sample['input_ids'].squeeze(0) for sample in batch]),
            'attention_mask': torch.stack([sample['attention_mask'].squeeze(0) for sample in batch]),
            'labels': torch.tensor([sample['label'] for sample in batch])
        }

# Load the dataset
dataset = load_dataset("setfit/20_newsgroups", split="train")

# Create DataLoader with DynamicPaddingCollate
train_dataset = MyDataset(dataset, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=DynamicPaddingCollate())

# Training parameters
epochs = 2
learning_rate = 2e-5

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
criterion = nn.CrossEntropyLoss()
# Training loop


for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    total_loss = 0.0

    for batch_idx, batch in enumerate(train_dataloader, 1):
        optimizer.zero_grad()
        input_ids =batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(**batch)
        # print(outputs)
        # check = outputs
        # break
        #loss =
        loss = criterion(outputs, labels)  # Assuming your forward method returns the loss directly
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch_idx % 10 == 0:  # Print every 10 batches
            avg_loss = total_loss / batch_idx
            print(f"Batch {batch_idx}/{len(train_dataloader)} - Avg Loss: {avg_loss:.4f}")

    avg_epoch_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} - Avg Loss: {avg_epoch_loss:.4f}")

    scheduler.step()







Epoch 1/2
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
Batch 10/354 - Avg Loss: 2.9962
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
Batch 20/354 - Avg Loss: 2.9954
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
Batch 30/354 - Avg Loss: 2.9950
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
HELLO WORLD
Batch 40/354 - Avg Loss: 2.9953
HELLO WORLD
HELLO WORLD


KeyboardInterrupt: 

In [27]:
# Example of target with class indices
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
print(input)
print(target)
output = loss(input, target)
print("one",output)
output.backward()
print("one",output)
#Example of target with class probabilities
# input = torch.randn(3, 5, requires_grad=True)
# target = torch.randn(3, 5).softmax(dim=1)
# print(target)
# output = loss(input, target)
# output.backward()
# print("two",output)

tensor([[-0.2599,  1.7095, -1.8641,  0.4603,  1.0727],
        [ 1.3016,  0.2321, -1.3055, -1.0979,  0.3024],
        [-0.4994,  0.4463, -2.0962,  1.7518,  1.2974]], requires_grad=True)
tensor([2, 4, 0])
one tensor(2.9490, grad_fn=<NllLossBackward0>)
one tensor(2.9490, grad_fn=<NllLossBackward0>)


In [23]:
outputs

tensor([[0.0563, 0.0574, 0.0524, 0.0411, 0.0941, 0.0515, 0.0538, 0.0506, 0.0443,
         0.0489, 0.0426, 0.0487, 0.0561, 0.0322, 0.0470, 0.0469, 0.0438, 0.0522,
         0.0449, 0.0352],
        [0.0521, 0.0482, 0.0492, 0.0369, 0.0977, 0.0582, 0.0564, 0.0400, 0.0496,
         0.0411, 0.0519, 0.0455, 0.0738, 0.0366, 0.0482, 0.0340, 0.0500, 0.0406,
         0.0575, 0.0323],
        [0.0590, 0.0494, 0.0491, 0.0429, 0.0783, 0.0548, 0.0565, 0.0452, 0.0483,
         0.0544, 0.0437, 0.0416, 0.0769, 0.0388, 0.0457, 0.0445, 0.0484, 0.0401,
         0.0505, 0.0319],
        [0.0510, 0.0491, 0.0431, 0.0398, 0.1017, 0.0448, 0.0531, 0.0348, 0.0533,
         0.0454, 0.0487, 0.0429, 0.0787, 0.0329, 0.0516, 0.0436, 0.0493, 0.0456,
         0.0612, 0.0293],
        [0.0507, 0.0465, 0.0489, 0.0376, 0.0945, 0.0496, 0.0660, 0.0351, 0.0511,
         0.0464, 0.0428, 0.0453, 0.0753, 0.0330, 0.0497, 0.0458, 0.0557, 0.0445,
         0.0510, 0.0305],
        [0.0600, 0.0488, 0.0499, 0.0383, 0.0946, 0.0540, 0.0

In [22]:
magic = torch.argmax(outputs, dim = 1)
magic

tensor([ 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        12,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4])

In [None]:

# Save the model
model.save_model("/content/my_custom_bert_model.pth")

In [None]:
# Load the model for predictions
loaded_model = Bert.load_model(config_dict, "/content/my_custom_bert_model.pth")

# Perform predictions
sentence = "This is a test sentence."
input_ids = tokenizer(sentence, return_tensors='pt')['input_ids']
output = loaded_model(input_ids)
print(output)

In [14]:
check[0]

tensor([0.0868, 0.0667, 0.0448, 0.0425, 0.0855, 0.0469, 0.0439, 0.0399, 0.0453,
        0.0427, 0.0820, 0.0357, 0.0340, 0.0547, 0.0405, 0.0406, 0.0423, 0.0527,
        0.0329, 0.0395], grad_fn=<SelectBackward0>)

In [None]:
prediction = torch.argmax(check[0])
prediction

tensor(1)

In [None]:
check[0]

tensor([-0.3700,  0.4549,  0.0396,  0.1715,  0.2064,  0.1312,  0.3936, -0.0831,
         0.2345, -0.2406, -0.2414,  0.3022, -0.1611, -0.3026, -0.0525,  0.0938,
         0.2632, -0.0264,  0.3938,  0.2777], grad_fn=<SelectBackward0>)

In [None]:
# torch.save(model.state_dict(), 'model_weights.pth')
# model = MyModel()  # Make sure this is the same model architecture
# model.load_state_dict(torch.load('model_weights.pth'))
# model.eval()

tensor([17,  4,  9, 13, 19, 16,  5,  1,  6,  6, 12,  3,  4, 19, 10,  4,  1, 14,
         5, 13, 19, 17,  0, 19, 16, 12,  0, 10,  6,  3,  7,  0])

In [None]:
# Access the 19th row in the "text" column
text_19th_row = dataset['train']['text'][18]

print(text_19th_row)

In [19]:
for batch_idx, batch in enumerate(train_dataloader, 1):
  print(**batch)
  break

TypeError: 'input_ids' is an invalid keyword argument for print()