In [1]:
import json
import math
from collections import OrderedDict
import torch
from torch import nn, Tensor
from typing import Union, Tuple, List, Iterable, Dict
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.optim import AdamW
from torch.utils.data import DataLoader
#from scipy.stats import pearsonr, spearmanr
import numpy as np
import gzip, csv
import pandas as pd
from tqdm.auto import tqdm
import torch.nn.init as init
torch.manual_seed(0)
np.random.seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def gelu(x):
    """Implementation of the gelu activation function."""
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, embed_dim: int, drop_rate=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=drop_rate)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        pe = torch.zeros(1, max_len, embed_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        #print("IN POSTIONAL FORWARD")
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [4]:
#def scaled_dot_product(q, k, v, attn_drop_rate=0.1, mask=None):
def scaled_dot_product(q, k, v, attn_drop_rate=0.1):
    """
    Parameters:
      q: query, shape: (batch, # heads, seq len, head dimension)
      k: keys, shape: (batch, # heads, seq len, head dimension)
      v: value, shape: (batch, # heads, seq len, head dimension)
      attn_drop_rate: probability of an element to be zeroed,
      mask: the optional masking of specific entries in the attention matrix.
              shape: (batch, seq len)
    """
    
    d_k = q.shape[-1]
    attn_logits = torch.matmul(q, k.transpose(-1, -2))
    attn_logits = attn_logits/math.sqrt(d_k)
    # if mask is not None:
    #   dummy_mask = torch.where(mask == 1.0, torch.tensor(True), torch.tensor(False))
    #   attn_logits = attn_logits.masked_fill(dummy_mask.unsqueeze(1).unsqueeze(1), float('-inf'))
    attention = F.softmax(attn_logits, dim=-1)
    attention = F.dropout(attention)
    values = torch.matmul(attention,v)
    return values

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, n_heads, attn_drop_rate):
        super().__init__()
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.head_dim = embed_dim // n_heads
        self.attn_drop_rate = attn_drop_rate
        self.query = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self.key = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self.value = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self.o_proj = nn.Linear(self.embed_dim, self.n_heads*self.head_dim)
        self._reset_parameters()

    def _reset_parameters(self):
      nn.init.xavier_uniform_(self.query.weight)
      self.query.bias.data.fill_(0)
      nn.init.xavier_uniform_(self.key.weight)
      self.key.bias.data.fill_(0)
      nn.init.xavier_uniform_(self.value.weight)
      self.value.bias.data.fill_(0)
      nn.init.xavier_uniform_(self.o_proj.weight)
      self.o_proj.bias.data.fill_(0)

    def split_heads(self, tensor):
       new_shape = tensor.size()[:-1] + (self.n_heads, self.head_dim)
       tensor = tensor.view(*new_shape)
       tensor = tensor.permute(0, 2, 1, 3).contiguous()
       return tensor
    
    def merge_heads(self, tensor, batch_size, seq_length):
       tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embed_dim)
       return tensor
    
    def forward(self, embedding):
       print("ATTENTION",embedding.size())
       batch_size, seq_length, embed_dim = embedding.size()
       q, k, v = self.query(embedding), self.key(embedding), self.value(embedding)
       q = self.split_heads(q)
       k = self.split_heads(k)
       v = self.split_heads(v)
       values = scaled_dot_product(q, k, v, self.attn_drop_rate)
       values = self.merge_heads(values, batch_size, seq_length)
       attended_embeds = self.o_proj(values)

       return attended_embeds

    


In [6]:
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        #print(f"Mean ({mean.size()})")
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        #print(f"Standard Deviation  ({std.size()})")
        y = (inputs - mean) / std
        #print(f"y: {y.size()}")
        out = self.gamma * y  + self.beta
        #print(f"self.gamma: {self.gamma.size()}, self.beta: {self.beta.size()}")
        #print(f"out: {out.size()}")
        return out

In [7]:
class PositionwiseFeedForward(nn.Module):

    def __init__(self, embed_dim, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_dim, 4*embed_dim)
        self.linear2 = nn.Linear(4*embed_dim, embed_dim)
        self.relu = nn.GELU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        #print(f"x after dropout: {x.size()}")
        x = self.linear2(x)
        #print(f"x after 2nd linear layer: {x.size()}")
        return x

In [8]:
# class Classifier: #mewmew
class Classifier(nn.Module):
    def __init__(self, input_dim, numclasses, dropout_rate=0.1):
        super(Classifier, self).__init__()

        # Define layers
        # self.fc1 = nn.Linear(input_dim, input_dim)
        # self.relu = nn.ReLU()
        # self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(input_dim, numclasses)

    def forward(self, x):
        # Forward pass
        # x = self.fc1(x)
        # x = self.relu(x)
        # x = self.dropout(x)
        x = self.fc2(x)
        return x

In [9]:
class EncoderLayer(nn.Module):

    def __init__(self, embed_dim, n_heads, attn_drop_rate, layer_drop_rate):
        super(EncoderLayer, self).__init__()
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.attention = MultiHeadAttention(self.embed_dim, self.n_heads, attn_drop_rate)
        self.norm1 = LayerNormalization(parameters_shape=[self.embed_dim])
        self.dropout1 = nn.Dropout(p=layer_drop_rate)
        self.ffn = PositionwiseFeedForward(self.embed_dim,layer_drop_rate)
        self.norm2 = LayerNormalization(parameters_shape=[self.embed_dim])
        self.dropout2 = nn.Dropout(p=layer_drop_rate)

    def forward(self, x):
        #print("IN ENCODER FORWARD",x.shape, x.size())
        residual_x = x
        #print("------- ATTENTION 1 ------")
        x = self.attention(x)
        #print("------- DROPOUT 1 ------")
        x = self.dropout1(x)
        #print("------- ADD AND LAYER NORMALIZATION 1 ------")
        x = x + residual_x
        x = self.norm1(x)
        residual_x = x
        #print("------- ATTENTION 2 ------")
        x = self.ffn(x)
        #print("------- DROPOUT 2 ------")
        x = self.dropout2(x)
        #print("------- ADD AND LAYER NORMALIZATION 2 ------")
        x = x + residual_x
        x = self.norm2(x)
        #add and norm switch refer to assignment
        return x

In [11]:
embed_dim = 16
n_heads = 4
attn_drop_rate = 0.1
layer_drop_rate = 0.1
block = EncoderLayer(embed_dim, n_heads, attn_drop_rate, layer_drop_rate)

bs = 3
seq_len = 2
embeds = torch.randn(bs, seq_len, embed_dim)
outputs = block(embeds)
out_bs, out_seq_len, out_hidden = outputs.shape
print("Output shape: ", (out_bs, out_seq_len, out_hidden))
assert out_bs == bs and out_seq_len == seq_len and out_hidden == embed_dim, "Unexpected output shape"

IN ENCODER FORWARD torch.Size([3, 2, 16]) torch.Size([3, 2, 16])
------- ATTENTION 1 ------
ATTENTION torch.Size([3, 2, 16])
------- DROPOUT 1 ------
------- ADD AND LAYER NORMALIZATION 1 ------
Mean (torch.Size([3, 2, 1]))
Standard Deviation  (torch.Size([3, 2, 1]))
y: torch.Size([3, 2, 16])
self.gamma: torch.Size([16]), self.beta: torch.Size([16])
out: torch.Size([3, 2, 16])
------- ATTENTION 2 ------
x after dropout: torch.Size([3, 2, 64])
x after 2nd linear layer: torch.Size([3, 2, 16])
------- DROPOUT 2 ------
------- ADD AND LAYER NORMALIZATION 2 ------
Mean (torch.Size([3, 2, 1]))
Standard Deviation  (torch.Size([3, 2, 1]))
y: torch.Size([3, 2, 16])
self.gamma: torch.Size([16]), self.beta: torch.Size([16])
out: torch.Size([3, 2, 16])
Output shape:  (3, 2, 16)


In [14]:
class BERT(nn.Module): #Transformer
    def __init__(self, n_layers, vocab_size, embed_dim, n_heads, num_classes, attn_drop_rate, layer_drop_rate):
        super().__init__()
        self.embed = nn.Embedding(vocab_size+1, embed_dim)
        self.position = PositionalEncoding(embed_dim, layer_drop_rate)
        self.net = nn.Sequential(*[
        EncoderLayer(embed_dim, n_heads, attn_drop_rate, layer_drop_rate) for _ in range(n_layers)
        ])
        #self.mask_pred = nn.Linear(embed_dim, vocab_size) ## Classifier
        self.pooler = nn.Sequential(OrderedDict([
            ('dense', nn.Linear(embed_dim, embed_dim)),
            ('activation', nn.Tanh()),
        ]))
        self.classifier = Classifier(embed_dim, num_classes)
    def forward(self, batch_text):
        #print("in model forward", batch_text.size())
        embedding = self.position(self.embed(batch_text))
        #print("POST Postional", embedding.size())
        new_embedding = self.net((embedding))
        #print("POST ENCODERS", new_embedding.size())
        o = self.pooler(new_embedding[:, 0])
        #print(o.shape)
        preds = self.classifier(o)
        return preds
        


In [16]:
embed_dim = 16
n_heads = 4
n_layers = 2
vocab_size = 10
attn_drop_rate = 0.1
layer_drop_rate = 0.1
num_classes=20
model = BERT(n_layers, vocab_size, embed_dim, n_heads, num_classes, attn_drop_rate, layer_drop_rate)

bs = 3
seq_len = 2
inputs = torch.randint(0, vocab_size, (bs, seq_len))
mask_preds = model(inputs)
print(mask_preds.shape)
#out_bs, out_seq_len, out_vocab = mask_preds.shape
#print("Mask predictions shape: ", (out_bs, out_seq_len, out_vocab))
#print(mask_preds)
#assert out_bs == bs and out_seq_len == seq_len and out_vocab == vocab_size, "Unexpected mask prediction output shape"

in model forward torch.Size([3, 2])
IN POSTIONAL FORWARD
POST Postional torch.Size([3, 2, 16])
IN ENCODER FORWARD torch.Size([3, 2, 16]) torch.Size([3, 2, 16])
------- ATTENTION 1 ------
ATTENTION torch.Size([3, 2, 16])
------- DROPOUT 1 ------
------- ADD AND LAYER NORMALIZATION 1 ------
Mean (torch.Size([3, 2, 1]))
Standard Deviation  (torch.Size([3, 2, 1]))
y: torch.Size([3, 2, 16])
self.gamma: torch.Size([16]), self.beta: torch.Size([16])
out: torch.Size([3, 2, 16])
------- ATTENTION 2 ------
x after dropout: torch.Size([3, 2, 64])
x after 2nd linear layer: torch.Size([3, 2, 16])
------- DROPOUT 2 ------
------- ADD AND LAYER NORMALIZATION 2 ------
Mean (torch.Size([3, 2, 1]))
Standard Deviation  (torch.Size([3, 2, 1]))
y: torch.Size([3, 2, 16])
self.gamma: torch.Size([16]), self.beta: torch.Size([16])
out: torch.Size([3, 2, 16])
IN ENCODER FORWARD torch.Size([3, 2, 16]) torch.Size([3, 2, 16])
------- ATTENTION 1 ------
ATTENTION torch.Size([3, 2, 16])
------- DROPOUT 1 ------
----

In [101]:
print(mask_preds)


tensor([[[-9.1718e-01, -1.3727e+00,  2.2097e-01, -9.8661e-02,  2.9791e-01,
           4.8143e-01,  7.2774e-01, -2.5986e-01,  2.4598e-01,  4.4886e-01,
          -3.7723e-01, -2.0071e-01,  4.8600e-01, -7.5484e-01,  6.2974e-01,
           2.2242e-01,  1.5447e+00,  1.2410e+00,  1.8637e-01,  5.9945e-01],
         [ 4.1521e-01, -4.8066e-01,  3.6566e-01,  7.2611e-01,  4.3182e-01,
          -5.3013e-02,  6.6377e-01,  3.5730e-01, -1.0911e-02,  5.7580e-01,
          -9.6048e-01, -1.9430e-01,  1.5048e+00,  5.2852e-01, -2.9945e-01,
           1.5301e-01,  7.9827e-01,  6.5249e-01,  1.4291e+00, -3.6569e-01]],

        [[ 3.5886e-01,  1.4730e+00,  2.4403e-01,  2.8965e-01,  3.7539e-01,
          -7.6075e-01,  3.1571e-01,  6.6394e-02, -3.5843e-01, -2.1372e-01,
          -1.0680e+00, -3.1656e-01,  9.7150e-01, -1.2280e-03, -6.9403e-01,
          -3.9616e-01, -9.4375e-02, -3.1369e-01,  4.3219e-01, -4.6102e-01],
         [-9.2675e-01, -4.7492e-01, -4.2623e-01,  1.0391e-01,  8.5269e-01,
           9.0149e-0

In [43]:
# inputs.shape

In [44]:
# inputs

In [17]:
from transformers import BertTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Replace with your desired tokenizer

class MyDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.tokenizer(item['text'], truncation=True, padding='max_length', return_tensors='pt', max_length=512)
        encoding['label'] = torch.tensor(item['label'])
        return encoding


dataset = load_dataset("setfit/20_newsgroups")


Repo card metadata block was not found. Setting CardData to empty.


In [18]:
dataset = load_dataset("setfit/20_newsgroups")


Repo card metadata block was not found. Setting CardData to empty.


In [19]:
traindata = MyDataset(dataset=dataset['train'], tokenizer=tokenizer)
testdata = MyDataset(dataset=dataset['test'], tokenizer=tokenizer)  

In [20]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(traindata, batch_size=32, shuffle=True)
test_dataloader = DataLoader(traindata, batch_size=32)

In [25]:
embed_dim = 512
n_heads = 4
n_layers = 2
vocab_size = 30522
attn_drop_rate = 0.1
layer_drop_rate = 0.1
num_classes=20
model = BERT(n_layers, vocab_size, embed_dim, n_heads, num_classes, attn_drop_rate, layer_drop_rate)
num_epochs =1
optimizer = AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].squeeze(1)
        labels = batch['label']
        output = model(input_ids)

        print("HELLO",output.shape)
        loss = criterion(output, labels)  
        loss.backward()  
        optimizer.step()
        break

in model forward torch.Size([32, 512])
IN POSTIONAL FORWARD
POST Postional torch.Size([32, 512, 512])
IN ENCODER FORWARD torch.Size([32, 512, 512]) torch.Size([32, 512, 512])
------- ATTENTION 1 ------
ATTENTION torch.Size([32, 512, 512])
------- DROPOUT 1 ------
------- ADD AND LAYER NORMALIZATION 1 ------
Mean (torch.Size([32, 512, 1]))
Standard Deviation  (torch.Size([32, 512, 1]))
y: torch.Size([32, 512, 512])
self.gamma: torch.Size([512]), self.beta: torch.Size([512])
out: torch.Size([32, 512, 512])
------- ATTENTION 2 ------
x after dropout: torch.Size([32, 512, 2048])
x after 2nd linear layer: torch.Size([32, 512, 512])
------- DROPOUT 2 ------
------- ADD AND LAYER NORMALIZATION 2 ------
Mean (torch.Size([32, 512, 1]))
Standard Deviation  (torch.Size([32, 512, 1]))
y: torch.Size([32, 512, 512])
self.gamma: torch.Size([512]), self.beta: torch.Size([512])
out: torch.Size([32, 512, 512])
IN ENCODER FORWARD torch.Size([32, 512, 512]) torch.Size([32, 512, 512])
------- ATTENTION 1 -

In [26]:
loss

tensor(3.0595, grad_fn=<NllLossBackward0>)

In [22]:
len(output[0])

20

In [23]:
output

tensor([[-3.2513e-01, -1.9240e-02, -1.1992e-01,  1.9925e-01, -1.1545e-01,
         -1.6353e-01,  2.2236e-01,  3.9057e-01, -2.3930e-01, -4.1146e-01,
          1.4754e-01,  3.1831e-01, -3.4704e-01,  1.6187e-01,  2.0630e-01,
          4.2818e-01, -5.7798e-01,  3.3938e-01, -6.6719e-02,  1.7823e-01],
        [-2.8443e-01, -1.1484e-01, -8.4109e-02,  9.7996e-02, -1.9036e-01,
         -3.6507e-01,  4.6729e-01,  1.9681e-01, -1.2402e-01, -4.0387e-01,
          1.1493e-01,  1.0444e-01, -6.1762e-02,  2.8009e-01,  9.5937e-02,
          3.2025e-01, -5.4876e-01,  3.6004e-01, -2.3330e-01,  1.0138e-01],
        [-1.6718e-01, -2.0669e-01, -2.3657e-01,  9.6869e-03, -2.1960e-01,
         -3.2149e-01,  4.2171e-01,  8.0840e-02, -1.5137e-01, -4.6460e-01,
          1.3797e-01,  2.3909e-01,  4.9093e-02,  2.0537e-01,  4.0825e-02,
          3.0688e-01, -5.0838e-01,  2.3719e-01, -3.1818e-01,  2.0476e-01],
        [ 4.8505e-02,  1.4172e-01, -9.6209e-02, -2.5069e-01, -1.8217e-01,
          9.8549e-02,  2.6542e-01, 

In [None]:
print(input_ids)
print(labels)


tensor([[ 101, 5875, 1012,  ...,    0,    0,    0],
        [ 101, 2043, 2003,  ...,    0,    0,    0],
        [ 101, 3728, 1010,  ...,    0,    0,    0],
        ...,
        [ 101, 2026, 6685,  ...,    0,    0,    0],
        [ 101, 1034, 1034,  ...,    0,    0,    0],
        [ 101, 1024, 2009,  ...,    0,    0,    0]])
tensor([ 8,  4, 12, 15,  8,  1, 15, 16, 12, 12, 16, 19,  6, 17,  1, 17, 17,  2,
         9,  5,  9, 12, 19,  4,  2,  6,  3,  6,  9,  0,  8,  9])


torch.Size([32])

In [None]:
input_ids.shape

torch.Size([32, 512])

In [None]:
test = train_dataset.__getitem__(0)

In [None]:
test['input_ids'].shape

torch.Size([1, 512])

In [None]:
# seq_len = 512

In [None]:
inputs.shape

torch.Size([3, 2])

In [None]:
inputs

tensor([[9, 4],
        [5, 2],
        [6, 0]])