In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import random

device = 'cuda'
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)

    def forward(self, x, attention_mask=None):
        # 32,87,40
        B,T,C = x.shape
        k = self.key(x)
        # k -> 32,87,10
        q = self.query(x)
        v = self.value(x)
        w = k @ q.transpose(-2, -1)
        if attention_mask is not None:
            attention_mask = attention_mask.unsqueeze(-1).float() 
            w = w * attention_mask
        w = F.softmax(w, dim=-1)
        out = w @ v
        return out

class MultiHead(nn.Module):
    def __init__(self,head_size,n_heads):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embed,n_embed)
    def forward(self,x,attention_mask):
        out = torch.cat([head(x,attention_mask) for head in self.heads],-1)
        out = self.proj(out)
        return out

class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed,n_embed),
            nn.ReLU(),
            nn.Linear(n_embed,n_embed),
            nn.Dropout(dropout),
        )
    def forward(self,x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.multihead = MultiHead(head_size,n_heads)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)
    
    def forward(self,x,attention_mask):
        x = self.ln1(x)
        x = x + self.multihead(x,attention_mask)
        x = self.ln2(x)
        x = x + self.ffwd(x)
        return x


class Encoder(nn.Module):
    def __init__(self,n_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,n_embed)
        self.positional_embedding = nn.Embedding(block_size,n_embed)
        self.blocks = nn.ModuleList([Block() for _ in range(n_layers)])
        self.ln = nn.LayerNorm(n_embed)
        self.cl_head = nn.Sequential(
            nn.Linear(n_embed,6),
        )
    def forward(self,x,attention_mask,targets=None):
        # b,t b=batch, t = tokens
        B,T = x.shape
        ini_emb = self.embedding(x)
        pos_emb = self.positional_embedding(torch.arange(T,device=device))
        x = ini_emb + pos_emb
        # b,t,c = 32,87,40
        for block in self.blocks:
            x = block(x,attention_mask)
        x = self.ln(x)
        x = self.cl_head(x)
        x = x.mean(dim=-2)
        return x

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
import nltk

In [5]:
df = pd.read_csv('data/training.csv',sep=',')
X = df["text"]
y = np.array(df["label"])

In [6]:
from transformers import AutoTokenizer

# Initialize the tokenizer (change "model_name" to the appropriate model)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode sentences and generate attention masks
encoded_dict = tokenizer.batch_encode_plus(
    list(X),
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'  # Return PyTorch tensors
)

# Retrieve encoded sentences and attention masks
input_ids = encoded_dict['input_ids']
attention_masks = encoded_dict['attention_mask']

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
block_size = 87
batch_size = 64
n_embed = 60
n_heads = 5
head_size = 12
n_layers = 2
vocab_size = tokenizer.vocab_size
X = torch.tensor(input_ids,device = device)
y = torch.tensor(y, device = device)
attention_masks = torch.tensor(attention_masks,device=device)

  X = torch.tensor(input_ids,device = device)
  attention_masks = torch.tensor(attention_masks,device=device)


In [8]:
def get_random_batches(X, y, attention_masks, batch_size=32, num_batches=5):
    idx = random.randint(0,len(X) - batch_size)
    return X[idx:idx + batch_size],y[idx:idx + batch_size],attention_masks[idx:idx + batch_size]

In [21]:
model = Encoder(n_classes=6).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.006)

for iter in range(1001):

    # sample a batch of data
    xb, yb,ab = get_random_batches(X,y,attention_masks,batch_size=512)

    # evaluate the loss
    logits = model(xb,ab, F.one_hot(yb,num_classes=6))
    loss = F.cross_entropy(logits,F.one_hot(yb,num_classes=6).type(torch.float32))
    if iter % 100 == 0:
        print(loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

tensor(1.8413, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.9070, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.3533, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0535, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0230, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0832, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0162, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0276, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0593, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0275, device='cuda:0', grad_fn=<DivBackward1>)
tensor(0.0121, device='cuda:0', grad_fn=<DivBackward1>)


In [24]:
xtest = "I'm not happy now i hate this"
xt = tokenizer(xtest,padding="max_length",max_length=87,truncation=True)
model(torch.tensor(xt['input_ids'],device=device).unsqueeze(0),torch.tensor(xt['attention_mask'],device=device).unsqueeze(0))

tensor([[-4.8733,  7.4193,  1.4417,  2.3786, -6.9307, -1.4388]],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [None]:
for iter in range(1001):

    # sample a batch of data
    xb, yb,ab = get_random_batches(X,y,attention_masks,batch_size=128)

    # evaluate the loss
    logits = model(xb,ab, F.one_hot(yb,num_classes=6))
    loss = F.cross_entropy(logits,F.one_hot(yb,num_classes=6).type(torch.float32))
    if iter % 100 == 0:
        print(loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [25]:
xb, yb,ab = get_random_batches(X,y,attention_masks,batch_size=3000)

    # evaluate the loss
logits = model(xb,ab, F.one_hot(yb,num_classes=6))
loss = F.cross_entropy(logits,F.one_hot(yb,num_classes=6).type(torch.float32))
loss

tensor(0.0228, device='cuda:0', grad_fn=<DivBackward1>)

In [171]:
df['label'].unique()

array([0, 3, 2, 5, 4, 1], dtype=int64)

In [55]:
df.iloc[np.where(df['label'] == 1)]

Unnamed: 0,text,label
8,i have been with petronas for years i feel tha...,1
11,i do feel that running is a divine experience ...,1
14,i have immense sympathy with the general point...,1
15,i do not feel reassured anxiety is on each side,1
22,i have the feeling she was amused and delighted,1
...,...,...
15986,i had a horrible horrible horrible time and ho...,1
15987,i feel energized but i find that i am much mor...,1
15990,i feel really glad that i dont look like the c...,1
15993,i most days feel like if braeden and calvin ar...,1


In [54]:
B = 3
T = 3
w = torch.rand((B,T,T))
a = torch.tensor([[0,0,1],[0,1,0],[1,0,0]])


In [57]:
w

tensor([[[0.8488, 0.7205, 0.6193],
         [0.8322, 0.6145, 0.1708],
         [0.5892, 0.1883, 0.7754]],

        [[0.0939, 0.6656, 0.2729],
         [0.1989, 0.7057, 0.7418],
         [0.5035, 0.7337, 0.3003]],

        [[0.4917, 0.6449, 0.6367],
         [0.1367, 0.6154, 0.2820],
         [0.8568, 0.5544, 0.4145]]])

In [35]:
df = pd.read_csv('data/validation.csv',sep=',')
X_test = df["text"]
y_test = torch.tensor(df["label"],device=device)

encoded_dict = tokenizer.batch_encode_plus(
    list(X_test),
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'  # Return PyTorch tensors
)

# Retrieve encoded sentences and attention masks
input_ids = torch.tensor(encoded_dict['input_ids'],device=device)
attention_masks = torch.tensor(encoded_dict['attention_mask'],device=device)

  input_ids = torch.tensor(encoded_dict['input_ids'],device=device)
  attention_masks = torch.tensor(encoded_dict['attention_mask'],device=device)


In [36]:
o = model(input_ids,attention_masks,y_test)
F.cross_entropy(o,y_test)

tensor(0.7944, device='cuda:0', grad_fn=<NllLossBackward0>)

In [59]:
mask = a.unsqueeze(-1).float()

In [60]:
w_masked = w * mask 

In [61]:
w_masked

tensor([[[0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000],
         [0.5892, 0.1883, 0.7754]],

        [[0.0000, 0.0000, 0.0000],
         [0.1989, 0.7057, 0.7418],
         [0.0000, 0.0000, 0.0000]],

        [[0.4917, 0.6449, 0.6367],
         [0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000]]])

In [39]:
w.shape

torch.Size([32, 87, 87])