In [None]:
# Clone the lra-benchmarks repository into the current working directory.
!git clone https://github.com/guyd1995/lra-benchmarks.git

Cloning into 'lra-benchmarks'...
remote: Enumerating objects: 154, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 154 (delta 27), reused 26 (delta 26), pack-reused 125[K
Receiving objects: 100% (154/154), 29.88 KiB | 7.47 MiB/s, done.
Resolving deltas: 100% (91/91), done.


In [None]:
# Create the dataset directory; -p keeps the cell idempotent so a re-run does
# not fail when the directory already exists.
!mkdir -p datasets

In [None]:
# Make datasets/ the working directory so the download and extraction cells
# that follow write their files there.
%cd datasets

/kaggle/working/datasets


In [None]:
# Download the IMDB review archive; -nc (no-clobber) skips the download when
# the file is already present instead of saving a duplicate copy on re-run.
!wget -nc https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2024-03-24 21:05:18--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: 'aclImdb_v1.tar.gz'


2024-03-24 21:05:28 (8.34 MB/s) - 'aclImdb_v1.tar.gz' saved [84125825/84125825]



In [None]:
# Unpack the gzip-compressed archive quietly: the verbose flag would print one
# line per extracted file and flood the cell output.
!tar -xzf aclImdb_v1.tar.gz

In [None]:
import pandas as pd
import re
import glob
import random
import torch
import torch.nn as nn
from transformers import AutoTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup, BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
import numpy as np
import time

In [None]:
def _read_labelled(paths, label):
    """Read each file in ``paths`` as UTF-8 text and pair it with ``label``."""
    samples = []
    for path in paths:
        with open(path, 'r', encoding="utf-8") as handle:
            samples.append((handle.read(), label))
    return samples


# File lists for every split/sentiment combination of the extracted archive.
neg_files_test = glob.glob('/kaggle/working/datasets/aclImdb/test/neg/*.txt')
pos_files_test = glob.glob('/kaggle/working/datasets/aclImdb/test/pos/*.txt')
neg_files_train = glob.glob('/kaggle/working/datasets/aclImdb/train/neg/*.txt')
pos_files_train = glob.glob('/kaggle/working/datasets/aclImdb/train/pos/*.txt')

# Each entry is a (review_text, label) tuple: label 0 for the neg folders,
# label 1 for the pos folders. Negative reviews come first in both lists.
train_data = _read_labelled(neg_files_train, 0) + _read_labelled(pos_files_train, 1)
test_data = _read_labelled(neg_files_test, 0) + _read_labelled(pos_files_test, 1)



In [None]:
# Shuffle both splits in place, then keep the first tenth of each one.
random.shuffle(test_data)
random.shuffle(train_data)
test_data = test_data[:len(test_data)//10]
# Slice train_data itself: slicing test_data here would replace the training
# reviews with the already-truncated test reviews, making both splits identical.
train_data = train_data[:len(train_data)//10]

In [None]:
# Column-oriented views of the (text, label) tuples, one dict per split.
test_ = {
    "Sentence": [text for text, _ in test_data],
    'class': [cls for _, cls in test_data],
}
train_ = {
    "Sentence": [text for text, _ in train_data],
    'class': [cls for _, cls in train_data],
}

df_train = pd.DataFrame(train_)
df_test = pd.DataFrame(test_)

# Stack the training rows on top of the test rows into a single frame.
df = pd.concat([df_train, df_test])
df.shape

(5000, 2)

In [None]:
class SwiGLU(nn.Module):
    """Gated SiLU unit: splits the last dimension in half and gates one half.

    For an input whose last dimension has size ``2 * d`` the first ``d``
    features are the values and the last ``d`` features are the gate; the
    result is ``values * silu(gate)`` with last dimension ``d``.
    """

    def __init__(self):
        super(SwiGLU, self).__init__()

    def forward(self, x):
        """Apply the gate.

        Args:
            x (Tensor): tensor whose last dimension has an even size.

        Returns:
            Tensor: same shape as ``x`` except the last dimension is halved.

        Raises:
            ValueError: if the last dimension of ``x`` has an odd size.
        """
        if x.size(-1) % 2 != 0:
            raise ValueError(
                f"SwiGLU needs an even last dimension, got {x.size(-1)}"
            )
        # chunk(2) yields exactly two halves. torch.split with an integer
        # argument produces chunks *of that size*, which cannot be unpacked
        # into two tensors for realistic feature sizes.
        out, gate = x.chunk(2, dim=-1)
        gate = torch.nn.functional.silu(gate)
        return out * gate

class CosformerAttention(nn.Module):
    """
    cosformer attention in "cosFormer: Rethinking Softmax In Attention"
    https://arxiv.org/abs/2202.08791

    Args:
        embed_dim (int): feature size of the query and of the output.
        num_heads (int): number of heads; must divide ``embed_dim``.
        act_fun (str): activation applied to the projected query and key,
            one of ``"relu"``, ``"elu"``, ``"swiglu"`` or ``"silu"``.
        kdim (int, optional): feature size of ``key``; defaults to ``embed_dim``.
        vdim (int, optional): feature size of ``value``; defaults to ``embed_dim``.
        dropout_rate (float): stored on the module; ``forward`` does not apply it.
        causal (bool): use the cumulative-sum (causal) formulation.
        has_outproj (bool): apply the output projection to the result.

    Raises:
        ValueError: if ``act_fun`` is not one of the supported names.
    """
    def __init__(
        self,
        embed_dim,
        num_heads,
        act_fun,
        kdim=None,
        vdim=None,
        dropout_rate=0.0,
        causal=False,
        has_outproj=True
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        # Test vdim itself: checking kdim here would silently discard an
        # explicit vdim whenever kdim is left at its default.
        self.vdim = vdim if vdim is not None else embed_dim
        self.num_heads = num_heads
        self.has_outproj = has_outproj
        self.act_fun = self.get_act_fun(act_fun)
        # q, k, v projection
        self.k_proj = nn.Linear(self.kdim, embed_dim)
        self.v_proj = nn.Linear(self.vdim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        # outprojection
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        # dropout rate
        self.dropout_rate = dropout_rate
        # causal
        self.causal = causal

        assert (self.embed_dim % self.num_heads == 0), "embed_dim must be divisible by num_heads"

    def get_index(self, seq_len):
        """Return ``pi / 2 * [1, ..., seq_len]`` shaped ``(1, seq_len, 1)``."""
        index = np.pi / 2 * torch.arange(1, seq_len + 1).reshape(1, -1, 1)

        return nn.Parameter(index, requires_grad=False)

    def get_act_fun(self, act_fun):
        """Map an activation name to a module.

        Raises:
            ValueError: for an unsupported name, so a typo fails at
                construction time rather than as a ``None`` call in ``forward``.
        """
        if act_fun == "relu":
            return nn.ReLU()
        elif act_fun == "elu":
            return nn.ELU(inplace=True)
        elif act_fun == "swiglu":  # gated SiLU unit defined in this file
            # NOTE(review): a gate that shrinks the feature dimension does not
            # match the head reshape in forward, which assumes the projected
            # size stays embed_dim — confirm before using this option.
            return SwiGLU()
        elif act_fun == "silu":
            return nn.SiLU()
        raise ValueError(
            f"Unsupported act_fun {act_fun!r}; expected 'relu', 'elu', 'swiglu' or 'silu'"
        )

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor = None,
        value: torch.Tensor = None,
        attn_mask: torch.Tensor = None,
        eps: float = 1e-6
    ):
        """Input shape: Sequence x Batch x Embedding
        Args:
            query (Tensor): `(L, N, E)` where L is the target sequence length, N is the batch size,
            E is the embedding dimension.
            key (Tensor): `(S, N, E)` where S is the source sequence length, N is the batch size,
            E is the embedding dimension. Defaults to ``query``.
            value (Tensor): `(S, N, E)` where S is the source sequence length, N is the batch size,
            E is the embedding dimension. Defaults to ``query``.
            attn_mask (Optional[Tensor], optional): accepted for interface compatibility;
            this implementation does not read it (default: None).
            eps (float): lower bound applied to the normaliser before dividing.
        Returns:
            Tensor: `(L, N, E)` attention output.
        """
        if key is None:
            key = query
        if value is None:
            value = query

        num_heads = self.num_heads
        tgt_len, bsz, embed_dim = query.size()
        src_len = key.size(0)
        head_dim = embed_dim // num_heads

        # get q, k, v
        # (L, N, E)
        q = self.q_proj(query)
        # (S, N, E)
        k = self.k_proj(key)
        # (S, N, E)
        v = self.v_proj(value)

        # activation
        q = self.act_fun(q)
        k = self.act_fun(k)

        # multihead reshape
        # (N * h, L, d)
        q = q.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
        # (N * h, S, d)
        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
        # (N * h, S, d)
        v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)

        # cos transform
        m = max(src_len, tgt_len)
        # build the position index with the dtype and device of q
        weight_index = self.get_index(m).to(q)
        # (N * h, L, 2 * d)
        q_ = torch.cat([q * torch.sin(weight_index[:, :tgt_len, :] / m), q * torch.cos(weight_index[:, :tgt_len, :] / m)], dim=-1)
        # (N * h, S, 2 * d)
        k_ = torch.cat([k * torch.sin(weight_index[:, :src_len, :] / m), k * torch.cos(weight_index[:, :src_len, :] / m)], dim=-1)

        if self.causal:
            # Need to improve speed!
            # (N * h, L, 2 * d) (N * h, L, d) -> (N * h, L, 2 * d, d)
            kv_ = torch.einsum("nld,nlm->nldm", k_, v)
            # running sum over positions: (N * h, L, 2 * d, d)
            kv_cum = torch.cumsum(kv_, dim=1)
            # (N * h, L, 2 * d) (N * h, L, 2 * d, d) -> (N * h, L, d)
            qkv = torch.einsum("nld,nldm->nlm", q_, kv_cum)
            # running sum over positions: (N * h, L, 2 * d)
            k_cum = torch.cumsum(k_, dim=1)
            # (N * h, L, 2 * d) (N * h, L, 2 * d) -> (N * h, L)
            denom = torch.clamp_min(torch.einsum("nlm,nlm->nl", q_, k_cum), eps)
            # (N * h, L, d) (N * h, L, 1) -> (N * h, L, d)
            attn_output = qkv / denom.unsqueeze(-1)
            # (N * h, L, d) -> (L, N * h, d) -> (L, N, E)
            attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, -1)
        else:
            # (N * h, S, 2 * d) (N * h, S, d) -> (N * h, 2 * d, d)
            kv_ = torch.einsum('nld,nlm->ndm', k_, v)
            # (N * h, L, 2 * d) (N * h, 2 * d) -> (N * h, L)
            z_ = 1 / torch.clamp_min(torch.einsum('nld,nd->nl', q_, torch.sum(k_, axis=1)), eps)
            # (N * h, L, 2 * d) (N * h, 2 * d, d) (N * h, L) -> (N * h, L, d)
            attn_output = torch.einsum('nld,ndm,nl->nlm', q_, kv_, z_)
            # (N * h, L, d) -> (L, N * h, d) -> (L, N, E)
            attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, -1)
        # L, N, E
        if self.has_outproj:
            attn_output = self.out_proj(attn_output)

        return attn_output

class ReluBERT(nn.Module):
    """RoBERTa-base encoder followed by dropout and a linear classifier.

    A ``CosformerAttention`` layer built with ``act_fun='relu'`` is registered
    as ``self.attention``.

    NOTE(review): ``forward`` never calls ``self.attention``; the logits are
    computed from the encoder's pooled output only. Confirm whether the layer
    was meant to be applied.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("FacebookAI/roberta-base")
        cfg = self.bert.config
        self.attention = CosformerAttention(
            embed_dim=cfg.hidden_size,
            num_heads=cfg.num_attention_heads,
            act_fun='relu',
        )
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(cfg.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and its mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))


class EluBERT(nn.Module):
    """RoBERTa-base encoder followed by dropout and a linear classifier.

    A ``CosformerAttention`` layer built with ``act_fun='elu'`` is registered
    as ``self.attention``.

    NOTE(review): ``forward`` never calls ``self.attention``; the logits are
    computed from the encoder's pooled output only. Confirm whether the layer
    was meant to be applied.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("FacebookAI/roberta-base")
        cfg = self.bert.config
        self.attention = CosformerAttention(
            embed_dim=cfg.hidden_size,
            num_heads=cfg.num_attention_heads,
            act_fun='elu',
        )
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(cfg.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and its mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))


class SwigluBERT(nn.Module):
    """RoBERTa-base encoder followed by dropout and a linear classifier.

    A ``CosformerAttention`` layer built with ``act_fun='swiglu'`` is
    registered as ``self.attention``.

    NOTE(review): ``forward`` never calls ``self.attention``; the logits are
    computed from the encoder's pooled output only. Confirm whether the layer
    was meant to be applied.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("FacebookAI/roberta-base")
        cfg = self.bert.config
        self.attention = CosformerAttention(
            embed_dim=cfg.hidden_size,
            num_heads=cfg.num_attention_heads,
            act_fun='swiglu',
        )
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(cfg.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and its mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))


class SiluBERT(nn.Module):
    """RoBERTa-base encoder followed by dropout and a linear classifier.

    A ``CosformerAttention`` layer built with ``act_fun='silu'`` is registered
    as ``self.attention``.

    NOTE(review): ``forward`` never calls ``self.attention``; the logits are
    computed from the encoder's pooled output only. Confirm whether the layer
    was meant to be applied.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("FacebookAI/roberta-base")
        cfg = self.bert.config
        self.attention = CosformerAttention(
            embed_dim=cfg.hidden_size,
            num_heads=cfg.num_attention_heads,
            act_fun='silu',
        )
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(cfg.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and its mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))


class VanillaBERT(nn.Module):
    """RoBERTa-base encoder followed by dropout and a linear classifier.

    An ``nn.MultiheadAttention`` layer sized from the encoder config is
    registered as ``self.attention``.

    NOTE(review): ``forward`` never calls ``self.attention``; the logits are
    computed from the encoder's pooled output only. Confirm whether the layer
    was meant to be applied.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = RobertaModel.from_pretrained("FacebookAI/roberta-base")
        cfg = self.bert.config
        self.attention = nn.MultiheadAttention(
            embed_dim=cfg.hidden_size,
            num_heads=cfg.num_attention_heads,
        )
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(cfg.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and its mask."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))

def train(model, train_dataloader, val_dataloader, loss, optimizer, scheduler, num_epochs):
    """Fine-tune ``model`` and validate it after every epoch.

    Relies on two notebook-level names: ``device`` (where batches are moved)
    and ``tokenizer`` (its ``pad_token_id`` defines the attention mask).

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores.
        train_dataloader: yields batches whose first two items are the token
            ids and the labels.
        val_dataloader: same batch layout, used for the accuracy check.
        loss: callable ``loss(outputs, labels)`` returning a scalar tensor.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per training batch.
        num_epochs (int): number of passes over ``train_dataloader``.

    Returns:
        None. Progress, accuracy and throughput are printed.
    """
    total_train_time = 0.0
    total_val_time = 0.0

    for epoch in range(num_epochs):
        # Switch back to training mode every epoch: the validation pass below
        # leaves the model in eval mode, which would otherwise keep dropout
        # disabled for every epoch after the first.
        model.train()
        start_time = time.time()
        total_loss = 0.0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch[:2]

            # 1.0 for real tokens, 0.0 for positions holding the pad id.
            attention_mask = (input_ids != tokenizer.pad_token_id).float()

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)

            loss_comp = loss(outputs, labels)
            total_loss += loss_comp.item()
            loss_comp.backward()
            optimizer.step()
            scheduler.step()
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss}")
        total_train_time += time.time() - start_time

        model.eval()
        val_preds = []
        val_true = []
        start_time = time.time()
        with torch.no_grad():
            for batch in val_dataloader:
                batch = tuple(t.to(device) for t in batch)
                input_ids, labels = batch[:2]

                attention_mask = (input_ids != tokenizer.pad_token_id).float()

                outputs = model(input_ids, attention_mask)
                preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist()
                val_preds.extend(preds)
                val_true.extend(labels.cpu().numpy().tolist())

        val_accuracy = accuracy_score(val_true, val_preds)
        print(f"Validation Accuracy: {val_accuracy}")
        total_val_time += time.time() - start_time

    # Count the steps from the arguments rather than from notebook globals, so
    # the throughput figures always describe this particular run.
    train_steps = num_epochs * len(train_dataloader)
    val_steps = num_epochs * len(val_dataloader)
    avg_train_speed = train_steps / total_train_time if total_train_time > 0 else 0.0
    avg_val_speed = val_steps / total_val_time if total_val_time > 0 else 0.0
    print(f"Average Training Speed: {avg_train_speed:.2f} steps/sec")
    print(f"Average Validation Speed: {avg_val_speed:.2f} steps/sec")


In [None]:
# Use the GPU whenever CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Tokenizer for the same "FacebookAI/roberta-base" checkpoint the model
# classes load.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")


cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
labels = list(df['class'])  # labels

max_seq_length = 300

# Encode every review, truncating to at most max_seq_length token ids.
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=max_seq_length, truncation=True) for text in list(df.Sentence)]

# Right-pad every sequence to the longest one with the tokenizer's own pad id.
# The training loop builds its attention mask as
# `input_ids != tokenizer.pad_token_id`, so padding with any other value
# (such as a literal 0) leaves the padded positions unmasked.
max_len = max(len(text) for text in tokenized_texts)
pad_id = tokenizer.pad_token_id
padded_texts = [text + [pad_id]*(max_len - len(text)) for text in tokenized_texts]

input_ids = torch.tensor(padded_texts)
labels = torch.tensor(labels)

# Hold out 20% of the rows for validation; fixed seed for a repeatable split.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)

train_dataset = torch.utils.data.TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = torch.utils.data.TensorDataset(val_inputs, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [None]:
# Build the ELU variant and move it to the selected device.
elu_model = EluBERT(num_labels=2).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(elu_model.parameters(), lr=2e-5)

# Step counts for a 5-epoch run over each loader.
total_steps_val = 5 * len(val_dataloader)
total_steps = 5 * len(train_dataloader)

# Linear decay with no warmup, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
# Fine-tune and validate the ELU variant for 5 epochs.
train(
    model=elu_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    loss=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=5,
)

In [None]:
# Build the SwiGLU variant and move it to the selected device.
swiglu_model = SwigluBERT(num_labels=2).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(swiglu_model.parameters(), lr=2e-5)

# Step counts for a 5-epoch run over each loader.
total_steps_val = 5 * len(val_dataloader)
total_steps = 5 * len(train_dataloader)

# Linear decay with no warmup, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
# Fine-tune and validate the SwiGLU variant for 5 epochs.
train(
    model=swiglu_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    loss=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=5,
)

In [None]:
# Build the ReLU variant and move it to the selected device.
relu_model = ReluBERT(num_labels=2).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(relu_model.parameters(), lr=2e-5)

# Step counts for a 5-epoch run over each loader.
total_steps_val = 5 * len(val_dataloader)
total_steps = 5 * len(train_dataloader)

# Linear decay with no warmup, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
# Fine-tune and validate the ReLU variant for 5 epochs.
train(
    model=relu_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    loss=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=5,
)

In [None]:
# Build the SiLU variant and move it to the selected device.
silu_model = SiluBERT(num_labels=2).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(silu_model.parameters(), lr=2e-5)

# Step counts for a 5-epoch run over each loader.
total_steps_val = 5 * len(val_dataloader)
total_steps = 5 * len(train_dataloader)

# Linear decay with no warmup, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
# Fine-tune and validate the SiLU variant for 5 epochs.
train(
    model=silu_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    loss=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=5,
)