# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re


# Placeholder vocabulary (token string -> integer id). Only the two special
# tokens are defined here; the real vocabulary has to be added before training.
vocab = {"[PAD]": 0, "[UNK]": 1}  # Add the rest of your vocabulary here
# Inverse mapping (integer id -> token string), intended for decoding.
reverse_vocab = dict(zip(vocab.values(), vocab.keys()))
# Ids of the padding and unknown tokens, looked up once at module level.
pad_token_id, unk_token_id = vocab["[PAD]"], vocab["[UNK]"]


class Tokenizer:
    """Greedy longest-match subword tokenizer over a fixed vocabulary.

    Text is first split into runs of word characters and single punctuation
    characters. Each piece is then segmented left to right into the longest
    substrings that are present in ``vocab``.

    Args:
        vocab: Mapping from token string to integer id. It must contain
            "[UNK]" whenever a piece cannot be fully segmented.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        self.pattern = re.compile(r'[\w]+|[^\w\s]')  # Regex to split words and punctuation

    def encode(self, text):
        """Return the list of token ids for ``text`` (empty list for empty text)."""
        return [
            token_id
            for token in self.pattern.findall(text)
            for token_id in self.find_subwords(token)
        ]

    def find_subwords(self, token):
        """Segment one pre-split token into vocabulary ids.

        At each position the longest substring found in the vocabulary is
        consumed. If no substring starting at that position is known, a single
        "[UNK]" id is emitted and the scan advances by one character.

        Raises:
            KeyError: If an unknown character is met and the vocabulary has no
                "[UNK]" entry.
        """
        subwords = []
        start = 0
        while start < len(token):
            for end in range(len(token), start, -1):
                subword = token[start:end]
                if subword in self.vocab:
                    subwords.append(self.vocab[subword])
                    start = end
                    break
            else:
                # Use the tokenizer's own vocabulary for the fallback id rather
                # than a module-level global, so the class is self-contained.
                subwords.append(self.vocab["[UNK]"])
                start += 1
        return subwords

class TextDataset(Dataset):
    """Dataset of (token ids, label) pairs with fixed-length sequences.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of labels, indexable in step with ``texts``; each one
            is converted to a ``torch.long`` tensor.
        tokenizer: Object exposing ``encode(text) -> list[int]`` and a ``vocab``
            mapping that contains "[PAD]".
        max_seq_len: Every returned id sequence has exactly this length.
    """

    def __init__(self, texts, labels, tokenizer, max_seq_len):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, label)`` for sample ``idx`` as long tensors."""
        # Copy before editing so the tokenizer's returned list is never mutated,
        # and truncate sequences that are too long.
        encoded_text = list(self.tokenizer.encode(self.texts[idx]))[:self.max_seq_len]

        # Take the pad id from the tokenizer that produced the ids (as the v2
        # dataset does) instead of relying on a module-level global.
        pad_id = self.tokenizer.vocab["[PAD]"]
        encoded_text += [pad_id] * (self.max_seq_len - len(encoded_text))

        return (
            torch.tensor(encoded_text, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

# Token embedding layer; the vocabulary size is passed in explicitly rather than read from the tokenizer.
class EmbeddingLayer(nn.Module):
    """Thin wrapper around ``nn.Embedding`` mapping token ids to vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, token_ids):
        """Look up the embedding of every id in ``token_ids``."""
        table = self.embedding
        return table(token_ids)

# Pre-norm Transformer encoder layer built on nn.MultiheadAttention.
class TransformerEncoderLayer(nn.Module):
    """Pre-norm Transformer encoder layer: self-attention then feed-forward.

    Inputs and outputs are batch-first, ``[batch_size, seq_len, d_model]``,
    which is the layout the embedding layer produces and the pooler consumes.

    Args:
        d_model: Feature size of the input and output.
        nhead: Number of attention heads (must divide ``d_model``).
        dim_feedforward: Hidden size of the position-wise feed-forward network.
        dropout: Dropout probability for the attention weights and for both
            residual branches.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout):
        super().__init__()
        # batch_first=True is required: nn.MultiheadAttention otherwise treats
        # dim 0 as the sequence axis, so a [batch, seq, d] input would attend
        # across different samples of the batch instead of across tokens.
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model, num_heads=nhead, dropout=dropout, batch_first=True
        )
        self.ffnn = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Apply the layer to ``src`` and return a tensor of the same shape."""
        # Attention sub-layer with residual connection (norm applied first).
        normed = self.norm1(src)
        attn_out, _ = self.self_attn(normed, normed, normed, need_weights=False)
        src = src + self.dropout(attn_out)
        # Feed-forward sub-layer with residual connection.
        src = src + self.dropout(self.ffnn(self.norm2(src)))
        return src

# Pooler: takes the first token's hidden state of every sequence (input is assumed to be [batch_size, seq_len, d_model]) and applies a dense layer followed by tanh.
class Pooler(nn.Module):
    """Summarise a sequence by transforming its first position.

    Args:
        d_model: Feature size of the input; the output has the same size.
    """

    def __init__(self, d_model):
        super().__init__()
        self.linear = nn.Linear(d_model, d_model)
        self.activation = nn.Tanh()

    def forward(self, input_tensor):
        """Return ``tanh(linear(input_tensor[:, 0]))``.

        ``input_tensor`` is indexed as ``[:, 0]``, i.e. position 0 along the
        second axis is selected for every entry of the first axis.
        """
        leading_state = input_tensor[:, 0]
        return self.activation(self.linear(leading_state))

# Build the tokenizer from the module-level vocabulary.
tokenizer = Tokenizer(vocab=vocab)

# Load and preprocess data. The CSV is expected to provide "text" and "label"
# columns, which are the two columns read below.
train_data = pd.read_csv("train.csv")
train_texts = train_data["text"].tolist()
train_labels = train_data["label"].tolist()

# Convert texts and labels into a Dataset and DataLoader
max_seq_len = 512
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_seq_len)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# The embedding table needs one row per vocabulary entry.
vocab_size = len(vocab)
embedding_dim = 128

# nn.CrossEntropyLoss expects one logit per class. The pooler alone emits an
# `embedding_dim`-wide tanh activation, which is not a valid logit vector, so a
# linear classification head is appended.
# NOTE(review): assumes labels are integer class indices 0..C-1 — confirm
# against train.csv.
num_classes = int(max(train_labels)) + 1

# Define the model
model = nn.Sequential(
    EmbeddingLayer(vocab_size=vocab_size, embedding_dim=embedding_dim),
    TransformerEncoderLayer(d_model=embedding_dim, nhead=8, dim_feedforward=2048, dropout=0.1),
    Pooler(d_model=embedding_dim),
    nn.Linear(embedding_dim, num_classes),
)

# Training setup: run on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# NOTE(review): nn.Sequential cannot forward a padding mask, so [PAD] positions
# take part in self-attention here.
for epoch in range(5):
    model.train()
    total_loss = 0
    for token_ids, labels in train_dataloader:
        token_ids, labels = token_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        output = model(token_ids)  # [batch_size, num_classes] logits
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean loss per batch over the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")



# v2

# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re
from collections import Counter
import math

def load_corpus(file_path):
    """Read the whole file at ``file_path`` as UTF-8 text and return it.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The complete file contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as corpus_file:
        return corpus_file.read()

def tokenize(text):
    """Split ``text`` into word tokens, whitespace characters and basic punctuation.

    A token is either a run of word characters delimited by word boundaries, or
    a single character out of whitespace and ``. , ! ? ;``. Any other character
    is dropped.

    Returns:
        The list of matched tokens in order of appearance.
    """
    token_pattern = re.compile(r'\b\w+\b|[\s\.,!?;]')
    return token_pattern.findall(text)

def build_vocab(tokens, max_vocab_size=10000):
    """Build a token -> id vocabulary from an iterable of tokens.

    Ids 0-3 are reserved for "[PAD]", "[UNK]", "[CLS]" and "[SEP]". The
    remaining ids are given to corpus tokens in order of decreasing frequency,
    with ties broken alphabetically.

    Args:
        tokens: Iterable of token strings.
        max_vocab_size: Upper bound on the vocabulary size, special tokens
            included. Values smaller than the number of special tokens yield
            just the special tokens.

    Returns:
        Dict mapping each token to its integer id.
    """
    vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3}
    # Clamp at zero: a negative slice bound would count from the end of the
    # list and add tokens although no room is left.
    budget = max(0, max_vocab_size - len(vocab))
    ranked = sorted(Counter(tokens).items(), key=lambda item: (-item[1], item[0]))
    for token, _ in ranked[:budget]:
        # setdefault keeps the reserved ids intact if the corpus happens to
        # contain a string equal to one of the special tokens.
        vocab.setdefault(token, len(vocab))
    return vocab

def cosine_annealing_scheduler(optimizer, initial_lr, num_warmup_steps, num_training_steps):
    """Create a linear-warmup / cosine-decay learning-rate schedule.

    The multiplier applied to the optimizer's base learning rate rises linearly
    from 0 to 1 over ``num_warmup_steps``, then follows half a cosine period
    down to 0 at ``num_training_steps`` and stays at 0 afterwards.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        initial_lr: Unused; kept for interface compatibility. The base rate is
            taken from ``optimizer`` by ``LambdaLR``.
        num_warmup_steps: Number of warmup steps.
        num_training_steps: Total number of steps, warmup included.

    Returns:
        A ``torch.optim.lr_scheduler.LambdaLR`` instance.
    """
    decay_steps = max(1, num_training_steps - num_warmup_steps)

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        # Clamp progress at 1.0: without it the cosine starts rising again once
        # training runs past num_training_steps.
        progress = min(1.0, float(current_step - num_warmup_steps) / float(decay_steps))
        return 0.5 * (1. + math.cos(math.pi * progress))

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

class AdaptiveDropoutLayer(nn.Module):
    """Dropout whose rate is stored as a logit-parameterised ``nn.Parameter``.

    Args:
        init_dropout_rate: Initial dropout probability. It is clamped to the
            open interval (0, 1) so that the logit is finite.
    """

    _EPS = 1e-6

    def __init__(self, init_dropout_rate=0.1):
        super().__init__()
        # Clamp so that rates of exactly 0 or 1 do not hit log(0) or a
        # division by zero in the logit.
        rate = min(max(float(init_dropout_rate), self._EPS), 1.0 - self._EPS)
        # The rate is stored as a logit; sigmoid maps it back into (0, 1).
        self.log_alpha = nn.Parameter(
            torch.tensor(math.log(rate / (1.0 - rate)), dtype=torch.float32)
        )

    def forward(self, x):
        """Apply dropout to ``x`` with the current rate (identity in eval mode)."""
        # F.dropout requires a Python float for `p`; a tensor that requires
        # grad is rejected with a TypeError.
        # NOTE(review): because the rate enters as a plain float, `log_alpha`
        # receives no gradient from this op; a differentiable relaxation would
        # be needed for the rate to actually be learned.
        p = float(torch.sigmoid(self.log_alpha.detach()))
        return nn.functional.dropout(x, p=p, training=self.training)



def combined_loss(output, target, model, l2_reg_strength=1.0, l1_reg_strength=0.0):
    """Cross-entropy loss plus L2 and L1 penalties over the model's parameters.

    Args:
        output: Logits passed to the cross-entropy loss.
        target: Targets passed to the cross-entropy loss.
        model: Object whose ``parameters()`` are penalised.
        l2_reg_strength: Weight of the sum of squared parameter values.
        l1_reg_strength: Weight of the sum of absolute parameter values.

    Returns:
        The cross-entropy loss plus the weighted penalties.
    """
    penalty = 0
    for weights in model.parameters():
        if not isinstance(weights, nn.Parameter):
            continue
        penalty = penalty + weights.pow(2).sum() * l2_reg_strength  # L2
        penalty = penalty + weights.abs().sum() * l1_reg_strength  # L1

    data_term = nn.functional.cross_entropy(output, target)
    return data_term + penalty

class AdaptiveWeightDecayOptimizer(optim.Optimizer):
    """Plain gradient descent with an L2 weight-decay term.

    The decay strength is stored as ``exp(log_l2_strength)`` so that it is
    always positive.

    Args:
        params: Iterable of parameters (or parameter groups) to optimise.
        lr: Learning rate.
        init_l2_strength: Initial weight-decay coefficient (must be > 0).
    """

    def __init__(self, params, lr=1e-3, init_l2_strength=0.01):
        super().__init__(params, {'lr': lr})
        # NOTE(review): this parameter is not part of any param group, so no
        # code in this class ever changes it — the decay strength stays at its
        # initial value unless it is trained elsewhere.
        self.log_l2_strength = nn.Parameter(torch.tensor(math.log(init_l2_strength)).float())

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one update ``p <- p - lr * (grad + weight_decay * p)``.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The closure's loss, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # Convert once to a Python float: `alpha` arguments expect a number.
        weight_decay = float(torch.exp(self.log_l2_strength))

        for group in self.param_groups:
            lr = group['lr']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad
                if weight_decay != 0:
                    d_p = d_p.add(p, alpha=weight_decay)
                # Tensors have no `update` method; apply the step in place.
                p.add_(d_p, alpha=-lr)

        return loss


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encodings.

    Even feature indices hold sines and odd indices hold cosines of the
    position scaled by geometrically spaced frequencies.

    Args:
        max_seq_len: Number of positions to precompute.
        embedding_dim: Feature size of each encoding (even or odd).
    """

    def __init__(self, max_seq_len, embedding_dim):
        super().__init__()
        position = torch.arange(0, max_seq_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, embedding_dim, 2).float() * -(math.log(10000.0) / embedding_dim)
        )
        angles = position * div_term  # [max_seq_len, ceil(embedding_dim / 2)]

        table = torch.zeros(max_seq_len, embedding_dim)
        table[:, 0::2] = torch.sin(angles)
        # For odd embedding_dim there is one cosine column fewer than sine
        # columns, so trim before assigning.
        table[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

        # The table is constant, so register it as a buffer: it still follows
        # .to(device) and keeps the same state_dict key, but is no longer
        # returned by parameters() (and thus not seen by optimizers or
        # parameter-based regularisers).
        self.register_buffer("positional_embeddings", table)

    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions.

        The result has shape ``[x.size(1), embedding_dim]``.
        """
        return self.positional_embeddings[:x.size(1), :]

class MultiHeadLinformerAttention(nn.Module):
    """Multi-head self-attention with a reduced per-head dimension ``k``.

    Each head projects the input to ``k`` features for queries, keys and
    values, applies scaled dot-product attention along the sequence axis, and
    the concatenated head outputs are projected back to ``embed_dim``.

    NOTE(review): despite the class name, no projection along the sequence
    length (as in the Linformer paper) is performed; only the feature size per
    head is reduced.

    Args:
        embed_dim: Feature size of the input and output.
        num_heads: Number of attention heads.
        k: Per-head projection size; defaults to ``embed_dim // num_heads``.

    Raises:
        ValueError: If the resulting per-head size is not positive.
    """

    def __init__(self, embed_dim, num_heads, k=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.k = k if k is not None else embed_dim // num_heads  # Projection dimension
        if self.k <= 0:
            raise ValueError("per-head projection size k must be positive")

        # Separate projections for each head. Queries get their own projection
        # so the score matrix q @ k^T is well defined.
        self.query_projections = nn.ModuleList([nn.Linear(embed_dim, self.k) for _ in range(num_heads)])
        self.key_projections = nn.ModuleList([nn.Linear(embed_dim, self.k) for _ in range(num_heads)])
        self.value_projections = nn.ModuleList([nn.Linear(embed_dim, self.k) for _ in range(num_heads)])

        self.out_projection = nn.Linear(self.k * num_heads, embed_dim)

    def forward(self, query):
        """Self-attend over ``query``.

        Args:
            query: Tensor of shape ``[seq_len, batch_size, embed_dim]``.

        Returns:
            Tensor of shape ``[seq_len, batch_size, embed_dim]``.
        """
        # Work batch-first so that matmul attends along the sequence axis.
        x = query.transpose(0, 1)  # [batch, seq, embed]
        scale = 1.0 / math.sqrt(self.k)
        heads = []  # one [batch, seq, k] tensor per head

        for q_proj, k_proj, v_proj in zip(
            self.query_projections, self.key_projections, self.value_projections
        ):
            q, keys, values = q_proj(x), k_proj(x), v_proj(x)
            # [batch, seq, seq] weights; each row sums to 1 over key positions.
            weights = torch.softmax((q @ keys.transpose(-2, -1)) * scale, dim=-1)
            heads.append(weights @ values)

        # Concatenate outputs from all heads and project back to embed_dim.
        out = self.out_projection(torch.cat(heads, dim=-1))
        return out.transpose(0, 1)

class TextDataset(Dataset):
    """Dataset yielding fixed-length, randomly masked token-id sequences.

    Args:
        texts: Sequence of raw strings.
        labels: Sequence of labels, indexable in step with ``texts``.
        tokenizer: Object exposing ``encode(text) -> list[int]`` and a ``vocab``
            mapping that contains "[PAD]" (and "[MASK]" if masking is wanted).
        max_seq_len: Length of every returned sequence.
    """

    # Probability of replacing a token by [MASK] (15%, like BERT).
    MASK_PROBABILITY = 0.15
    # Tokens that are never replaced by [MASK].
    _PROTECTED_TOKENS = ("[PAD]", "[CLS]", "[SEP]")

    def __init__(self, texts, labels, tokenizer, max_seq_len):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels``.

        ``attention_mask`` is 1 for real tokens and 0 for padding; all three
        values are ``torch.long`` tensors.
        """
        # Truncate first: otherwise a negative padding length leaves sequences
        # longer than max_seq_len, which cannot be collated into a batch.
        # NOTE(review): truncation may cut a trailing [SEP] token.
        encoded_text = list(self.tokenizer.encode(self.texts[idx]))[:self.max_seq_len]
        encoded_text = self.dynamic_masking(encoded_text)

        # Padding
        padding_length = self.max_seq_len - len(encoded_text)
        attention_mask = [1] * len(encoded_text) + [0] * padding_length
        encoded_text += [self.tokenizer.vocab["[PAD]"]] * padding_length

        return {
            "input_ids": torch.tensor(encoded_text, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def dynamic_masking(self, encoded_text):
        """Randomly replace ids in ``encoded_text`` (in place) with the [MASK] id.

        Special tokens are left alone. If the tokenizer's vocabulary has no
        "[MASK]" entry the sequence is returned unchanged.
        """
        vocab = self.tokenizer.vocab  # this dataset's tokenizer, not a global
        mask_id = vocab.get("[MASK]")
        if mask_id is None:
            return encoded_text
        protected = {vocab[name] for name in self._PROTECTED_TOKENS if name in vocab}
        for i, token_id in enumerate(encoded_text):
            if token_id not in protected and np.random.rand() < self.MASK_PROBABILITY:
                encoded_text[i] = mask_id
        return encoded_text

'''
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_seq_len):
        super(EmbeddingLayer, self).__init__()
        self.token_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.positional_embeddings = PositionalEncoding(max_seq_len, embedding_dim)
        
    def forward(self, token_ids):
        token_embeds = self.token_embeddings(token_ids)  # [batch_size, seq_len, embedding_dim]
        position_embeds = self.positional_embeddings(token_embeds)  # [seq_len, embedding_dim]
        return token_embeds + position_embeds
'''


class AdaptiveEmbeddingLayer(nn.Module):
    """Embedding layer with separate tables for frequent and infrequent tokens.

    The vocabulary is split by ``split_vocab``; the "frequent" part gets an
    embedding table of width ``large_embed_dim`` and the rest a table of width
    ``small_embed_dim``. A ``PositionalEncoding`` of width ``large_embed_dim``
    is added to the looked-up embeddings in ``forward``.

    NOTE(review): this class looks unfinished. See the notes in ``split_vocab``
    and ``forward``; the intended meaning of the ``vocab`` values (counts vs.
    ids) has to be confirmed before it can be fixed.

    Args:
        vocab: Dict whose values are compared against ``freq_threshold``.
        freq_threshold: Values below this threshold fall into the
            "infrequent" part.
        large_embed_dim: Embedding width for frequent tokens.
        small_embed_dim: Embedding width for infrequent tokens.
        max_seq_len: Number of positions handed to ``PositionalEncoding``.
    """

    def __init__(self, vocab, freq_threshold, large_embed_dim, small_embed_dim, max_seq_len):
        super().__init__()
        # Populates self.frequent_vocab and self.infrequent_vocab.
        self.split_vocab(vocab, freq_threshold)

        # One table per vocabulary part, sized by the number of tokens in it.
        self.frequent_embeddings = nn.Embedding(len(self.frequent_vocab), large_embed_dim)
        self.infrequent_embeddings = nn.Embedding(len(self.infrequent_vocab), small_embed_dim)

        self.positional_embeddings = PositionalEncoding(max_seq_len, large_embed_dim)  


    def split_vocab(self, vocab, freq_threshold):
        """Split ``vocab`` into ``frequent_vocab`` and ``infrequent_vocab``.

        Items are sorted by value in descending order and cut at the first
        value below ``freq_threshold``. Both resulting dicts map the token to
        its index within its own part.

        NOTE(review): the values are treated as counts here, but the only
        visible caller passes the dict produced by ``build_vocab``, whose
        values are token ids — confirm which is intended.
        """
        token_counts = [(token, count) for token, count in vocab.items()] 
        token_counts.sort(key=lambda x: -x[1])  # Sort by frequency
        # NOTE(review): next() has no default, so this raises StopIteration
        # when no value is below freq_threshold.
        split_point = next(i for i, (_, count) in enumerate(token_counts) if count < freq_threshold)

        self.frequent_vocab = {token: i for i, (token, _) in enumerate(token_counts[:split_point])}
        self.infrequent_vocab = {token: i for i, (token, _) in enumerate(token_counts[split_point:])}

    def forward(self, token_ids):
        """Embed ``token_ids`` and add positional encodings.

        Iterates over the first dimension of ``token_ids`` and embeds each
        element with one of the two tables, then stacks the results.

        NOTE(review): ``frequent_vocab`` is keyed by token strings while the
        elements of ``token_ids`` are presumably id tensors, so the membership
        test below looks like it can never succeed — verify. The ids are also
        used to index the tables directly, although the tables are sized and
        indexed per vocabulary part.
        NOTE(review): if ``small_embed_dim`` differs from ``large_embed_dim``,
        the stacked embeddings cannot be combined with the positional
        encodings (width ``large_embed_dim``) — confirm whether a projection
        is missing.
        """
        token_embeds = []
        for token_id in token_ids:
            if token_id in self.frequent_vocab:
                embed = self.frequent_embeddings(torch.tensor(token_id).long())
            else:
                embed = self.infrequent_embeddings(torch.tensor(token_id).long())
            token_embeds.append(embed)

        token_embeds = torch.stack(token_embeds)
        # PositionalEncoding.forward slices its table by token_embeds.size(1).
        position_embeds = self.positional_embeddings(token_embeds)
        return token_embeds + position_embeds

class TransformerEncoderLayer(nn.Module):
    """Pre-norm encoder layer using ``MultiHeadLinformerAttention``.

    Inputs and outputs are batch-first (``[batch_size, seq_len, d_model]``),
    the layout consumed by ``Pooler``.

    Args:
        d_model: Feature size of the input and output.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Optional initial rate for both ``AdaptiveDropoutLayer``
            instances; ``None`` keeps that layer's own default.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=None):
        super().__init__()
        dropout_kwargs = {} if dropout is None else {"init_dropout_rate": dropout}

        self.norm1 = nn.LayerNorm(d_model)
        self.attn_layer = MultiHeadLinformerAttention(embed_dim=d_model, num_heads=nhead)
        self.dropout1 = AdaptiveDropoutLayer(**dropout_kwargs)  # After self-attention
        self.norm2 = nn.LayerNorm(d_model)

        self.ffnn = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, d_model),
        )

        self.dropout2 = AdaptiveDropoutLayer(**dropout_kwargs)  # After feed-forward

    def forward(self, src, src_mask=None):
        """Apply attention and feed-forward sub-layers with residuals.

        Args:
            src: Tensor of shape ``[batch_size, seq_len, d_model]``.
            src_mask: Accepted for interface compatibility but currently
                ignored, because ``MultiHeadLinformerAttention.forward`` takes
                only the input tensor and has no mask support.

        Returns:
            Tensor with the same shape as ``src``.
        """
        normed = self.norm1(src)
        # The attention layer is stored as `attn_layer`, takes a single
        # [seq_len, batch_size, d_model] tensor and returns a tensor (not a
        # tuple), so transpose around the call.
        attn_output = self.attn_layer(normed.transpose(0, 1)).transpose(0, 1)
        src = src + self.dropout1(attn_output)

        src = src + self.dropout2(self.ffnn(self.norm2(src)))
        return src


class Pooler(nn.Module):
    """Dense + tanh head applied to position 0 of each sequence.

    Args:
        d_model: Feature size of both the input and the pooled output.
    """

    def __init__(self, d_model):
        super().__init__()
        self.linear = nn.Linear(d_model, d_model)
        self.activation = nn.Tanh()

    def forward(self, input_tensor):
        """Pool ``input_tensor`` by transforming its slice ``[:, 0]``.

        The comment in the original states the expected input shape as
        ``[batch_size, seq_len, d_model]``; under that assumption the slice is
        the first token's representation of every sequence.
        """
        first_position = input_tensor.select(1, 0)
        projected = self.linear(first_position)
        return self.activation(projected)

class TransformerModel(nn.Module):
    """Embedding -> encoder layer -> pooler.

    Args:
        vocab_size: Unused; the embedding layer is sized from the vocabulary
            dictionary instead.
        embedding_dim: Model width (also the large embedding width).
        max_seq_len: Maximum sequence length for the positional encodings.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the encoder's feed-forward network.
        dropout: Currently unused. The visible ``TransformerEncoderLayer``
            constructor takes ``(d_model, nhead, dim_feedforward)`` and creates
            its own ``AdaptiveDropoutLayer`` instances.
        freq_threshold: Threshold forwarded to ``AdaptiveEmbeddingLayer``.
        smaller_embed_dim: Embedding width for infrequent tokens.
    """

    def __init__(self, vocab_size, embedding_dim, max_seq_len, nhead, dim_feedforward, dropout,
                 freq_threshold, smaller_embed_dim):
        super().__init__()
        # NOTE(review): `vocab` below is the module-level dictionary, not an
        # argument of this constructor; it must exist before instantiation.
        self.embedding = AdaptiveEmbeddingLayer(
            vocab=vocab,
            freq_threshold=freq_threshold,
            large_embed_dim=embedding_dim,
            small_embed_dim=smaller_embed_dim,   # Smaller dimension for infrequent words
            max_seq_len=max_seq_len
        )
        # Pass exactly the three arguments the encoder layer's signature
        # declares; a fourth positional `dropout` raises TypeError.
        self.encoder = TransformerEncoderLayer(embedding_dim, nhead, dim_feedforward)
        self.pooler = Pooler(embedding_dim)

    def forward(self, input_ids, attention_mask=None):
        """Return the pooled representation for ``input_ids``.

        ``attention_mask`` is forwarded to the encoder as ``src_mask``.
        """
        embedded = self.embedding(input_ids)
        encoded = self.encoder(embedded, src_mask=attention_mask)
        return self.pooler(encoded)


class TrieNode:
    """Single node of a character trie."""

    def __init__(self):
        self.children = {}  # next character -> child TrieNode
        self.token_id = None  # Store token IDs for efficient lookup


class Trie:
    """Character trie over vocabulary tokens for longest-prefix lookup.

    Args:
        unk_token_id: Id emitted for characters that match no vocabulary
            entry. When ``None``, the module-level ``unk_token_id`` is used.
    """

    def __init__(self, unk_token_id=None):
        self.root = TrieNode()
        self.unk_token_id = unk_token_id

    def insert(self, token, token_id):
        """Register ``token`` with id ``token_id``."""
        node = self.root
        for char in token:
            node = node.children.setdefault(char, TrieNode())
        node.token_id = token_id

    def _unk_id(self):
        """Return the configured unknown-token id, or the module-level one."""
        if self.unk_token_id is not None:
            return self.unk_token_id
        return unk_token_id  # module-level fallback

    def find_subwords(self, token):
        """Segment ``token`` greedily into the longest known subwords.

        At each position the longest vocabulary entry that is a prefix of the
        remaining text is consumed. If there is none, the unknown id is emitted
        and the scan advances by one character, so the whole token is always
        covered (mirroring ``Tokenizer``'s segmentation).

        Returns:
            List of token ids; ``[unk]`` for an empty token.
        """
        subword_ids = []
        start = 0
        length = len(token)
        while start < length:
            node = self.root
            best_id, best_end = None, start
            # Walk as deep as the trie allows, remembering the last complete
            # vocabulary entry seen on the way (the longest match).
            for end in range(start, length):
                node = node.children.get(token[end])
                if node is None:
                    break
                if node.token_id is not None:
                    best_id, best_end = node.token_id, end + 1
            if best_id is None:
                subword_ids.append(self._unk_id())
                start += 1
            else:
                subword_ids.append(best_id)
                start = best_end
        if not subword_ids:  # empty token: keep returning a single UNK
            subword_ids.append(self._unk_id())
        return subword_ids

class Tokenizer:
    """Greedy longest-match subword tokenizer that adds [CLS] and [SEP].

    Args:
        vocab: Non-empty mapping from token string to integer id; "[CLS]",
            "[SEP]" and "[UNK]" are looked up during encoding.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        # Longest vocabulary entry; bounds the subword search window.
        self.max_subword_length = max(map(len, vocab))
        # Words, single whitespace characters and the punctuation . , ! ? ;
        self.pattern = re.compile(r'\b\w+\b|[\s\.,!?;]')

    def _find_subwords(self, word):
        """Segment ``word`` into ids, longest vocabulary match first.

        A position with no matching substring yields "[UNK]" and the scan
        moves on by one character.
        """
        ids = []
        pos, end = 0, len(word)
        while pos < end:
            # Windows longer than the remaining text would slice to the same
            # substring, so the search is capped at what is left.
            longest = min(self.max_subword_length, end - pos)
            match_len = next(
                (size for size in range(longest, 0, -1) if word[pos:pos + size] in self.vocab),
                None,
            )
            if match_len is None:
                ids.append(self.vocab["[UNK]"])
                pos += 1
            else:
                ids.append(self.vocab[word[pos:pos + match_len]])
                pos += match_len
        return ids

    def encode(self, text):
        """Return ``[CLS] + subword ids of text + [SEP]`` as a list of ids."""
        ids = [self.vocab["[CLS]"]]
        for piece in self.pattern.findall(text):
            ids += self._find_subwords(piece)
        ids.append(self.vocab["[SEP]"])
        return ids

# Load corpus and build vocab
corpus = load_corpus("corpus.txt")
tokens = tokenize(corpus)
vocab = build_vocab(tokens)
# TextDataset.dynamic_masking replaces tokens with "[MASK]", so the vocabulary
# must contain that entry; give it the next free id if it is missing.
vocab.setdefault("[MASK]", len(vocab))

tokenizer = Tokenizer(vocab=vocab)

# Assuming train_texts and train_labels are defined
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_seq_len=512)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the device in this cell so it does not depend on an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

freq_threshold_values = [10, 50, 100, 200, 500]
# NOTE(review): no validation pass exists yet, so this value is never updated
# and the sweep below does not select a best threshold.
best_validation_accuracy = 0.0

for freq_threshold in freq_threshold_values:

    # Model instantiation and training setup. `dropout` is a required argument
    # of TransformerModel; 0.1 matches the rate used by the first model above.
    model = TransformerModel(vocab_size=len(vocab),
                             embedding_dim=128,
                             max_seq_len=512,
                             nhead=8,
                             dim_feedforward=2048,
                             dropout=0.1,
                             freq_threshold=freq_threshold,
                             smaller_embed_dim=64).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=1e-4)
    meta_optimizer = AdaptiveWeightDecayOptimizer(model.parameters(), lr=1e-5)
    loss_fn = nn.CrossEntropyLoss()
    meta_update_freq = 5

    for epoch in range(5):
        model.train()
        total_loss = 0

        for i, batch in enumerate(train_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            output = model(input_ids, attention_mask)
            loss = loss_fn(output, labels)
            loss.backward()
            optimizer.step()
            # Accumulate so the epoch summary reports the real mean loss.
            total_loss += loss.item()

            # Meta-update occasionally
            if (i + 1) % meta_update_freq == 0:
                meta_optimizer.zero_grad()
                # Run a fresh forward pass: the graph behind `output` was freed
                # by the backward call above and the weights have changed since.
                meta_output = model(input_ids, attention_mask)
                meta_loss = combined_loss(meta_output, labels, model)
                meta_loss.backward()
                meta_optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")
