In [None]:
import pandas as pd
import re
import unicodedata
import string
import torch
from datasets import Dataset
from collections import Counter
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline

# Seed PyTorch's random number generators (default generator and the current
# CUDA device) so the random token masking and weight initialisation below are
# repeatable across "Restart & Run All" executions.
torch.manual_seed(7)
torch.cuda.manual_seed(7)

In [None]:
# Load the Kantipur news corpus. The file is tab-separated and has no header
# row, so the columns are named explicitly right after reading.
kantipur = pd.read_csv('/kaggle/input/kantipur-dataset/cleaned_combined_kantipur.tsv',sep='\t', header=None)
kantipur.columns=['title','news']

In [None]:
class NepaliTokenizer:
    """Rule-based word tokenizer for Nepali (Devanagari) text.

    The text is NFC-normalised and whitespace-collapsed, then split into runs
    of Devanagari characters. Punctuation and every other non-Devanagari,
    non-space character (digits, Latin letters, symbols) become
    single-character tokens. Finally, at most one trailing suffix or
    postposition from ``NEPALI_SUFFIXES`` is split off each Devanagari word.
    """

    def __init__(self):
        # Unicode Devanagari block, inclusive on both ends.
        self.NEPALI_DEVANAGARI_RANGE = (0x0900, 0x097F)

        # Punctuation emitted as stand-alone tokens. The string is written with
        # regex-style escapes; tokenize() treats it as a plain set of characters.
        self.NEPALI_PUNCTUATION = r'।॥,\.;:!?\(\)\[\]\{\}'

        # Common Nepali suffixes and postpositions to separate from the stem.
        # They are tried in this order and only the first match is split off.
        self.NEPALI_SUFFIXES = [
            'ले', 'को', 'का', 'की', 'के',
            'मा', 'बाट', 'सँग', 'देखि',
            'सम्म', 'पछि', 'अघि',
            'हरू'  # Plural marker
        ]

    def is_nepali_character(self, char):
        """
        Check if a character is in the Devanagari script range used for Nepali

        Args:
        - char: Single character to check

        Returns:
        - Boolean indicating if character is Nepali (False for an empty string)
        """
        if not char:
            return False

        low, high = self.NEPALI_DEVANAGARI_RANGE
        return low <= ord(char) <= high

    def normalize_nepali_text(self, text):
        """
        Normalize Nepali text

        Args:
        - text: Input text to normalize

        Returns:
        - Text in Unicode NFC form with every whitespace run collapsed to a
          single space and leading/trailing whitespace removed
        """
        text = unicodedata.normalize('NFC', text)
        return re.sub(r'\s+', ' ', text).strip()

    def _split_suffix(self, token):
        """Return ``[stem, suffix]`` for the first matching suffix, else ``[token]``.

        A token that consists of nothing but the suffix is left intact, and
        tokens that do not end in a Devanagari character are never split.
        """
        if self.is_nepali_character(token[-1]):
            for suffix in self.NEPALI_SUFFIXES:
                if token.endswith(suffix) and len(token) > len(suffix):
                    return [token[:-len(suffix)], suffix]
        return [token]

    def tokenize(self, text):
        """
        Advanced Nepali tokenization method

        Args:
        - text: Input text to tokenize

        Returns:
        - List of tokens
        """
        text = self.normalize_nepali_text(text)
        punctuation = set(self.NEPALI_PUNCTUATION)

        tokens = []
        current_token = []

        def flush():
            # Emit the Devanagari run collected so far, if any.
            if current_token:
                tokens.append(''.join(current_token))
                current_token.clear()

        for char in text:
            if char in punctuation:
                # Checked before the script test on purpose: the danda (।) and
                # double danda (॥) live inside the Devanagari block and would
                # otherwise be glued onto the preceding word.
                flush()
                tokens.append(char)
            elif self.is_nepali_character(char):
                current_token.append(char)
            elif char.isspace():
                flush()
            else:
                # Non-Nepali characters (like digits, Latin script) are
                # emitted one character at a time.
                flush()
                tokens.append(char)
        flush()

        # Suffix and postposition handling
        return [piece for token in tokens for piece in self._split_suffix(token)]

In [None]:
class CustomBERTTokenizer:
    """Word-level BERT-style tokenizer plus masked-language-model data preparation.

    Words come from ``NepaliTokenizer``; the vocabulary is built from corpus
    frequencies and the five special tokens always occupy ids 0-4.
    """

    def __init__(self,
                 max_vocab_size=30000,
                 max_length=512,
                 mask_probability=0.15):
        """
        Custom BERT-style tokenizer

        Args:
        - max_vocab_size: Maximum number of tokens in vocabulary
        - max_length: Maximum sequence length
        - mask_probability: Probability of masking a token
        """
        self.max_length = max_length
        self.mask_probability = mask_probability

        # Special tokens
        self.special_tokens = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[CLS]': 2,
            '[SEP]': 3,
            '[MASK]': 4
        }
        self.special_token_ids = dict(self.special_tokens)

        # Vocabulary will be built dynamically
        self.vocab = self.special_tokens.copy()
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}

        # Keep track of token frequencies
        self.token_freq = Counter()

        # Tokenization parameters
        self.max_vocab_size = max_vocab_size
        self.tokenizer = NepaliTokenizer()

    def _tokenize(self, text):
        """
        Basic tokenization method

        Args:
        - text: Input text to tokenize

        Returns:
        - List of tokens
        """
        return self.tokenizer.tokenize(text)

    def build_vocab(self, texts):
        """
        Build vocabulary from corpus

        Frequencies accumulate in ``self.token_freq`` across calls, and new
        tokens are appended after the ids already in use, so calling this
        method more than once never reassigns or reuses an id.

        Args:
        - texts: Iterable of texts to build vocabulary from; entries that are
          not strings (e.g. NaN from an empty cell) are skipped
        """
        for text in texts:
            if not isinstance(text, str):
                continue
            self.token_freq.update(self._tokenize(text))

        # Continue numbering after the highest id in use (ids are contiguous).
        next_idx = max(self.vocab.values()) + 1

        # Most frequent first; ties keep first-seen order.
        for token, _ in self.token_freq.most_common():
            # Stop once the vocabulary is full
            if len(self.vocab) >= self.max_vocab_size:
                break
            if token not in self.vocab:
                self.vocab[token] = next_idx
                self.reverse_vocab[next_idx] = token
                next_idx += 1

    def encode(self, text):
        """
        Encode text to token ids

        The result is ``[CLS] tokens... [SEP]`` padded with ``[PAD]`` to exactly
        ``max_length`` ids. Over-long texts are truncated *before* the special
        tokens are added so that the closing ``[SEP]`` is never cut off.

        Args:
        - text: Input text

        Returns:
        - List of token ids of length ``max_length``
        """
        tokens = self._tokenize(text)

        # Leave room for [CLS] and [SEP].
        body_budget = max(self.max_length - 2, 0)
        unk_id = self.special_token_ids['[UNK]']
        token_ids = [self.vocab.get(token, unk_id) for token in tokens[:body_budget]]

        # Add special tokens
        token_ids = [self.special_token_ids['[CLS]']] + \
                    token_ids + \
                    [self.special_token_ids['[SEP]']]

        # Final guard for max_length < 2, then pad to max_length
        token_ids = token_ids[:self.max_length]
        token_ids += [self.special_token_ids['[PAD]']] * (self.max_length - len(token_ids))

        return token_ids

    def mask_tokens(self, input_ids):
        """
        Apply token masking

        Each position other than [CLS], [SEP] and [PAD] is selected with
        probability ``mask_probability``; if nothing was selected, one eligible
        position is picked at random. Of the selected positions roughly 80%
        become [MASK], 10% a random vocabulary id and 10% stay unchanged.

        Args:
        - input_ids: Original token sequence (list or tensor of ids)

        Returns:
        - masked_input_ids: Input with some tokens masked
        - mask_labels: Original ids at the selected positions, 0 everywhere else
        """
        # Ensure input_ids is a torch tensor
        if not isinstance(input_ids, torch.Tensor):
            input_ids = torch.tensor(input_ids)

        # Work on a copy so the caller's tensor is left untouched
        masked_input_ids = input_ids.clone()
        device = masked_input_ids.device

        # Positions that may be masked (everything except the structural tokens)
        maskable = (masked_input_ids != self.special_token_ids['[CLS]']) & \
                   (masked_input_ids != self.special_token_ids['[SEP]']) & \
                   (masked_input_ids != self.special_token_ids['[PAD]'])

        selection_prob = torch.full(
            masked_input_ids.shape, float(self.mask_probability), device=device
        )
        mask = torch.bernoulli(selection_prob).bool() & maskable

        # If no tokens are masked, randomly mask at least one
        if not mask.any() and maskable.any():
            forced = torch.multinomial(maskable.float().flatten(), 1)
            mask.view(-1)[forced] = True

        # Create labels for masked tokens (0 marks "no prediction here")
        mask_labels = torch.zeros_like(masked_input_ids)
        mask_labels[mask] = masked_input_ids[mask]

        # One uniform draw per position decides its fate, which keeps the
        # three outcomes mutually exclusive.
        roll = torch.rand(masked_input_ids.shape, device=device)

        # 80% of masked tokens are replaced with [MASK]
        use_mask_token = mask & (roll < 0.8)
        masked_input_ids[use_mask_token] = self.special_token_ids['[MASK]']

        # 10% of masked tokens are replaced with random tokens
        use_random_token = mask & (roll >= 0.8) & (roll < 0.9)
        if use_random_token.any():
            random_tokens = torch.randint_like(
                masked_input_ids,
                0,
                len(self.vocab)
            )
            masked_input_ids[use_random_token] = random_tokens[use_random_token]

        # The remaining 10% keep their original id.
        return masked_input_ids, mask_labels

    def prepare_bert_pretraining_data(self, df, text_column):
        """
        Prepare BERT pretraining data from a DataFrame

        Rows that fail to encode are reported with ``print`` and skipped.

        Args:
        - df: Input DataFrame
        - text_column: Name of the text column

        Returns:
        - Tuple ``(input_sequences, segment_ids, masked_tokens)`` of stacked
          tensors, one row per successfully processed text:
          the *masked* token ids fed to the model, all-zero segment ids
          (each example is a single segment), and the MLM labels.

        Raises:
        - ValueError: if no row could be processed
        """
        texts = df[text_column]

        # First, build vocabulary
        self.build_vocab(texts)

        input_sequences = []
        segment_ids = []
        masked_tokens = []

        for i, text in enumerate(texts):
            try:
                token_ids = self.encode(text)
                masked_input_ids, mask_label = self.mask_tokens(token_ids)
            except Exception as e:
                print(f"Error processing row {i}: {e}")
                continue

            # Store the masked ids: feeding the unmasked sequence would let
            # the model read every answer straight from its input.
            input_sequences.append(masked_input_ids)
            segment_ids.append(torch.zeros(len(token_ids), dtype=torch.long))
            masked_tokens.append(mask_label)

        if not input_sequences:
            raise ValueError(
                f"No rows of column {text_column!r} could be converted to training data"
            )

        # Convert to tensors
        return (
            torch.stack(input_sequences),
            torch.stack(segment_ids),
            torch.stack(masked_tokens),
        )

In [None]:
class PretrainingDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over pre-computed MLM pretraining tensors.

    The base class is ``torch.utils.data.Dataset``: the name ``Dataset``
    imported at the top of the notebook is HuggingFace's Arrow-backed class,
    which this class never initialises and does not behave like.

    Args:
    - input_sequences: token ids, indexable by example
    - segment_ids: segment ids, indexable by example
    - masked_tokens: MLM labels, indexable by example
    """

    def __init__(self, input_sequences, segment_ids, masked_tokens):
        self.input_sequences = input_sequences
        self.segment_ids = segment_ids
        self.masked_tokens = masked_tokens

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        """Return one example as a dict of ``torch.long`` tensors."""
        # as_tensor accepts both tensors and plain sequences and, unlike
        # torch.tensor(existing_tensor), does not warn on every item.
        return {
            'input_sequences': torch.as_tensor(self.input_sequences[idx], dtype=torch.long),
            'segment_ids': torch.as_tensor(self.segment_ids[idx], dtype=torch.long),
            'masked_tokens': torch.as_tensor(self.masked_tokens[idx], dtype=torch.long)
        }

In [None]:
# Tokenizer with its default settings (vocabulary cap, sequence length and
# mask probability come from CustomBERTTokenizer's defaults).
tokenizer = CustomBERTTokenizer()

# Builds the vocabulary from the 'news' column, then returns three stacked
# tensors with one row per article: token ids, segment ids and MLM labels.
input_sequences, segment_ids, masked_tokens= tokenizer.prepare_bert_pretraining_data(kantipur, 'news')

In [None]:
# Number of prepared sequences (articles that failed to encode were skipped).
len(segment_ids)

In [None]:
# 80/20 train/test split by position. The test set is sliced as "everything
# after the first train_size rows", so its size is derived by subtraction;
# int(0.2 * n) can be one short of the real slice because of rounding.
train_size = int(0.8*len(input_sequences))
test_size = len(input_sequences) - train_size
print(f"Train_size= {train_size}")
print(f"Test_size= {test_size}")

In [None]:
# Positional split: the first train_size rows are used for training and all
# remaining rows for evaluation. The three tensors are sliced identically so
# their rows stay aligned.
train_dataset = PretrainingDataset(input_sequences[:train_size], segment_ids[:train_size], masked_tokens[:train_size])
test_dataset = PretrainingDataset(input_sequences[train_size:], segment_ids[train_size:], masked_tokens[train_size:])

In [None]:
# Sanity check: number of training examples.
len(train_dataset)

In [None]:
# Sanity check: number of held-out examples.
len(test_dataset)

In [None]:
# Training batches are reshuffled every epoch. The evaluation loader keeps a
# fixed order: the reported test loss is a mean of per-batch means, so
# shuffling would make it vary with how examples happen to be grouped.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, pin_memory=True)

# Model Architecture

In [None]:
class BERTEmbedding(nn.Module):
    """Input embedding: token + segment + learned position, then dropout.

    Args:
    - vocab_size: number of rows in the token embedding table
    - n_segments: number of distinct segment ids
    - max_len: number of positions the positional table can address
    - embed_dim: embedding width
    - dropout: dropout probability applied to the summed embedding
    """

    def __init__(self, vocab_size, n_segments, max_len, embed_dim, dropout):
        super().__init__()
        # Three lookup tables that share the same output width.
        self.tok_embed = nn.Embedding(vocab_size, embed_dim)
        self.seg_embed = nn.Embedding(n_segments, embed_dim)
        self.pos_embed = nn.Embedding(max_len, embed_dim)
        self.drop = nn.Dropout(dropout)
        # Kept for reference; forward() derives positions from the input.
        self.max_len = max_len

    def forward(self, seq, seg):
        """Embed ``seq``/``seg`` (both indexed as [batch, position])."""
        # Position ids 0..L-1, created on seq's device and repeated per row.
        positions = torch.arange(seq.size(1), device=seq.device)
        positions = positions.unsqueeze(0).expand_as(seq)

        token_part = self.tok_embed(seq)
        segment_part = self.seg_embed(seg)
        position_part = self.pos_embed(positions)

        return self.drop(token_part + segment_part + position_part)



In [None]:
class BERT(nn.Module):
    """BERT-style encoder: input embeddings followed by a Transformer encoder stack.

    Args:
    - vocab_size: size of the token vocabulary
    - n_segments: number of segment ids
    - max_len: maximum sequence length supported by the positional table
    - embed_dim: model width
    - n_layers: number of encoder layers
    - attn_heads: attention heads per layer
    - dropout: dropout probability for the embeddings and the encoder layers
    """

    def __init__(self,
                 vocab_size,
                 n_segments,
                 max_len,
                 embed_dim,
                 n_layers,
                 attn_heads,
                 dropout):
        super().__init__()
        self.embedding = BERTEmbedding(vocab_size, n_segments, max_len, embed_dim, dropout)
        # batch_first=True is essential: the embeddings are [batch, seq, dim],
        # while the layer's default layout is [seq, batch, dim]. With the
        # default, attention would mix different examples of a batch instead
        # of the tokens of one sequence.
        self.encoder_layer = nn.TransformerEncoderLayer(
            embed_dim,
            attn_heads,
            embed_dim * 4,
            dropout=dropout,
            batch_first=True,
        )
        # TransformerEncoder deep-copies the layer n_layers times; the
        # prototype stays registered so existing state_dict keys are unchanged.
        self.encoder_block = nn.TransformerEncoder(self.encoder_layer, n_layers)

    def forward(self, seq, seg):
        """Encode token ids ``seq`` and segment ids ``seg`` (both [batch, seq_len]).

        Returns:
        - Tensor of contextual embeddings, [batch, seq_len, embed_dim]
        """
        out = self.embedding(seq, seg)
        out = self.encoder_block(out)
        return out

In [None]:
class BERTPretrainingModel(nn.Module):
    """Encoder plus a linear masked-language-model head.

    Args:
    - bert_model: encoder module called as ``bert_model(seq, seg)``; the head's
      input width is read from ``bert_model.embedding.tok_embed.embedding_dim``
    - vocab_size: number of output classes of the MLM head
    """

    def __init__(self, bert_model, vocab_size):
        super().__init__()
        self.bert = bert_model
        # The head maps each hidden vector to one score per vocabulary entry.
        hidden_size = bert_model.embedding.tok_embed.embedding_dim
        self.mlm_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, seq, seg):
        """Return vocabulary scores for every position of the encoder output."""
        hidden_states = self.bert(seq, seg)
        return self.mlm_head(hidden_states)

# Training

In [None]:
def train_bert(model, train_dataloader, optimizer, mlm_criterion, device):
    """Run one training epoch of masked-language-model pretraining.

    Args:
    - model: module called as ``model(seq, seg)`` returning per-position scores
    - train_dataloader: iterable of batches (dicts with 'input_sequences',
      'segment_ids' and 'masked_tokens'); must support ``len()``
    - optimizer: optimizer over the model's parameters
    - mlm_criterion: loss called as ``criterion(scores_2d, labels_1d)``
    - device: device the batch tensors are moved to

    Returns:
    - Mean of the per-batch losses over the epoch
    """
    model.train()
    running_loss = 0

    for batch in tqdm(train_dataloader):
        token_ids = batch['input_sequences'].to(device)
        segments = batch['segment_ids'].to(device)
        labels = batch['masked_tokens'].to(device)

        optimizer.zero_grad()

        # Scores for every position, then collapse batch and position into one
        # axis so the criterion sees one row per token.
        logits = model(token_ids, segments)
        flat_logits = logits.view(-1, logits.size(-1))
        flat_labels = labels.view(-1)

        loss = mlm_criterion(flat_logits, flat_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(train_dataloader)

def test_bert(model,test_dataloader,mlm_criterion,device):
    model.eval()
    total_test_loss = 0
    for batch in tqdm(test_dataloader):
        seq = batch['input_sequences'].to(device)  # [batch_size, seq_len]
        seg = batch['segment_ids'].to(device)      # [batch_size, seq_len]
        masked_tokens = batch['masked_tokens'].to(device)  # [batch_size, seq_len]

        # Forward pass
        mlm_predictions = model(seq, seg)  # [batch_size, seq_len, vocab_size]

        # Flatten predictions and targets
        mlm_predictions = mlm_predictions.view(-1, mlm_predictions.size(-1))  # [batch_size * seq_len, vocab_size]
        masked_tokens = masked_tokens.view(-1)  # [batch_size * seq_len]

        # Compute MLM loss
        mlm_loss = mlm_criterion(mlm_predictions, masked_tokens)

        # Accumulate loss
        total_test_loss += mlm_loss.item()

    return total_test_loss / len(test_dataloader)

        


In [None]:
# Hyperparameters
# Vocabulary size and sequence length are read from the tokenizer that
# produced the training tensors, so the embedding tables and the MLM head
# always cover exactly the ids and positions that can occur.
vocab_size = len(tokenizer.vocab)
n_segments = 2
max_len = tokenizer.max_length
embed_dim = 768
n_layers = 12
attn_heads = 12
dropout = 0.1

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize BERT model
bert_base = BERT(
    vocab_size=vocab_size, 
    n_segments=n_segments, 
    max_len=max_len, 
    embed_dim=embed_dim, 
    n_layers=n_layers, 
    attn_heads=attn_heads, 
    dropout=dropout
)

# Wrap BERT in pretraining model
model = BERTPretrainingModel(bert_base, vocab_size).to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Loss function. Label 0 marks positions that were not selected for masking
# (mask_tokens fills the label tensor with zeros), so those are ignored.
mlm_criterion = nn.CrossEntropyLoss(ignore_index=0)



# Training loop: one pass over the training data followed by one evaluation
# pass per epoch; both mean losses are kept for the plot below.
num_epochs = 10
train_loss = []
test_loss = []
for epoch in range(num_epochs):
    train_mlm_loss= train_bert(
        model, 
        train_dataloader, 
        optimizer, 
        mlm_criterion, 
        device
    )
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train MLM Loss: {train_mlm_loss:.4f}')
    train_loss.append(train_mlm_loss)
    
    test_mlm_loss= test_bert(
        model, 
        test_dataloader,
        mlm_criterion,
        device
    )
    print(f'Test MLM Loss: {test_mlm_loss:.4f}')
    test_loss.append(test_mlm_loss)
    
# Save the model
# NOTE(review): 'model.pth' pickles the whole module and can only be loaded
# where these class definitions are importable; the state_dict file is the
# portable artifact.
torch.save(model, 'model.pth')
torch.save(model.state_dict(), 'model_state_dict.pth')


In [None]:
# Persist both loss histories as the plain-text repr of their Python lists,
# one file per curve.
for loss_path, loss_history in (('train_loss.txt', train_loss),
                                ('test_loss.txt', test_loss)):
    with open(loss_path, 'w') as loss_file:
        loss_file.write(str(loss_history))


In [None]:
# Training vs. held-out loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(train_loss, label='train_loss', color='blue')
ax.plot(test_loss, label='test_loss', color='red')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Epoch vs Loss')
ax.legend()
plt.show()