# Language Model with LORA and QLORA

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import random_split


class LORALayer(nn.Module):
    def __init__(self, input_dim, output_dim, rank, alpha=1):
        super(LORALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha

        # Original weight and bias of the linear layer
        self.weight = nn.Parameter(torch.Tensor(output_dim, input_dim))
        #print("self.weight Shape:", self.weight.shape)
        self.bias = nn.Parameter(torch.Tensor(output_dim))

        # LORA specific parameters
        self.A = nn.Parameter(torch.Tensor(input_dim, rank))
        self.B = nn.Parameter(torch.Tensor(rank, output_dim))

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        nn.init.zeros_(self.bias)
        nn.init.normal_(self.A, 0, 0.02)
        nn.init.normal_(self.B, 0, 0.02)

    def forward(self, x):
        #print("LORALayer Input Shape:", x.shape)
        
        original_size = x.size()
        batch_size, seq_len, _ = x.shape
        x_flattened = x.reshape(-1, original_size[-1])

        # Compute lora_adjustment for each input in the batch
        lora_adjustment = self.alpha * (x_flattened @ self.A) @ self.B
        lora_adjustment = lora_adjustment.reshape(batch_size, seq_len, -1)

        #print("Adjusted lora_adjustment Shape:", lora_adjustment.shape)
        #print("self.weight Shape:", self.weight.shape)
        
        # Apply linear transformation to x_flattened
        x_transformed = nn.functional.linear(x_flattened, self.weight, self.bias)
        x_transformed = x_transformed.reshape(batch_size, seq_len, -1)

        # Add lora_adjustment to the transformed x
        x = x_transformed + lora_adjustment
        #print("LORALayer Output Shape:", x.shape)

        return x

class QLORALayer(nn.Module):
    def __init__(self, input_dim, output_dim, rank, alpha=1, quantization_bits=8):
        super(QLORALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha
        self.quantization_bits = quantization_bits

        # Original weight and bias
        self.weight = nn.Parameter(torch.Tensor(output_dim, input_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))

        # QLORA specific parameters
        self.A = nn.Parameter(torch.Tensor(input_dim, rank))
        self.B = nn.Parameter(torch.Tensor(rank, output_dim))

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        nn.init.zeros_(self.bias)
        nn.init.normal_(self.A, 0, 0.02)
        nn.init.normal_(self.B, 0, 0.02)

    def quantize(self, x, num_bits):
        # Implement a simple quantization method
        scale = x.abs().max()
        x_quantized = torch.round(x / scale * (2**num_bits - 1))
        return x_quantized, scale

    def forward(self, x):
        #print("QLORALayer Input Shape:", x.shape)
        original_size = x.size()
        batch_size, seq_len, _ = x.shape
        x_flattened = x.reshape(-1, original_size[-1])

        A_quantized, scale_A = self.quantize(self.A, self.quantization_bits)
        B_quantized, scale_B = self.quantize(self.B, self.quantization_bits)

        # Compute lora_adjustment for each input in the batch
        lora_adjustment = self.alpha * (x_flattened @ (A_quantized / scale_A)) @ (B_quantized / scale_B)
        lora_adjustment = lora_adjustment.reshape(batch_size, seq_len, -1)

        #print("Adjusted lora_adjustment Shape:", lora_adjustment.shape)
        #print("self.weight Shape:", self.weight.shape)

        # Apply linear transformation to x_flattened
        x_transformed = nn.functional.linear(x_flattened, self.weight, self.bias)
        x_transformed = x_transformed.reshape(batch_size, seq_len, -1)

        # Add lora_adjustment to the transformed x
        x = x_transformed + lora_adjustment
        #print("QLORALayer Output Shape:", x.shape)

        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Einsum does the matrix multiplication for query*keys for each training example
        # with every other training example, then sum it up
        attention = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        if mask is not None:
            attention = attention.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(attention / (self.embed_size ** (1 / 2)), dim=3)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion, rank):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            LORALayer(embed_size, forward_expansion * embed_size, rank),
            nn.ReLU(),
            LORALayer(forward_expansion * embed_size, embed_size, rank),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class LanguageModelDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length, rank):
        super(LanguageModelDecoder, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        # Adding BatchNorm layers
        self.bn1 = nn.BatchNorm1d(embed_size)
        self.bn2 = nn.BatchNorm1d(embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_size, heads, dropout, forward_expansion, rank)
                for _ in range(num_layers)
            ]
        )

        # QLORA layers
        self.qlora_feed_forward = nn.Sequential(
            QLORALayer(embed_size, forward_expansion * embed_size, rank),
            nn.ReLU(),
            QLORALayer(forward_expansion * embed_size, embed_size, rank),
        )
        self.use_qlora = False  # Flag to toggle QLORA

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Transpose for BatchNorm, apply batch normalization, and then transpose back
        x = x.transpose(1, 2)
        x = self.bn1(x)
        x = x.transpose(1, 2)

        for layer in self.layers:
            x = layer(x, x, x, trg_mask)
            if self.use_qlora:
                x = self.qlora_feed_forward(x)

        # Transpose for BatchNorm, apply batch normalization, and then transpose back
        x = x.transpose(1, 2)
        x = self.bn2(x)
        x = x.transpose(1, 2)

        out = self.fc_out(x)
        return out

    def toggle_qlora(self, use_qlora: bool):
        self.use_qlora = use_qlora


 

class LanguageModelTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0, max_length=100, rank=16):
        super(LanguageModelTransformer, self).__init__()

        self.decoder = LanguageModelDecoder(
            vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            max_length,
            rank,
        )

    def forward(self, trg):
        trg_mask = self.make_trg_mask(trg)
        out = self.decoder(trg, trg_mask)
        return out

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        ).to(trg.device)

        return trg_mask

class LanguageDataset(Dataset):
    def __init__(self, tokenized_texts, sequence_length):
        self.tokenized_texts = tokenized_texts
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.tokenized_texts) - self.sequence_length

    def __getitem__(self, index):
        return (
            self.tokenized_texts[index:index+self.sequence_length].clone().detach(),
            self.tokenized_texts[index+1:index+self.sequence_length+1].clone().detach()
        )

# Define vocabulary size and dummy data parameters
NUM_WORDS = 1000  # Example vocabulary size
sequence_length = 30  # Sequence length for the LanguageDataset
dummy_data_size = 1000  # Total number of tokens in the dummy dataset

# Generate random tokenized data
tokenized_train_data = torch.randint(high=NUM_WORDS, size=(dummy_data_size,))

# Create the complete dataset
complete_dataset = LanguageDataset(tokenized_train_data, sequence_length)

# Calculate the actual length of the dataset
actual_dataset_length = len(complete_dataset)

# Define the size of your validation set
validation_size = int(0.2 * actual_dataset_length)  # 20% of the dataset
training_size = actual_dataset_length - validation_size

# Split the dataset into training and validation sets
train_dataset, val_dataset = random_split(complete_dataset, [training_size, validation_size])

# Create DataLoaders for the training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model instance
model = LanguageModelTransformer(
    vocab_size=NUM_WORDS,
    embed_size=256,  # You can adjust these parameters as needed
    num_layers=6,
    forward_expansion=4,
    heads=8,
    dropout=0,
    max_length=100,
    rank=16
).to(device)



# Enable QLORA during training
model.decoder.toggle_qlora(True)

# Training loop
# Assuming model is an instance of LanguageModelTransformer
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=0.0000001)  # Consider reducing this if still problematic
scheduler = StepLR(optimizer, step_size=1, gamma=0.98)
num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    model.train()
    model.decoder.toggle_qlora(True)
    total_loss = 0

    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, NUM_WORDS), targets.view(-1))
        
        # Check for NaN in loss
        if math.isnan(loss.item()):
            print("Encountered NaN loss, stopping training")
            break

        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()
        total_loss += loss.item()

    scheduler.step()

    # Check for NaN in total_loss
    if math.isnan(total_loss):
        print(f"Epoch {epoch+1}/{num_epochs} stopped due to NaN loss")
        break
    else:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

    model.decoder.toggle_qlora(False)




Epoch 1/5, Loss: 7.073467694796049
Epoch 2/5, Loss: 7.075631141662598
Epoch 3/5, Loss: 7.072393857515776
Epoch 4/5, Loss: 7.069617161383996
Epoch 5/5, Loss: 7.0713474200322075


In [23]:
def calculate_perplexity(loss):
    return math.exp(loss)

def evaluate(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            inputs, targets = batch  # Adjust based on your dataset
            inputs, targets = inputs.to(device), targets.to(device)  # Move inputs and targets to the same device as the model

            outputs = model(inputs)
            loss = criterion(outputs.view(-1, NUM_WORDS), targets.view(-1))
            total_loss += loss.item()

    average_loss = total_loss / len(data_loader)
    perplexity = calculate_perplexity(average_loss)

    return average_loss, perplexity

# Define device (if not already defined)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Calculate validation loss and perplexity
val_loss, val_perplexity = evaluate(model, val_loader, device)
print(f"Validation Loss: {val_loss}, Perplexity: {val_perplexity}")


Validation Loss: 8.085800170898438, Perplexity: 3248.017765364175


# TEST ON dATASET 1

In [25]:
!pip install datasets



In [27]:
!pip install apache_beam


Collecting apache_beam
  Downloading apache_beam-2.53.0-cp311-cp311-win_amd64.whl.metadata (6.7 kB)
Collecting crcmod<2.0,>=1.7 (from apache_beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
     ---------------------------------------- 0.0/89.7 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/89.7 kB ? eta -:--:--
     -------------------------- ----------- 61.4/89.7 kB 825.8 kB/s eta 0:00:01
     -------------------------------------- 89.7/89.7 kB 854.1 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting orjson<4,>=3.9.7 (from apache_beam)
  Downloading orjson-3.9.10-cp311-none-win_amd64.whl.metadata (50 kB)
     ---------------------------------------- 0.0/50.5 kB ? eta -:--:--
     -------- ------------------------------- 10.2/50.5 kB ? eta -:--:--
     -------------------------------------- 50.5/50.5 kB 649.0 kB/s eta 0:00:00
Collecting dill<0.3.2,>=0.3.1.1 (from apache_beam)
  Downloading

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
multiprocess 0.70.14 requires dill>=0.3.6, but you have dill 0.3.1.1 which is incompatible.


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import random_split
from transformers import BertTokenizer
from datasets import load_dataset


class LORALayer(nn.Module):
    def __init__(self, input_dim, output_dim, rank, alpha=1):
        super(LORALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha

        # Original weight and bias of the linear layer
        self.weight = nn.Parameter(torch.Tensor(output_dim, input_dim))
        #print("self.weight Shape:", self.weight.shape)
        self.bias = nn.Parameter(torch.Tensor(output_dim))

        # LORA specific parameters
        self.A = nn.Parameter(torch.Tensor(input_dim, rank))
        self.B = nn.Parameter(torch.Tensor(rank, output_dim))

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        nn.init.zeros_(self.bias)
        nn.init.normal_(self.A, 0, 0.02)
        nn.init.normal_(self.B, 0, 0.02)

    def forward(self, x):
        #print("LORALayer Input Shape:", x.shape)
        
        original_size = x.size()
        batch_size, seq_len, _ = x.shape
        x_flattened = x.reshape(-1, original_size[-1])

        # Compute lora_adjustment for each input in the batch
        lora_adjustment = self.alpha * (x_flattened @ self.A) @ self.B
        lora_adjustment = lora_adjustment.reshape(batch_size, seq_len, -1)

        #print("Adjusted lora_adjustment Shape:", lora_adjustment.shape)
        #print("self.weight Shape:", self.weight.shape)
        
        # Apply linear transformation to x_flattened
        x_transformed = nn.functional.linear(x_flattened, self.weight, self.bias)
        x_transformed = x_transformed.reshape(batch_size, seq_len, -1)

        # Add lora_adjustment to the transformed x
        x = x_transformed + lora_adjustment
        #print("LORALayer Output Shape:", x.shape)

        return x

class QLORALayer(nn.Module):
    def __init__(self, input_dim, output_dim, rank, alpha=1, quantization_bits=4):
        super(QLORALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha
        self.quantization_bits = quantization_bits

        # Original weight and bias
        self.weight = nn.Parameter(torch.Tensor(output_dim, input_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))

        # QLORA specific parameters
        self.A = nn.Parameter(torch.Tensor(input_dim, rank))
        self.B = nn.Parameter(torch.Tensor(rank, output_dim))

        self.dropout = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(output_dim)

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        nn.init.zeros_(self.bias)
        nn.init.normal_(self.A, 0, 0.02)
        nn.init.normal_(self.B, 0, 0.02)

    def quantize(self, x, num_bits):
        # Implement a simple quantization method
        scale = x.abs().max()
        x_quantized = torch.round(x / scale * (2**num_bits - 1))
        return x_quantized, scale

    def forward(self, x):
        #print("QLORALayer Input Shape:", x.shape)
        original_size = x.size()
        batch_size, seq_len, _ = x.shape
        x_flattened = x.reshape(-1, original_size[-1])

        A_quantized, scale_A = self.quantize(self.A, self.quantization_bits)
        B_quantized, scale_B = self.quantize(self.B, self.quantization_bits)

        # Compute lora_adjustment for each input in the batch
        lora_adjustment = self.alpha * (x_flattened @ (A_quantized / scale_A)) @ (B_quantized / scale_B)
        lora_adjustment = lora_adjustment.reshape(batch_size, seq_len, -1)
        lora_adjustment = self.dropout(lora_adjustment)
        #print("Adjusted lora_adjustment Shape:", lora_adjustment.shape)
        #print("self.weight Shape:", self.weight.shape)

        # Apply linear transformation to x_flattened
        x_transformed = nn.functional.linear(x_flattened, self.weight, self.bias)
        x_transformed = x_transformed.reshape(batch_size, seq_len, -1)

        # Add lora_adjustment to the transformed x
        x = x_transformed + lora_adjustment
        x = self.layer_norm(x)

        #print("QLORALayer Output Shape:", x.shape)

        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Einsum does the matrix multiplication for query*keys for each training example
        # with every other training example, then sum it up
        attention = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        if mask is not None:
            attention = attention.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(attention / (self.embed_size ** (1 / 2)), dim=3)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion, rank):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            LORALayer(embed_size, forward_expansion * embed_size, rank),
            nn.ReLU(),
            LORALayer(forward_expansion * embed_size, embed_size, rank),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class LanguageModelDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length, rank):
        super(LanguageModelDecoder, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        # Adding BatchNorm layers
        self.bn1 = nn.BatchNorm1d(embed_size)
        self.bn2 = nn.BatchNorm1d(embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_size, heads, dropout, forward_expansion, rank)
                for _ in range(num_layers)
            ]
        )

        # QLORA layers
        self.qlora_feed_forward = nn.Sequential(
            QLORALayer(embed_size, forward_expansion * embed_size, rank),
            nn.ReLU(),
            QLORALayer(forward_expansion * embed_size, embed_size, rank),
        )
        self.use_qlora = False  # Flag to toggle QLORA

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Transpose for BatchNorm, apply batch normalization, and then transpose back
        x = x.transpose(1, 2)
        x = self.bn1(x)
        x = x.transpose(1, 2)

        for layer in self.layers:
            x = layer(x, x, x, trg_mask)
            if self.use_qlora:
                x = self.qlora_feed_forward(x)

        # Transpose for BatchNorm, apply batch normalization, and then transpose back
        x = x.transpose(1, 2)
        x = self.bn2(x)
        x = x.transpose(1, 2)

        out = self.fc_out(x)
        return out

    def toggle_qlora(self, use_qlora: bool):
        self.use_qlora = use_qlora


 

class LanguageModelTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0, max_length=100, rank=16):
        super(LanguageModelTransformer, self).__init__()

        self.decoder = LanguageModelDecoder(
            vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            max_length,
            rank,
        )

    def forward(self, trg):
        trg_mask = self.make_trg_mask(trg)
        out = self.decoder(trg, trg_mask)
        return out

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        ).to(trg.device)

        return trg_mask

# Define vocabulary size and dummy data parameters
NUM_WORDS = 1000  # Example vocabulary size
sequence_length = 30  # Sequence length for the LanguageDataset
dummy_data_size = 1000  # Total number of tokens in the dummy dataset



# Load dataset
dataset = load_dataset('wikipedia', '20220301.simple')

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab_size = tokenizer.vocab_size

def tokenize_function(examples):
    # Tokenize the text
    tokenized_output = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=sequence_length)
    
    # Shift input_ids to create labels and truncate the last token
    labels = [seq[1:] + [tokenizer.pad_token_id] for seq in tokenized_output['input_ids']]
    tokenized_output['labels'] = labels
    
    return tokenized_output

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

train_loader = DataLoader(tokenized_datasets['train'], batch_size=64, shuffle=True)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model instance
model = LanguageModelTransformer(
    vocab_size=vocab_size,  # Use the vocab size from the tokenizer
    embed_size=256,
    num_layers=6,
    forward_expansion=4,
    heads=8,
    dropout=0,
    max_length=100,
    rank=16
).to(device)



# Enable QLORA during training
model.decoder.toggle_qlora(True)

# Training loop
# Assuming model is an instance of LanguageModelTransformer
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-8, weight_decay=1e-4)
scheduler = StepLR(optimizer, step_size=4, gamma=0.98)
num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    model.train()
    model.decoder.toggle_qlora(True)
    total_loss = 0

    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        targets = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        #print("Output shape:", outputs.shape)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        
        # Check for NaN in loss
        if math.isnan(loss.item()):
            print("Encountered NaN loss, stopping training")
            break

        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)

        optimizer.step()
        total_loss += loss.item()

    scheduler.step()

    # Check for NaN in total_loss
    if math.isnan(total_loss):
        print(f"Epoch {epoch+1}/{num_epochs} stopped due to NaN loss")
        break
    else:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

    model.decoder.toggle_qlora(False)




Epoch 1/5, Loss: 10.499467606929352
Epoch 2/5, Loss: 10.49840387818077
Epoch 3/5, Loss: 10.496170871612177
Epoch 4/5, Loss: 10.49417147156456
Epoch 5/5, Loss: 10.492957106852094


# v3

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import random_split
from transformers import BertTokenizer
from datasets import load_dataset


class LORALayer(nn.Module):
    def __init__(self, input_dim, output_dim, rank, alpha=1):
        super(LORALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha

        # Original weight and bias of the linear layer
        self.weight = nn.Parameter(torch.Tensor(output_dim, input_dim))
        #print("self.weight Shape:", self.weight.shape)
        self.bias = nn.Parameter(torch.Tensor(output_dim))

        # LORA specific parameters
        self.A = nn.Parameter(torch.Tensor(input_dim, rank))
        self.B = nn.Parameter(torch.Tensor(rank, output_dim))

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        nn.init.zeros_(self.bias)
        nn.init.normal_(self.A, 0, 0.02)
        nn.init.normal_(self.B, 0, 0.02)

    def forward(self, x):
        #print("LORALayer Input Shape:", x.shape)
        
        original_size = x.size()
        batch_size, seq_len, _ = x.shape
        x_flattened = x.reshape(-1, original_size[-1])

        # Compute lora_adjustment for each input in the batch
        lora_adjustment = self.alpha * (x_flattened @ self.A) @ self.B
        lora_adjustment = lora_adjustment.reshape(batch_size, seq_len, -1)

        #print("Adjusted lora_adjustment Shape:", lora_adjustment.shape)
        #print("self.weight Shape:", self.weight.shape)
        
        # Apply linear transformation to x_flattened
        x_transformed = nn.functional.linear(x_flattened, self.weight, self.bias)
        x_transformed = x_transformed.reshape(batch_size, seq_len, -1)

        # Add lora_adjustment to the transformed x
        x = x_transformed + lora_adjustment
        #print("LORALayer Output Shape:", x.shape)

        return x

class QLORALayer(nn.Module):
    def __init__(self, input_dim, output_dim, rank, alpha=1, quantization_bits=8):
        super(QLORALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha
        self.quantization_bits = quantization_bits

        # Original weight and bias
        self.weight = nn.Parameter(torch.Tensor(output_dim, input_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))

        # QLORA specific parameters
        self.A = nn.Parameter(torch.Tensor(input_dim, rank))
        self.B = nn.Parameter(torch.Tensor(rank, output_dim))

        self.dropout = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(output_dim)

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        nn.init.zeros_(self.bias)
        nn.init.normal_(self.A, 0, 0.02)
        nn.init.normal_(self.B, 0, 0.02)

    def quantize(self, x, num_bits):
        # Implement a simple quantization method
        scale = x.abs().max()
        x_quantized = torch.round(x / scale * (2**num_bits - 1))
        return x_quantized, scale

    def forward(self, x):
        #print("QLORALayer Input Shape:", x.shape)
        original_size = x.size()
        batch_size, seq_len, _ = x.shape
        x_flattened = x.reshape(-1, original_size[-1])

        A_quantized, scale_A = self.quantize(self.A, self.quantization_bits)
        B_quantized, scale_B = self.quantize(self.B, self.quantization_bits)

        # Compute lora_adjustment for each input in the batch
        lora_adjustment = self.alpha * (x_flattened @ (A_quantized / scale_A)) @ (B_quantized / scale_B)
        lora_adjustment = lora_adjustment.reshape(batch_size, seq_len, -1)
        lora_adjustment = self.dropout(lora_adjustment)
        #print("Adjusted lora_adjustment Shape:", lora_adjustment.shape)
        #print("self.weight Shape:", self.weight.shape)

        # Apply linear transformation to x_flattened
        x_transformed = nn.functional.linear(x_flattened, self.weight, self.bias)
        x_transformed = x_transformed.reshape(batch_size, seq_len, -1)

        # Add lora_adjustment to the transformed x
        x = x_transformed + lora_adjustment
        x = self.layer_norm(x)

        #print("QLORALayer Output Shape:", x.shape)

        return x
    
    def update_alpha(self, new_alpha):
        """
        Update the alpha scaling factor.
        """
        self.alpha = new_alpha

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Einsum does the matrix multiplication for query*keys for each training example
        # with every other training example, then sum it up
        attention = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        if mask is not None:
            attention = attention.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(attention / (self.embed_size ** (1 / 2)), dim=3)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion, rank):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            LORALayer(embed_size, forward_expansion * embed_size, rank),
            nn.ReLU(),
            LORALayer(forward_expansion * embed_size, embed_size, rank),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class LanguageModelDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length, rank):
        super(LanguageModelDecoder, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        # Adding BatchNorm layers
        self.bn1 = nn.BatchNorm1d(embed_size)
        self.bn2 = nn.BatchNorm1d(embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_size, heads, dropout, forward_expansion, rank)
                for _ in range(num_layers)
            ]
        )

        # QLORA layers
        self.qlora_feed_forward = nn.Sequential(
            QLORALayer(embed_size, forward_expansion * embed_size, rank),
            nn.ReLU(),
            QLORALayer(forward_expansion * embed_size, embed_size, rank),
        )
        self.use_qlora = False  # Flag to toggle QLORA

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Transpose for BatchNorm, apply batch normalization, and then transpose back
        x = x.transpose(1, 2)
        x = self.bn1(x)
        x = x.transpose(1, 2)

        for layer in self.layers:
            x = layer(x, x, x, trg_mask)
            if self.use_qlora:
                x = self.qlora_feed_forward(x)

        # Transpose for BatchNorm, apply batch normalization, and then transpose back
        x = x.transpose(1, 2)
        x = self.bn2(x)
        x = x.transpose(1, 2)

        out = self.fc_out(x)
        return out

    def toggle_qlora(self, use_qlora: bool):
        self.use_qlora = use_qlora


class LanguageModelTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0, max_length=100, rank=16):
        super(LanguageModelTransformer, self).__init__()

        self.decoder = LanguageModelDecoder(
            vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            max_length,
            rank,
        )

    def forward(self, trg):
        trg_mask = self.make_trg_mask(trg)
        out = self.decoder(trg, trg_mask)
        return out

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        ).to(trg.device)

        return trg_mask

# Define vocabulary size and dummy data parameters
NUM_WORDS = 1000  # Example vocabulary size
sequence_length = 30  # Sequence length for the LanguageDataset
dummy_data_size = 1000  # Total number of tokens in the dummy dataset



# Load dataset
dataset = load_dataset('wikipedia', '20220301.simple')

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab_size = tokenizer.vocab_size

def tokenize_function(examples):
    # Tokenize the text
    tokenized_output = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=sequence_length)
    
    # Shift input_ids to create labels and truncate the last token
    labels = [seq[1:] + [tokenizer.pad_token_id] for seq in tokenized_output['input_ids']]
    tokenized_output['labels'] = labels
    
    return tokenized_output

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

train_loader = DataLoader(tokenized_datasets['train'], batch_size=64, shuffle=True)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model instance
model = LanguageModelTransformer(
    vocab_size=vocab_size,  # Use the vocab size from the tokenizer
    embed_size=256,
    num_layers=6,
    forward_expansion=4,
    heads=8,
    dropout=0,
    max_length=100,
    rank=16
).to(device)


def calculate_new_alpha(current_loss, initial_loss, initial_alpha=1.0, final_alpha=0.1):
    """
    Calculate a new alpha value based on the current loss.
    """
    if current_loss >= initial_loss:
        return initial_alpha  # Keep initial alpha if loss isn't decreasing

    loss_ratio = current_loss / initial_loss
    alpha_range = initial_alpha - final_alpha
    new_alpha = final_alpha + (alpha_range * loss_ratio)
    return new_alpha

# Enable QLORA during training
model.decoder.toggle_qlora(True)

initial_loss = None
# Training loop
# Assuming model is an instance of LanguageModelTransformer
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-8, weight_decay=1e-4)
scheduler = StepLR(optimizer, step_size=4, gamma=0.98)
num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    model.train()
    model.decoder.toggle_qlora(True)
    total_loss = 0

    for batch_idx, batch in enumerate(train_loader):
        inputs = batch['input_ids'].to(device)
        targets = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

        # Check for NaN in loss
        if math.isnan(loss.item()):
            print("Encountered NaN loss, stopping training")
            break

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()

        total_loss += loss.item()

        # Set the initial_loss after the first batch of the first epoch
        if initial_loss is None and batch_idx == 0:
            initial_loss = loss.item()

    scheduler.step()

    # Check for NaN in total_loss
    if math.isnan(total_loss):
        print(f"Epoch {epoch+1}/{num_epochs} stopped due to NaN loss")
        break
    else:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

    # Average loss for the epoch
    average_loss = total_loss / len(train_loader)

    # Update alpha at the end of each epoch based on the average loss
    new_alpha = calculate_new_alpha(average_loss, initial_loss)
    for layer in model.modules():
        if isinstance(layer, QLORALayer):
            layer.update_alpha(new_alpha)

    #model.decoder.toggle_qlora(False)




Map:   0%|          | 0/205328 [00:00<?, ? examples/s]

Epoch 1/5, Loss: 10.485922775375542
Epoch 2/5, Loss: 10.48466368396173
Epoch 3/5, Loss: 10.483034437042265
Epoch 4/5, Loss: 10.481329586246593
Epoch 5/5, Loss: 10.47995846856231


In [3]:
import torch
from datasets import load_dataset

def format_stackexchange_dpo(samples):
    return {
        "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]],
        "chosen": samples["response_j"],   # Rated better than k
        "rejected": samples["response_k"], # Rated worse than j
    }

# Load and format a subset (30%) of the StackExchange DPO dataset
dataset = load_dataset("lvwerra/stack-exchange-paired")
subset_size = int(0.3 * len(dataset['train']))  # 30% of the dataset
subset_indices = torch.randperm(len(dataset['train'])).tolist()[:subset_size]  # Randomly select indices
formatted_dataset = dataset['train'].select(subset_indices).map(format_stackexchange_dpo, batched=True, load_from_cache_file=False)

# Convert formatted dataset to DataLoader for batch processing
dpo_dataloader = DataLoader(formatted_dataset, batch_size=64, shuffle=True)


Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Map:   0%|          | 0/8040549 [00:00<?, ? examples/s]

OSError: [Errno 28] No space left on device

In [None]:
from torch.nn import MarginRankingLoss

# Define DPO-specific loss function
dpo_loss_function = MarginRankingLoss(margin=1.0)
dpo_num_epochs = 2  # Define the number of epochs for DPO training

# DPO Training loop
for epoch in range(dpo_num_epochs):
    model.train()  # Ensure the model is in training mode
    total_dpo_loss = 0

    for batch in dpo_dataloader:
        optimizer.zero_grad()

        # Prepare the input for the model
        prompts = batch['prompt'].to(device)
        preferred_responses = batch['chosen'].to(device)
        less_preferred_responses = batch['rejected'].to(device)

        # Forward pass and model's scoring mechanism for responses
        # The model should output scores for the preferred and less-preferred responses
        output_preferred = model(preferred_responses)
        output_less_preferred = model(less_preferred_responses)

        # Compute DPO loss
        dpo_loss = dpo_loss_function(output_preferred, output_less_preferred, torch.ones(output_preferred.size(0)).to(device))
        total_dpo_loss += dpo_loss.item()

        # Backward pass and optimization
        dpo_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()

    print(f"Epoch {epoch+1}/{dpo_num_epochs}, DPO Loss: {total_dpo_loss / len(dpo_dataloader)}")


In [6]:
from torch.utils.data import DataLoader
from torch.nn import MarginRankingLoss
from datasets import load_dataset

def format_stackexchange_dpo(samples):
    """Format StackExchange dataset for DPO."""
    return {
        "prompt": [
            "Question: " + question + "\n\nAnswer: "
            for question in samples["question"]
        ],
        "chosen": samples["response_j"],   # Rated better than k
        "rejected": samples["response_k"], # Rated worse than j
    }

# Load the StackExchange DPO dataset
dataset = load_dataset("lvwerra/stack-exchange-paired")

# Format the dataset for DPO
formatted_dataset = dataset.map(format_stackexchange_dpo, batched=True)

# Display the first few formatted examples (optional)
for i in range(3):
    print(f"Sample {i+1}:")
    print("Prompt:", formatted_dataset['train'][i]['prompt'])
    print("Chosen Answer:", formatted_dataset['train'][i]['chosen'])
    print("Rejected Answer:", formatted_dataset['train'][i]['rejected'])
    print()


# Convert formatted dataset to DataLoader for batch processing
dpo_dataloader = DataLoader(formatted_dataset['train'], batch_size=64, shuffle=True)

# Define DPO-specific loss function
dpo_loss_function = MarginRankingLoss(margin=1.0)

dpo_num_epochs = 2

# DPO Training loop
for epoch in range(dpo_num_epochs):
    model.train()
    total_dpo_loss = 0

    for batch in dpo_dataloader:
        optimizer.zero_grad()

        prompts = batch['prompt'].to(device)
        preferred_responses = batch['chosen'].to(device)
        less_preferred_responses = batch['rejected'].to(device)

        # Forward pass and model's scoring mechanism for responses
        # Modify according to how your model outputs scores
        output_preferred = model(preferred_responses)  # Needs specific implementation
        output_less_preferred = model(less_preferred_responses)  # Needs specific implementation

        # Compute DPO loss
        dpo_loss = dpo_loss_function(output_preferred, output_less_preferred, torch.ones(output_preferred.size(0)).to(device))
        total_dpo_loss += dpo_loss.item()

        # Backward pass and optimization
        dpo_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()

    print(f"Epoch {epoch+1}/{dpo_num_epochs}, DPO Loss: {total_dpo_loss / len(dpo_dataloader)}")


Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/311M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/26801833 [00:00<?, ? examples/s]

OSError: [Errno 28] No space left on device

In [7]:
from datasets import load_dataset, set_caching_enabled
import torch
from torch.utils.data import DataLoader
from torch.nn import MarginRankingLoss
from transformers import BertTokenizer

# Set a custom cache directory or disable caching
set_caching_enabled(False)

def format_stackexchange_dpo(samples):
    """Format StackExchange dataset for DPO."""
    return {
        "prompt": [
            "Question: " + question + "\n\nAnswer: "
            for question in samples["question"]
        ],
        "chosen": samples["response_j"],   # Rated better than k
        "rejected": samples["response_k"], # Rated worse than j
    }

# Load and format the StackExchange DPO dataset
dataset = load_dataset("lvwerra/stack-exchange-paired")
formatted_dataset = dataset.map(format_stackexchange_dpo, batched=True, load_from_cache_file=False)

# Convert formatted dataset to DataLoader for batch processing
dpo_dataloader = DataLoader(formatted_dataset['train'], batch_size=64, shuffle=True)

# Initialize your model here (ensure it's already fine-tuned)
# Example: model = YourFineTunedModel()

# Define DPO-specific loss function
dpo_loss_function = MarginRankingLoss(margin=1.0)

# Define optimizer for DPO training
# Example: optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

dpo_num_epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DPO Training loop
for epoch in range(dpo_num_epochs):
    model.train()
    total_dpo_loss = 0

    for batch in dpo_dataloader:
        optimizer.zero_grad()

        prompts = batch['prompt'].to(device)
        preferred_responses = batch['chosen'].to(device)
        less_preferred_responses = batch['rejected'].to(device)

        # Forward pass and model's scoring mechanism for responses
        # Modify according to how your model outputs scores
        output_preferred = model(preferred_responses)  # Needs specific implementation
        output_less_preferred = model(less_preferred_responses)  # Needs specific implementation

        # Compute DPO loss
        dpo_loss = dpo_loss_function(output_preferred, output_less_preferred, torch.ones(output_preferred.size(0)).to(device))
        total_dpo_loss += dpo_loss.item()

        # Backward pass and optimization
        dpo_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()

    print(f"Epoch {epoch+1}/{dpo_num_epochs}, DPO Loss: {total_dpo_loss / len(dpo_dataloader)}")


  set_caching_enabled(False)


Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Map:   0%|          | 0/26801833 [00:00<?, ? examples/s]

OSError: [Errno 28] No space left on device

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn import MarginRankingLoss

# Define the formatting function
def format_stackexchange_dpo(samples):
    """Format StackExchange dataset for DPO."""
    return {
        "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]],
        "chosen": samples["response_j"],   # Rated better than k
        "rejected": samples["response_k"], # Rated worse than j
    }

# Load and format a subset (30%) of the StackExchange DPO dataset
dataset = load_dataset("lvwerra/stack-exchange-paired")
subset_size = int(0.3 * len(dataset['train']))  # 30% of the dataset
subset_indices = torch.randperm(len(dataset['train'])).tolist()[:subset_size]  # Randomly select indices
formatted_dataset = dataset['train'].select(subset_indices).map(format_stackexchange_dpo, batched=True, load_from_cache_file=False)

# Convert the formatted dataset to DataLoader for batch processing
dpo_dataloader = DataLoader(formatted_dataset, batch_size=64, shuffle=True)

# Define DPO-specific loss function
dpo_loss_function = MarginRankingLoss(margin=1.0)
dpo_num_epochs = 2

# Initialize the model and optimizer here

# DPO Training loop
for epoch in range(dpo_num_epochs):
    model.train()
    total_dpo_loss = 0

    for batch in dpo_dataloader:
        optimizer.zero_grad()
        # Your forward pass and loss calculation here

        # Backward pass and optimization
        dpo_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()

        total_dpo_loss += dpo_loss.item()

    print(f"Epoch {epoch+1}/{dpo_num_epochs}, DPO Loss: {total_dpo_loss / len(dpo_dataloader)}")


In [None]:
# Assuming DPO dataset is loaded and formatted as dpo_dataset
# and model is already fine-tuned
from datasets import load_dataset

def format_stackexchange_dpo(samples):
    """Format StackExchange dataset for DPO."""
    return {
        "prompt": [
            "Question: " + question + "\n\nAnswer: "
            for question in samples["question"]
        ],
        "chosen": samples["response_j"],   # Rated better than k
        "rejected": samples["response_k"], # Rated worse than j
    }

# Load the StackExchange DPO dataset
dataset = load_dataset("lvwerra/stack-exchange-paired")

# Format the dataset for DPO
formatted_dataset = dataset.map(format_stackexchange_dpo, batched=True)

# Display the first few formatted examples (optional)
for i in range(3):
    print(f"Sample {i+1}:")
    print("Prompt:", formatted_dataset['train'][i]['prompt'])
    print("Chosen Answer:", formatted_dataset['train'][i]['chosen'])
    print("Rejected Answer:", formatted_dataset['train'][i]['rejected'])
    print()

# Define DPO-specific loss function
def dpo_loss_function(model_output_preferred, model_output_less_preferred):
    # Implement a loss function that penalizes the model
    # when preferred response is scored lower than the less-preferred
    # For example, you could use margin ranking loss
    return loss

# DPO Training loop
for epoch in range(dpo_num_epochs):
    model.train()
    total_dpo_loss = 0

    for batch in dpo_dataset:
        optimizer.zero_grad()

        prompts = batch['prompt'].to(device)
        preferred_responses = batch['chosen'].to(device)
        less_preferred_responses = batch['rejected'].to(device)

        # Forward pass for both response types
        output_preferred = model(preferred_responses)
        output_less_preferred = model(less_preferred_responses)

        # Compute DPO loss
        dpo_loss = dpo_loss_function(output_preferred, output_less_preferred)
        total_dpo_loss += dpo_loss.item()

        # Backward pass and optimization
        dpo_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()

    print(f"Epoch {epoch+1}/{dpo_num_epochs}, DPO Loss: {total_dpo_loss / len(dpo_dataset)}")
