In [3]:
import torch
import torch.nn as nn
import torch.quantization

# 1. Define a smaller, simpler Transformer block
class SmallTransformerBlock(nn.Module):
    def __init__(self, hidden_size, num_heads, intermediate_size):  # Added intermediate_size
        super().__init__()
        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size),  # Smaller intermediate layer
            nn.ReLU(),
            nn.Linear(intermediate_size, hidden_size)
        )
        self.norm2 = nn.LayerNorm(hidden_size)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        x = x + attn_output
        x = self.norm1(x)
        ff_output = self.feed_forward(x)
        x = x + ff_output
        x = self.norm2(x)
        return x

# 2. Create the smaller LLM
class SmallLLM(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, num_heads, intermediate_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.layers = nn.ModuleList([
            SmallTransformerBlock(hidden_size, num_heads, intermediate_size)
            for _ in range(num_layers)
        ])
        self.lm_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        x = self.lm_head(x)
        return x

# 3. Instantiate the model with smaller parameters
vocab_size = 10000  # Example vocabulary size
hidden_size = 128   # Reduced hidden size
num_layers = 4      # Fewer layers
num_heads = 4       # Fewer attention heads
intermediate_size = 256 # Smaller intermediate size in feed-forward layer

model = SmallLLM(vocab_size, hidden_size, num_layers, num_heads, intermediate_size)

# 4. Quantization (Post-Training Dynamic Quantization)
model.eval()  # Important: Set to evaluation mode before quantization
quantized_model = torch.quantization.quantize_dynamic(
    model,  # the original model
    {nn.Linear},  # a set of quantizable modules
    dtype=torch.qint8  # the target dtype for quantized weights
)

# Example Usage (Inference):
input_ids = torch.randint(0, vocab_size, (1, 64)) # Batch size 1, sequence length 64
with torch.no_grad(): # Disable gradients for inference
    outputs = quantized_model(input_ids)
    # Process outputs...


# --- Optional: Pruning (Illustrative) ---
# (Requires a training loop to determine importances)
# import torch.nn.utils.prune as prune

# # Example: Prune 20% of weights in the first linear layer of the first block
# module = model.layers[0].feed_forward[0]
# prune.l1_unstructured(module, name="weight", amount=0.2)
# prune.remove(module, 'weight') # Permanently remove pruned weights

# --- Optional: Export to ONNX or other formats for deployment ---
# torch.onnx.export(quantized_model, (input_ids,), "small_llm.onnx")


print("Model Size (original):", sum(p.numel() for p in model.parameters()))
print("Model Size (quantized):", sum(p.numel() for p in quantized_model.parameters()))

large_model = model
small_model = quantized_model


# You'll likely see a reduction in the number of parameters after quantization.
# The actual memory footprint reduction will be more significant due to int8 storage.

Model Size (original): 3099920
Model Size (quantized): 1546240


In [5]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer  # Or your preferred tokenizer

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",  # Pad to max_length
            truncation=True,        # Truncate if longer than max_length
            max_length=self.max_length,
            return_tensors="pt"     # Return PyTorch tensors
        )
        input_ids = encoding["input_ids"].squeeze() # Remove batch dimension
        labels = input_ids.clone()  # Labels are the same as input_ids for LM
        # Mask padding tokens in labels. -100 is commonly used for this.
        labels[encoding["attention_mask"].squeeze() == 0] = -100
        return {"input_ids": input_ids, "labels": labels}


def create_test_dataloader(texts, tokenizer_name, max_length, batch_size):
    """Creates a DataLoader for the test set."""

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    dataset = TextDataset(texts, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False) # No need to shuffle test data
    return dataloader, tokenizer


# Example Usage:
texts = [
    "This is the first test sentence.",
    "Another example sentence for testing.",
    "This is a longer sentence to demonstrate truncation.",  # Example of long sentence
    # ... more test sentences ...
]

tokenizer_name = "bert-base-uncased"  # Or your model's tokenizer
max_length = 128  # Adjust as needed
batch_size = 32 # Adjust based on your memory and needs

test_dataloader, tokenizer = create_test_dataloader(texts, tokenizer_name, max_length, batch_size)


# Now you can use test_dataloader in your evaluation loop:
# for batch in test_dataloader:
#     input_ids = batch['input_ids']
#     labels = batch['labels']
#     # ... (Rest of your evaluation code)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use GPU
    print(f"Using device: {device}")
else:
    device = torch.device("cpu")  # Use CPU
    print(f"Using device: {device}")

Using device: cpu


In [8]:
import torch
import numpy as np

def calculate_perplexity(model, dataloader, device):
    model.eval()
    total_loss = 0
    total_words = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device) # Assuming labels are provided

            outputs = model(input_ids)
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100) # -100 is often padding
            loss = loss_fct(outputs.logits.view(-1, outputs.logits.size(-1)), labels.view(-1))

            total_loss += loss.item() * input_ids.size(0) # Multiply by batch size
            total_words += torch.sum(labels != -100).item() # Count non-padding tokens

    avg_loss = total_loss / total_words
    perplexity = np.exp(avg_loss)
    return perplexity

# Example usage:
perplexity_small = calculate_perplexity(small_model, test_dataloader, device)
perplexity_large = calculate_perplexity(large_model, test_dataloader, device)

print(f"Perplexity (Small Model): {perplexity_small}")
print(f"Perplexity (Large Model): {perplexity_large}")

IndexError: index out of range in self

In [10]:
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM  # Or your preferred model

# 1. Prepare your data
#class TextDataset(Dataset):
    #... (same as before)

# Example usage:
texts = [
    "This is the first test sentence.",
    "Another example sentence for testing.",
    "This is a longer sentence to demonstrate truncation.",
    #... more test sentences...
]
tokenizer_name = "gpt2"  # Example tokenizer
max_length = 128       # Adjust as needed
batch_size = 16        # Adjust as needed
test_dataloader = create_test_dataloader(texts, tokenizer_name, max_length, batch_size)


# 2. Load your language model
model_name = "gpt2"  # Or your fine-tuned model
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set the model to evaluation mode


# 3. Calculate perplexity
def calculate_perplexity(model, dataloader, device):
    model.eval()
    total_loss = 0
    total_words = 0
    loss_fct = CrossEntropyLoss(ignore_index=-100)  # -100 is for padding tokens

    with torch.no_grad():  # No need to calculate gradients
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, labels=labels)  # Get model outputs
            loss = outputs.loss  # Directly get the loss from the model's output

            # If your model doesn't directly return the loss:
            # logits = outputs.logits
            # loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

            total_loss += loss.item() * input_ids.size(0)  # Accumulate loss
            total_words += torch.sum(labels!= -100).item()  # Count non-padding tokens

    avg_loss = total_loss / total_words
    perplexity = torch.exp(torch.tensor(avg_loss))  # Calculate perplexity
    return perplexity.item()  # Return as a Python number

# Calculate and print the perplexity
perplexity = calculate_perplexity(model, test_dataloader, device)
print(f"Perplexity: {perplexity}")

2025-02-20 02:49:17.816622: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9373] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-20 02:49:17.816650: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-20 02:49:17.817627: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1534] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-20 02:49:17.822997: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TypeError: 'DataLoader' object is not subscriptable