In [None]:
# Path of the LoRA adapter directory.
# NOTE(review): no later cell in view reads `adapter_path`; the merge cells define
# their own `lora_adapter` with the same value — confirm whether this is still needed.
adapter_path = "loraa"

In [None]:
!pip install peft


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [None]:
# Merge the LoRA adapter into the base model and save a standalone checkpoint.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel
import torch

# Configuration
base_model_name = "allenai/led-large-16384"  # Replace with actual model name/path
lora_adapter = "loraa"     # Replace with LoRA adapter directory
merged_model_path = "./merged_model"        # Output directory for merged model
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base weights, attach the adapter, then fold the adapter into the base weights.
# The model id is kept in `base_model_name` so `base_model` always refers to the model object.
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name).to(device)
model_to_merge = PeftModel.from_pretrained(base_model, lora_adapter)
merged_model = model_to_merge.merge_and_unload()

# Save merged model together with the base tokenizer: later cells load the tokenizer
# from `merged_model_path`, so the directory has to be loadable on its own.
merged_model.save_pretrained(merged_model_path)
AutoTokenizer.from_pretrained(base_model_name).save_pretrained(merged_model_path)
print(f"Successfully saved merged model to: {merged_model_path}")


config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Successfully saved merged model to: ./merged_model


In [None]:
### Quantize the merged model

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import time
import numpy as np
import os

def _load_fp16_model(model_path):
    """Load the checkpoint at ``model_path`` and cast its weights to float16."""
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model.half(), tokenizer


def quantize_model(model_path, quantized_model_path, quantization_type="fp16", device="cpu"):
    """Load a model in reduced precision and save it to ``quantized_model_path``.

    Args:
        model_path: Directory (or model id) to load the model and tokenizer from.
        quantized_model_path: Output directory; created if it does not exist.
        quantization_type: ``"fp16"`` casts the weights with ``model.half()``;
            ``"int8"`` / ``"int4"`` load the model through bitsandbytes.
        device: When ``"cuda"``, int8/int4 loading uses ``device_map="auto"``.

    Returns:
        ``(model, tokenizer)`` for the saved model.

    Raises:
        ValueError: If ``quantization_type`` is not one of the supported values.

    Note:
        If int8/int4 loading fails for any reason (bitsandbytes missing, or any
        other error), the function prints the reason and falls back to fp16, so
        the returned/saved model may be fp16 even when int8/int4 was requested.
    """
    print(f"Loading model for quantization from: {model_path}")

    # Reject unknown types before doing any expensive loading.
    if quantization_type not in ("fp16", "int8", "int4"):
        raise ValueError(f"Unsupported quantization type: {quantization_type}")

    if quantization_type == "fp16":
        print("Loading model for fp16 quantization...")
        model, tokenizer = _load_fp16_model(model_path)
        print("Successfully converted model to fp16")
    else:
        try:
            print(f"Attempting to load model with {quantization_type} quantization...")

            # Imported only to fail fast with ImportError when bitsandbytes is missing.
            import bitsandbytes  # noqa: F401
            from transformers import BitsAndBytesConfig

            if quantization_type == "int8":
                # The 8-bit path has no compute-dtype option; only the 4-bit path does.
                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
            else:  # int4
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float16
                )

            # Load directly with quantization config
            model = AutoModelForSeq2SeqLM.from_pretrained(
                model_path,
                quantization_config=quantization_config,
                device_map="auto" if device == "cuda" else None
            )
            tokenizer = AutoTokenizer.from_pretrained(model_path)

            print(f"Successfully loaded model with {quantization_type} quantization")

        except ImportError:
            print("bitsandbytes library not found. Falling back to fp16 quantization.")
            model, tokenizer = _load_fp16_model(model_path)

        except Exception as e:
            # Deliberate best-effort: report the failure and still produce an fp16 model.
            print(f"Error during {quantization_type} quantization: {e}")
            print("Falling back to fp16 quantization")
            model, tokenizer = _load_fp16_model(model_path)

    # Save the quantized model
    os.makedirs(quantized_model_path, exist_ok=True)
    print(f"Saving quantized model to: {quantized_model_path}")
    model.save_pretrained(quantized_model_path)
    tokenizer.save_pretrained(quantized_model_path)

    print(f"Quantized model saved to: {quantized_model_path}")
    return model, tokenizer

def run_inference(model, tokenizer, input_texts, device="cpu", batch_size=1, max_length=128):
    """Run ``model.generate`` over ``input_texts`` in batches and time each batch.

    Args:
        model: Model exposing ``eval()`` and ``generate(**inputs, max_length=...)``.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` tensor, and
            providing ``batch_decode``.
        input_texts: List of input strings.
        device: Target device. Any value whose string form starts with ``"cuda"``
            enables moving the model/inputs to that device and CUDA synchronization.
        batch_size: Number of texts per ``generate`` call; must be >= 1.
        max_length: Forwarded to ``model.generate`` (default 128, as before).

    Returns:
        Dict with ``avg_inference_time`` (seconds per batch), ``tokens_per_second``,
        ``total_tokens`` and ``device``. ``total_tokens`` counts the elements of the
        tokenized *input* ids (padding included), not generated tokens, so
        ``tokens_per_second`` is input tokens divided by total generation time.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    use_cuda = str(device).startswith("cuda")

    # Move model to the specified device if not already there
    if use_cuda and next(model.parameters()).device.type != "cuda":
        model = model.to(device)
    model.eval()

    total_tokens = 0
    inference_times = []

    # Process inputs in batches
    for i in range(0, len(input_texts), batch_size):
        batch = input_texts[i:i+batch_size]

        # Tokenize inputs
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

        # Move inputs to device
        if use_cuda:
            inputs = {k: v.to(device) for k, v in inputs.items()}

        total_tokens += inputs["input_ids"].numel()

        # Synchronize around the timed region so queued GPU work is included.
        if use_cuda:
            torch.cuda.synchronize()
        # perf_counter is monotonic and high resolution; time.time() can jump.
        start_time = time.perf_counter()

        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length)

        if use_cuda:
            torch.cuda.synchronize()
        inference_time = time.perf_counter() - start_time
        inference_times.append(inference_time)

        # Convert model outputs to text
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        print(f"Batch {i//batch_size + 1} inference time: {inference_time:.4f} seconds")
        print(f"Sample output: {decoded_outputs[0][:100]}...")

    # Calculate statistics
    total_time = sum(inference_times)
    avg_time = np.mean(inference_times) if inference_times else 0
    tokens_per_second = total_tokens / total_time if total_time > 0 else 0

    print("\nInference Performance Summary:")
    print(f"Average inference time: {avg_time:.4f} seconds per batch")
    print(f"Tokens per second: {tokens_per_second:.2f}")

    return {
        "avg_inference_time": avg_time,
        "tokens_per_second": tokens_per_second,
        "total_tokens": total_tokens,
        "device": device
    }

def compare_model_sizes(original_path, quantized_path):
    """Print and return the on-disk footprint of two model directories.

    Every file under each directory (recursively) is summed; files whose size
    cannot be read are reported and skipped.

    Returns:
        Dict with ``original_size_mb``, ``quantized_size_mb`` and
        ``reduction_percentage`` (0 when the original directory is empty).
    """
    bytes_per_mb = 1024 * 1024

    def directory_megabytes(root):
        # Walk the tree and accumulate file sizes in bytes.
        byte_count = 0
        for current_dir, _subdirs, file_names in os.walk(root):
            for file_name in file_names:
                file_path = os.path.join(current_dir, file_name)
                try:
                    byte_count += os.path.getsize(file_path)
                except OSError as e:
                    print(f"Error accessing {file_path}: {e}")
        return byte_count / bytes_per_mb

    original_size = directory_megabytes(original_path)
    quantized_size = directory_megabytes(quantized_path)

    print("\nModel Size Comparison:")
    print(f"Original model size: {original_size:.2f} MB")
    print(f"Quantized model size: {quantized_size:.2f} MB")

    # Guard first: a zero-sized original makes the ratio undefined.
    if original_size <= 0:
        reduction_percentage = 0
        print("Could not calculate size reduction percentage (original size is 0)")
    else:
        reduction_percentage = (1 - quantized_size/original_size) * 100
        print(f"Size reduction: {reduction_percentage:.2f}%")

    return {
        "original_size_mb": original_size,
        "quantized_size_mb": quantized_size,
        "reduction_percentage": reduction_percentage
    }

def _has_saved_weights(model_dir):
    """Return True if ``model_dir`` holds a weights file (safetensors or .bin, single or sharded)."""
    weight_files = (
        "model.safetensors",
        "model.safetensors.index.json",
        "pytorch_model.bin",
        "pytorch_model.bin.index.json",
    )
    return any(os.path.isfile(os.path.join(model_dir, name)) for name in weight_files)


def load_or_merge_model(base_model_name, lora_adapter_path, merged_model_path):
    """Load a pre-merged model or merge base model with LoRA adapter.

    Args:
        base_model_name: Model id or path of the base model; the tokenizer is
            always loaded from here.
        lora_adapter_path: Directory of the LoRA adapter to merge.
        merged_model_path: Directory used both as cache (read) and output (write).

    Returns:
        ``(model, tokenizer)``.

    Raises:
        Exception: Whatever ``from_pretrained`` raises when the base model cannot
            be loaded (re-raised after printing).

    Note:
        If the adapter directory is missing or merging fails, the *unmerged* base
        model is saved to ``merged_model_path`` and returned. A later call will
        then find weights there and reuse that base-model checkpoint.
    """
    print(f"Loading or merging model...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Reuse an existing checkpoint. Checking only for "pytorch_model.bin" misses
    # checkpoints saved as "model.safetensors", which forced a re-merge every run.
    if os.path.isdir(merged_model_path) and _has_saved_weights(merged_model_path):
        print(f"Loading pre-merged model from: {merged_model_path}")
        try:
            model = AutoModelForSeq2SeqLM.from_pretrained(merged_model_path)
            return model, tokenizer
        except Exception as e:
            print(f"Error loading pre-merged model: {e}")
            print("Will attempt to merge models again...")

    # Load base model
    print(f"Loading base model: {base_model_name}")
    try:
        base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
    except Exception as e:
        print(f"Error loading base model: {e}")
        raise

    # Check if LoRA adapter exists and attempt to merge
    if os.path.exists(lora_adapter_path):
        print(f"Loading LoRA adapter: {lora_adapter_path}")
        try:
            # Import PEFT for LoRA
            from peft import PeftModel

            # Load and merge
            model = PeftModel.from_pretrained(base_model, lora_adapter_path)
            print("Merging model with adapter...")
            merged_model = model.merge_and_unload()

            # Save the merged model
            os.makedirs(merged_model_path, exist_ok=True)
            print(f"Saving merged model to: {merged_model_path}")
            merged_model.save_pretrained(merged_model_path)
            tokenizer.save_pretrained(merged_model_path)
            print(f"Merged model saved to: {merged_model_path}")

            return merged_model, tokenizer
        except Exception as e:
            # Deliberate best-effort fallback; see the Note in the docstring.
            print(f"Error merging models: {e}")
            print("Continuing with base model...")
    else:
        print(f"LoRA adapter path {lora_adapter_path} does not exist, using base model")

    # If we reach here, use the base model
    os.makedirs(merged_model_path, exist_ok=True)
    print(f"Saving base model to: {merged_model_path}")
    base_model.save_pretrained(merged_model_path)
    tokenizer.save_pretrained(merged_model_path)

    return base_model, tokenizer

def main():
    """Merge, quantize and benchmark the model end to end.

    Obtains the merged model via ``load_or_merge_model``, writes an fp16 copy
    via ``quantize_model``, compares on-disk sizes, runs ``run_inference`` on
    both models with two synthetic prompts, and prints the relative changes.
    """
    # Configuration
    hub_model_id = "allenai/led-large-16384"      # Base model on HuggingFace
    adapter_dir = "loraa"                         # Path to LoRA adapter
    merged_model_path = "./merged_model"          # Output directory for merged model
    quantized_model_path = "./quantized_model"    # Output directory for quantized model
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"Using device: {device}")

    # Step 1: obtain the merged model (cached or freshly merged)
    merged_model, tokenizer = load_or_merge_model(hub_model_id, adapter_dir, merged_model_path)

    # Step 2: quantize from the saved merged checkpoint.
    # "fp16" is the most compatible option; "int8" / "int4" require bitsandbytes.
    quantized_model, tokenizer = quantize_model(
        merged_model_path,
        quantized_model_path,
        quantization_type="fp16",
        device=device,
    )

    # Step 3: on-disk size comparison
    size_comparison = compare_model_sizes(merged_model_path, quantized_model_path)

    # Step 4: time generation on both models with the same prompts
    print("\n=== Running inference test on original (merged) model ===")
    filler_content = " ".join(["content"] * 100)
    filler_text = " ".join(["text"] * 100)
    test_inputs = [
        "This is a test input for the model to summarize: " + filler_content,
        "Another test input with different content to process: " + filler_text,
    ]

    original_perf = run_inference(merged_model, tokenizer, test_inputs, device, batch_size=1)

    print("\n=== Running inference test on quantized model ===")
    quantized_perf = run_inference(quantized_model, tokenizer, test_inputs, device, batch_size=1)

    # Step 5: relative changes, each guarded against a zero baseline
    baseline_time = original_perf["avg_inference_time"]
    time_improvement = (
        (baseline_time - quantized_perf["avg_inference_time"]) / baseline_time * 100
        if baseline_time > 0
        else 0
    )

    baseline_throughput = original_perf["tokens_per_second"]
    throughput_improvement = (
        (quantized_perf["tokens_per_second"] - baseline_throughput) / baseline_throughput * 100
        if baseline_throughput > 0
        else 0
    )

    print("\n=== Performance Comparison ===")
    print(f"Inference time improvement: {time_improvement:.2f}%")
    print(f"Throughput improvement: {throughput_improvement:.2f}%")
    print(f"Size reduction: {size_comparison['reduction_percentage']:.2f}%")

if __name__ == "__main__":
    main()

Using device: cpu
Loading or merging model...
Loading base model: allenai/led-large-16384
Loading LoRA adapter: loraa
Merging model with adapter...
Saving merged model to: ./merged_model
Merged model saved to: ./merged_model
Loading model for quantization from: ./merged_model
Loading model for fp16 quantization...
Successfully converted model to fp16
Saving quantized model to: ./quantized_model
Quantized model saved to: ./quantized_model

Model Size Comparison:
Original model size: 1758.86 MB
Quantized model size: 881.76 MB
Size reduction: 49.87%

=== Running inference test on original (merged) model ===


Input ids are automatically padded from 113 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 111 to 1024 to be a multiple of `config.attention_window`: 1024


Batch 1 inference time: 30.5619 seconds
Sample output: ThisThis is a test input for the model to summarize: content content content content content content...
Batch 2 inference time: 28.5985 seconds
Sample output: AnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAn...

Inference Performance Summary:
Average inference time: 29.5802 seconds per batch
Tokens per second: 3.79

=== Running inference test on quantized model ===
Batch 1 inference time: 100.0436 seconds
Sample output: ThisThis is a test input for the model to summarize: content content content content content content...
Batch 2 inference time: 97.5030 seconds
Sample output: AnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAnotherAn...

Inference Performance Summary:
Average inference time: 98.7733 seconds per batch
Tokens per second: 1.13

=== Performance Comparison ===
Inference time improvement: -233.92%
Throughput improvement: -70.05%

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import time
import numpy as np
import os

def run_inference_comparison(merged_model_path, quantized_model_path):
    """Run inference comparison on both original and quantized models using GPU.

    Each model is loaded, measured and benchmarked on its own, then released
    before the next one is loaded. Measuring "original" memory while both
    models are resident would count the quantized model too and inflate the
    reported reduction.

    Args:
        merged_model_path: Directory of the original (merged) model; the
            tokenizer is loaded from here as well.
        quantized_model_path: Directory of the quantized model.

    Returns:
        ``None`` when CUDA is unavailable; otherwise a dict with
        ``original_perf``, ``quantized_perf``, ``time_improvement``,
        ``throughput_improvement`` and ``memory_reduction`` (percentages).
    """

    # Check for GPU availability
    if not torch.cuda.is_available():
        print("CUDA is not available. Cannot run GPU inference comparison.")
        return

    device = "cuda"
    print(f"Using device: {device}")

    print(f"Loading tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(merged_model_path)

    # Prepare test inputs
    test_inputs = [
        "This is a test input for the model to summarize: " + " ".join(["content"] * 100),
        "Another test input with different content to process: " + " ".join(["text"] * 100)
    ]

    def _load_and_benchmark(label, model_path):
        """Load one model on the GPU, record the memory it adds, benchmark it, free it."""
        torch.cuda.empty_cache()
        baseline_bytes = torch.cuda.memory_allocated()

        print(f"Loading {label} model from: {model_path}")
        # torch_dtype="auto" keeps the dtype stored with the checkpoint; without it
        # an fp16 checkpoint is loaded at the default dtype and the comparison is moot.
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path, torch_dtype="auto")
        model = model.to(device)

        # Memory attributable to this model's weights only.
        memory_mb = (torch.cuda.memory_allocated() - baseline_bytes) / (1024 * 1024)

        print(f"\n=== Running inference test on {label} model ===")
        perf = run_inference(model, tokenizer, test_inputs, device, batch_size=1)

        # Release the model so the next measurement starts from a clean baseline.
        del model
        torch.cuda.empty_cache()
        return perf, memory_mb

    original_perf, original_memory = _load_and_benchmark("original", merged_model_path)
    quantized_perf, quantized_memory = _load_and_benchmark("quantized", quantized_model_path)

    # Report performance comparison
    if original_perf["avg_inference_time"] > 0:
        time_improvement = (original_perf["avg_inference_time"] - quantized_perf["avg_inference_time"]) / original_perf["avg_inference_time"] * 100
    else:
        time_improvement = 0

    if original_perf["tokens_per_second"] > 0:
        throughput_improvement = (quantized_perf["tokens_per_second"] - original_perf["tokens_per_second"]) / original_perf["tokens_per_second"] * 100
    else:
        throughput_improvement = 0

    memory_reduction = (1 - quantized_memory/original_memory) * 100 if original_memory > 0 else 0

    print("\n=== Performance Comparison ===")
    print(f"Inference time improvement: {time_improvement:.2f}%")
    print(f"Throughput improvement: {throughput_improvement:.2f}%")
    print(f"GPU Memory usage (original): {original_memory:.2f} MB")
    print(f"GPU Memory usage (quantized): {quantized_memory:.2f} MB")
    print(f"Memory reduction: {memory_reduction:.2f}%")

    return {
        "original_perf": original_perf,
        "quantized_perf": quantized_perf,
        "time_improvement": time_improvement,
        "throughput_improvement": throughput_improvement,
        "memory_reduction": memory_reduction
    }

def run_inference(model, tokenizer, input_texts, device="cuda", batch_size=1, max_length=128):
    """Run inference on a list of input texts and measure performance.

    A short warm-up generation runs first so one-off start-up cost is not
    charged to the first timed batch.

    Args:
        model: Model exposing ``eval()`` and ``generate(**inputs, max_length=...)``;
            expected to already live on ``device``.
        tokenizer: Callable whose result supports ``.to(device)``, ``.input_ids``
            and ``**`` unpacking, and which provides ``batch_decode``.
        input_texts: List of input strings.
        device: Device the inputs are moved to. CUDA synchronization is only
            performed when its string form starts with ``"cuda"``, so the
            function also works on CPU.
        batch_size: Number of texts per ``generate`` call.
        max_length: Forwarded to ``model.generate`` for the timed runs
            (default 128, as before).

    Returns:
        Dict with ``avg_inference_time`` (seconds per batch), ``tokens_per_second``,
        ``total_tokens`` and ``device``. ``total_tokens`` counts tokenized *input*
        ids (padding included), not generated tokens.
    """
    # Synchronizing unconditionally raises on machines without CUDA.
    use_cuda = str(device).startswith("cuda")

    model.eval()

    total_tokens = 0
    inference_times = []

    # Warm-up run to avoid cold-start effects
    print("Performing warm-up run...")
    with torch.no_grad():
        warm_input = tokenizer("Warm-up text", return_tensors="pt").to(device)
        model.generate(**warm_input, max_length=20)

    # Process inputs in batches
    for i in range(0, len(input_texts), batch_size):
        batch = input_texts[i:i+batch_size]

        # Tokenize inputs
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        total_tokens += inputs.input_ids.numel()

        # Synchronize around the timed region so queued GPU work is included.
        if use_cuda:
            torch.cuda.synchronize()
        # perf_counter is monotonic and high resolution; time.time() can jump.
        start_time = time.perf_counter()

        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length)

        if use_cuda:
            torch.cuda.synchronize()
        inference_time = time.perf_counter() - start_time
        inference_times.append(inference_time)

        # Convert model outputs to text
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        print(f"Batch {i//batch_size + 1} inference time: {inference_time:.4f} seconds")
        print(f"Sample output: {decoded_outputs[0][:100]}...")

    # Calculate statistics
    total_time = sum(inference_times)
    avg_time = np.mean(inference_times) if inference_times else 0
    tokens_per_second = total_tokens / total_time if total_time > 0 else 0

    print("\nInference Performance Summary:")
    print(f"Average inference time: {avg_time:.4f} seconds per batch")
    print(f"Tokens per second: {tokens_per_second:.2f}")

    return {
        "avg_inference_time": avg_time,
        "tokens_per_second": tokens_per_second,
        "total_tokens": total_tokens,
        "device": device
    }

if __name__ == "__main__":
    # Checkpoint directories written by the merge and quantization steps
    merged_model_path = "./merged_model"        # original merged model
    quantized_model_path = "./quantized_model"  # quantized model

    # Benchmark both checkpoints on the GPU
    run_inference_comparison(
        merged_model_path=merged_model_path,
        quantized_model_path=quantized_model_path,
    )

CUDA is not available. Cannot run GPU inference comparison.


In [None]:
### Run DPO on the merged model

In [None]:
# prompt: do dpo on merged model

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from trl import DPOTrainer
from datasets import load_dataset

# ... (rest of your existing code)

def dpo_on_merged_model(merged_model_path, output_dir, dataset_name, dataset_config):
    """Performs DPO on a merged model.

    Args:
        merged_model_path: Directory holding the merged model and its tokenizer.
        output_dir: Directory for checkpoints and the final model.
        dataset_name: Name passed to ``datasets.load_dataset``.
        dataset_config: Configuration name passed to ``datasets.load_dataset``.

    Raises:
        ValueError: If the train split lacks a "prompt", "chosen" or "rejected" column.
    """
    # Local imports: this cell's header does not import a training-arguments class
    # (the previous code referenced `TrainingArguments` without importing it).
    import inspect
    from trl import DPOConfig

    # Load the merged model and tokenizer. No explicit .to("cuda"): device placement
    # is left to the trainer so the function does not hard-require a GPU at load time.
    model = AutoModelForSeq2SeqLM.from_pretrained(merged_model_path)
    tokenizer = AutoTokenizer.from_pretrained(merged_model_path)

    # Load the dataset for DPO training
    dataset = load_dataset(dataset_name, dataset_config, split="train")

    # The trainer is handed the raw preference columns. The previous code tokenized
    # them itself and then removed them, leaving nothing for DPO to compare.
    required_columns = {"prompt", "chosen", "rejected"}
    missing_columns = required_columns - set(dataset.column_names)
    if missing_columns:
        raise ValueError(
            f"Dataset is missing required DPO columns: {sorted(missing_columns)}"
        )

    training_args = DPOConfig(
        output_dir=output_dir,
        per_device_train_batch_size=4,  # Adjust batch size as needed
        gradient_accumulation_steps=4,  # Adjust for GPU memory
        num_train_epochs=3,             # Adjust number of epochs
        save_steps=1000,
        logging_steps=100,
        learning_rate=5e-5,             # Adjust learning rate
        fp16=torch.cuda.is_available(), # Mixed precision only when a GPU is present
    )

    # NOTE(review): the tokenizer keyword differs between TRL releases
    # ("processing_class" vs "tokenizer") — confirm against the installed version.
    trainer_params = inspect.signature(DPOTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    # Create the DPOTrainer
    dpo_trainer = DPOTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        **{tokenizer_kwarg: tokenizer},
    )

    # Train the model with DPO
    dpo_trainer.train()

    # Save the fine-tuned model
    dpo_trainer.save_model(output_dir)

# Example usage.
# NOTE(review): "your_dataset" / "your_config" look like placeholders — replace them
# with a real dataset name and configuration before running this cell.
dpo_on_merged_model(merged_model_path="./merged_model", output_dir="./dpo_model", dataset_name="your_dataset", dataset_config="your_config")
