In [None]:
!pip install transformers peft accelerate bitsandbytes torch -q


In [None]:
!pip install optimum -q

In [None]:
import torch
import time
import numpy as np
import json
import gc
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig
)
from peft import PeftModel, PeftConfig

# Start from a clean slate: drop cached CUDA allocations, then run the Python
# garbage collector.
torch.cuda.empty_cache()
gc.collect()

# Report the GPU at index 0. The second line prints the device's *total*
# memory (converted from bytes to GiB), not the currently free amount.
_gpu_index = 0
print(f"GPU: {torch.cuda.get_device_name(_gpu_index)}")
_total_bytes = torch.cuda.get_device_properties(_gpu_index).total_memory
print(f"Memory available: {_total_bytes / 1024**3:.2f} GB")

# **Configuration**

In [None]:
# Google Drive must already be mounted under /content/drive before MODEL_PATH
# is used; this cell only declares the path, it does not perform the mount.

# Directory (not a single file) holding the fine-tuned model in Drive.
MODEL_PATH = "/content/drive/MyDrive/my_finetuned_codegen_model"
# Alternative locations - uncomment the one that matches where the model was saved:
# MODEL_PATH = "/content/drive/MyDrive/models/codegen_finetuned_lora"
# MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/codegen_finetuned_lora"

# Prompts used by the benchmarks; each one is a Python comment followed by a
# newline.
TEST_PROMPTS = [
    "# Write a function to calculate factorial\n",
    "# Create a class for a binary tree\n",
    "# Implement bubble sort\n",
    "# Check if a string is palindrome\n",
]

# Switches for the optional optimization stages.
RUN_8BIT_TEST = True
RUN_COMPILE_TEST = hasattr(torch, 'compile')  # only when this torch build has torch.compile
RUN_BATCH_TEST = True

print(f"Model path: {MODEL_PATH}")

In [None]:
print("Loading fine-tuned model from Google Drive...")

# Request offline mode so nothing is fetched from the Hugging Face Hub.
# NOTE(review): these variables are set after `transformers` has already been
# imported; the library may only read them at import time, in which case they
# have no effect here. Confirm, or pass local_files_only=True instead.
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"

# Start from a known state. Without this, a `model` left in the kernel by an
# earlier run would make the success check at the bottom pass even when both
# loading attempts below fail.
model = None
is_lora = False  # becomes True only once the LoRA adapter has actually been loaded

try:
    # Attempt 1: MODEL_PATH holds a complete (already merged) model.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

    print("Loaded fine-tuned model directly from Google Drive")

except Exception as e:
    print(f"Error loading model directly: {e}")
    print("Attempting to load as LoRA model by first loading the base model...")

    try:
        # Attempt 2: MODEL_PATH holds a LoRA adapter. Read the adapter config
        # from the local directory to find out which base model it extends.
        config_path = os.path.join(MODEL_PATH, "adapter_config.json")
        with open(config_path, "r") as f:
            config_data = json.load(f)

        base_model_name = config_data.get("base_model_name_or_path")
        if base_model_name is None:
            raise ValueError("Could not find 'base_model_name_or_path' in adapter_config.json")
        print(f"Base model: {base_model_name}")

        # Load base model from cache or local files
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        # Attach the LoRA weights stored in Drive to the base model.
        model = PeftModel.from_pretrained(base_model, MODEL_PATH)
        print("Loaded as LoRA model from Google Drive")
        is_lora = True

    except Exception as e_lora:
        print(f"Error loading as LoRA model: {e_lora}")
        print("Model loading failed using both direct and LoRA methods.")
        model = None
        is_lora = False


# The tokenizer is read from the same directory as the model / adapter.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
if tokenizer.pad_token is None:
    # Fall back to EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

if model is not None:
    print(f"Model loaded successfully! Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
else:
    # Later cells need `model`; they cannot run until loading succeeds.
    print("Model loading failed.")

In [None]:
def benchmark_inference(model, tokenizer, prompts, num_runs=3, max_tokens=100):
    """Time `model.generate` on one prompt or a list of prompts.

    A list is processed one prompt at a time (not as a padded batch); every
    prompt is generated `num_runs` times and each generation is timed
    separately, including tokenization.

    Args:
        model: Object exposing `generate(**inputs, ...)`. If it has a `device`
            attribute the inputs are moved there, otherwise to "cuda".
        tokenizer: Callable returning an encoding that supports `.to(device)`
            and contains "input_ids"; must expose `pad_token_id`.
        prompts: A single prompt string or a non-empty list of prompt strings.
        num_runs: How many times each prompt is generated (>= 1).
        max_tokens: Upper bound on newly generated tokens per call.

    Returns:
        dict with "avg_time" (seconds per generation), "tokens_per_sec"
        (mean new tokens / mean time), "total_tokens", "runs" (num_runs) and
        "memory_gb" (CUDA memory currently allocated; 0.0 without CUDA).

    Raises:
        ValueError: If `prompts` is an empty list or `num_runs` < 1.
    """
    prompt_list = prompts if isinstance(prompts, list) else [prompts]
    if not prompt_list:
        raise ValueError("prompts must contain at least one prompt")
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    device = getattr(model, "device", "cuda")
    # CUDA-only calls are skipped on machines without a GPU instead of raising.
    cuda_ok = torch.cuda.is_available()

    times = []
    tokens_generated = []

    # Warmup: one short untimed generation so one-off startup work is not
    # charged to the first timed run.
    warmup_inputs = tokenizer(prompt_list[0], return_tensors="pt").to(device)
    _ = model.generate(
        **warmup_inputs,
        max_new_tokens=20,
        pad_token_id=tokenizer.pad_token_id,
    )
    if cuda_ok:
        torch.cuda.synchronize()

    for _ in range(num_runs):
        for prompt in prompt_list:
            # perf_counter is monotonic and high resolution; time.time() can
            # jump when the wall clock is adjusted.
            start = time.perf_counter()
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )
            if cuda_ok:
                # Wait for queued GPU work so the measurement covers it.
                torch.cuda.synchronize()
            times.append(time.perf_counter() - start)
            # generate() returns prompt + continuation; count only the new part.
            tokens_generated.append(len(outputs[0]) - len(inputs['input_ids'][0]))

    avg_time = np.mean(times)
    avg_tokens = np.mean(tokens_generated)
    tokens_per_sec = avg_tokens / avg_time

    return {
        "avg_time": avg_time,
        "tokens_per_sec": tokens_per_sec,
        "total_tokens": sum(tokens_generated),
        "runs": num_runs,
        "memory_gb": torch.cuda.memory_allocated()/1024**3 if cuda_ok else 0.0
    }

def print_benchmark_results(name, results, baseline=None):
    """Print one benchmark result, optionally compared against a baseline.

    Args:
        name: Heading printed above the figures.
        results: dict with "avg_time", "tokens_per_sec" and "memory_gb".
        baseline: Optional dict with "tokens_per_sec" and "memory_gb". When
            given (and non-empty), a speedup line is printed, plus a memory
            reduction line if memory use went down.
    """
    print(f"\n{name}")
    print(f"  Average time: {results['avg_time']:.2f}s")
    print(f"  Tokens/sec: {results['tokens_per_sec']:.1f}")
    print(f"  Memory: {results['memory_gb']:.2f}GB")

    if not baseline:
        return

    # A zero baseline (e.g. memory_gb measured without a GPU) would otherwise
    # raise ZeroDivisionError; report "n/a" / skip the line instead.
    baseline_tps = baseline['tokens_per_sec']
    if baseline_tps > 0:
        speedup = results['tokens_per_sec'] / baseline_tps
        print(f"  Speedup: {speedup:.2f}x")
    else:
        print("  Speedup: n/a (baseline throughput is zero)")

    baseline_memory = baseline['memory_gb']
    if baseline_memory > 0:
        memory_reduction = (baseline_memory - results['memory_gb']) / baseline_memory * 100
        # Only reductions are reported; an increase prints nothing.
        if memory_reduction > 0:
            print(f"  Memory reduction: {memory_reduction:.1f}%")

In [None]:
# Benchmark prompts: each is a Python comment describing a task, ending in a
# newline. (Same four values as the list in the configuration cell.)
TEST_PROMPTS = [
    "# Write a function to calculate factorial\n",
    "# Create a class for a binary tree\n",
    "# Implement bubble sort\n",
    "# Check if a string is palindrome\n",
]

In [None]:
# Switches for the optional optimization stages (same values as the
# configuration cell). The compile stage is enabled only when this torch
# build provides torch.compile.
RUN_8BIT_TEST = True
RUN_COMPILE_TEST = hasattr(torch, 'compile')
RUN_BATCH_TEST = True

In [None]:
print("BASELINE PERFORMANCE TEST")
print("=" * 50)

# The first test prompt is the workload for this baseline; `single_prompt`
# is reused by the single-prompt benchmarks in the later cells.
single_prompt = TEST_PROMPTS[0]
print(f"Testing with: {single_prompt.strip()}")

baseline_results = benchmark_inference(model, tokenizer, single_prompt)
print_benchmark_results("Baseline Performance", baseline_results)

# Results registry keyed by optimization name; the baseline is its first entry.
all_results = dict(baseline=baseline_results)

In [None]:
# The merge stage only applies when the model was loaded as a LoRA adapter on
# top of a base model.
if not is_lora:
    print("\nModel already merged or not a LoRA model, skipping LoRA merge test")
else:
    print("\nOPTIMIZATION 1: Merge LoRA Weights")
    print("=" * 50)

    # Fold the adapter weights into the base model and drop the PEFT wrapper.
    print("Merging LoRA weights...")
    merged_model = model.merge_and_unload()

    # Measure the merged model against the baseline and record the result.
    merge_results = benchmark_inference(merged_model, tokenizer, single_prompt)
    print_benchmark_results("Merged Model", merge_results, baseline_results)
    all_results["merged"] = merge_results

    # From here on `model` refers to the merged model; the adapter wrapper is
    # released first and the flag is cleared so the merge is not repeated.
    del model
    torch.cuda.empty_cache()
    model = merged_model
    is_lora = False

In [None]:
if RUN_8BIT_TEST:
    print("\nOPTIMIZATION 2: 8-bit Inference")
    print("="*50)

    # The current model is written to disk and then reloaded with 8-bit
    # quantization applied at load time.
    save_path = "./temp_model"

    # Revert from BetterTransformer before saving, but only if the model was
    # actually converted. Checking hasattr(model, 'reverse_bettertransformer')
    # is not enough: the method can exist on a model that was never converted,
    # and calling it then is expected to raise.
    # NOTE(review): relies on Optimum setting `use_bettertransformer` on
    # converted models - confirm against the installed Optimum version.
    if getattr(model, "use_bettertransformer", False):
        model = model.reverse_bettertransformer()
        print("Reverted model from BetterTransformer state.")

    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Free the fp16 model before loading the quantized copy.
    del model
    torch.cuda.empty_cache()
    gc.collect()

    # Reload in 8-bit
    print("Reloading model in 8-bit...")
    # `bnb_8bit_compute_dtype` is not a BitsAndBytesConfig option (the compute
    # dtype setting exists only for 4-bit), so it is no longer passed.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    model_8bit = AutoModelForCausalLM.from_pretrained(
        save_path,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16
    )

    # Benchmark 8-bit model
    bit8_results = benchmark_inference(model_8bit, tokenizer, single_prompt)
    print_benchmark_results("8-bit Model", bit8_results, baseline_results)
    all_results["8bit"] = bit8_results

    # Later cells continue with the 8-bit model.
    model = model_8bit

In [None]:
import os

# Sending signal 9 to the kernel's own process kills it immediately and wipes
# every variable (`model`, `tokenizer`, `baseline_results`, `all_results`, ...)
# that the following cells read, so a top-to-bottom run cannot continue past
# this cell. The kill is therefore opt-in: set the flag to True only when a
# hard restart is really wanted, then re-run the notebook from the top.
FORCE_KERNEL_RESTART = False

if FORCE_KERNEL_RESTART:
    os.kill(os.getpid(), 9)

In [None]:
print("\nOPTIMIZATION 3: Better Transformers")
print("=" * 50)

# Conversion, benchmarking and the hand-over to `model` share one try block:
# a failure in any of these steps is reported as "not supported" and `model`
# is left as it was.
try:
    model_bt = model.to_bettertransformer()

    bt_results = benchmark_inference(model_bt, tokenizer, single_prompt)
    print_benchmark_results("BetterTransformer", bt_results, baseline_results)
    all_results["better_transformer"] = bt_results

    model = model_bt
    print("Using BetterTransformer for remaining tests")

except Exception as exc:
    # Only the first 100 characters of the error text are shown.
    reason = str(exc)[:100]
    print(f"BetterTransformer not supported: {reason}")

In [None]:
# Runs only when the stage is enabled AND this torch build has torch.compile
# (the flag may have been switched on by hand on an older torch).
if RUN_COMPILE_TEST and hasattr(torch, 'compile'):
    print("\nOPTIMIZATION 4: Torch Compile")
    print("="*50)

    print("Compiling model (this takes 2-3 minutes)...")
    # NOTE(review): `.generate` on the wrapper returned by torch.compile is
    # likely resolved on the original module, whose forward is not the
    # compiled one - in that case this stage measures the uncompiled model.
    # Confirm, and consider compiling `model.forward` instead.
    model_compiled = torch.compile(model, mode="reduce-overhead")

    # Warmup compilation. The full encoding is passed (attention mask
    # included) together with pad_token_id, matching the benchmarked calls;
    # passing bare input_ids leaves generate() without an attention mask.
    print("Warming up...")
    warmup_inputs = tokenizer("# test", return_tensors="pt").to("cuda")
    _ = model_compiled.generate(
        **warmup_inputs,
        max_new_tokens=10,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Benchmark
    compile_results = benchmark_inference(model_compiled, tokenizer, single_prompt)
    print_benchmark_results("Compiled Model", compile_results, baseline_results)
    all_results["compiled"] = compile_results

    # Later cells continue with the compiled wrapper.
    model = model_compiled
else:
    print("\nSkipping Torch Compile (not available or disabled)")

In [None]:
print("\nOPTIMIZATION 5: Optimized Generation Config")
print("="*50)

# Generation settings for this stage: low-temperature nucleus sampling with
# the KV cache enabled, capped at 100 new tokens.
gen_config = GenerationConfig(
    max_new_tokens=100,
    temperature=0.1,
    do_sample=True,
    top_p=0.95,
    use_cache=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # A single beam disables beam search. Together with do_sample=True this
    # is sampling, not greedy decoding. `early_stopping` is a beam-search
    # option and is inert with one beam, so it is not set.
    num_beams=1,
)

# Custom benchmark with generation config
def benchmark_with_config(model, tokenizer, prompt, config, num_runs=3):
    """Time `model.generate` for one prompt using an explicit generation config.

    Each run is timed end to end, tokenization included. Throughput is based
    on the number of tokens actually generated, which can be lower than the
    configured maximum when generation stops early.

    Args:
        model: Object exposing `generate(**inputs, generation_config=...)`.
            If it has a `device` attribute the inputs are moved there,
            otherwise to "cuda".
        tokenizer: Callable returning an encoding that supports `.to(device)`
            and contains "input_ids".
        prompt: Prompt string to generate from.
        config: Generation config forwarded unchanged to `generate`.
        num_runs: Number of timed generations (>= 1).

    Returns:
        dict with "avg_time" (seconds per run), "tokens_per_sec" (mean new
        tokens / mean time) and "memory_gb" (CUDA memory currently
        allocated; 0.0 without CUDA).

    Raises:
        ValueError: If `num_runs` < 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    device = getattr(model, "device", "cuda")
    # CUDA-only calls are skipped on machines without a GPU instead of raising.
    cuda_ok = torch.cuda.is_available()

    times = []
    new_token_counts = []

    for _ in range(num_runs):
        start = time.perf_counter()
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.inference_mode():
            outputs = model.generate(**inputs, generation_config=config)

        if cuda_ok:
            # Wait for queued GPU work so the measurement covers it.
            torch.cuda.synchronize()
        times.append(time.perf_counter() - start)
        # generate() returns prompt + continuation; count only the new part
        # rather than assuming a fixed number of tokens was produced.
        new_token_counts.append(len(outputs[0]) - len(inputs["input_ids"][0]))

    avg_time = np.mean(times)

    return {
        "avg_time": avg_time,
        "tokens_per_sec": np.mean(new_token_counts) / avg_time,
        "memory_gb": torch.cuda.memory_allocated()/1024**3 if cuda_ok else 0.0
    }

# Benchmark the shared single prompt with the explicit generation config,
# print it against the baseline, and record it under "optimized_config".
config_results = benchmark_with_config(
    model,
    tokenizer,
    prompt=single_prompt,
    config=gen_config,
)
print_benchmark_results("Optimized Config", config_results, baseline=baseline_results)
all_results["optimized_config"] = config_results

In [None]:
if RUN_BATCH_TEST:
    print("\nOPTIMIZATION 6: Batch Processing")
    print("="*50)

    batch_sizes = [1, 2, 4]

    # A causal LM continues each row from its last position, so prompts of
    # different lengths must be padded on the left; with right padding the
    # model would continue from pad tokens. The tokenizer's own setting is
    # restored afterwards.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"

    try:
        for batch_size in batch_sizes:
            # Skip sizes for which there are not enough test prompts.
            if batch_size > len(TEST_PROMPTS):
                continue

            print(f"\nTesting batch size: {batch_size}")

            # Create batch
            batch_prompts = TEST_PROMPTS[:batch_size]

            # Time tokenization + generation for the whole batch.
            start = time.perf_counter()
            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to("cuda")

            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    temperature=0.1,
                    do_sample=True,
                    pad_token_id=tokenizer.pad_token_id
                )

            torch.cuda.synchronize()
            batch_time = time.perf_counter() - start

            # Count the tokens actually generated instead of assuming 100 per
            # prompt: rows that stop early are filled with pad tokens, which
            # are excluded. When the pad token is the EOS token, the final
            # EOS of a finished row is excluded as well.
            prompt_length = inputs["input_ids"].shape[1]
            generated = outputs[:, prompt_length:]
            total_tokens = int((generated != tokenizer.pad_token_id).sum().item())
            tokens_per_sec = total_tokens / batch_time

            print(f"  Total time: {batch_time:.2f}s")
            print(f"  Time per prompt: {batch_time/batch_size:.2f}s")
            print(f"  Tokens/sec (total): {tokens_per_sec:.1f}")
            # Throughput relative to the single-prompt baseline - the same
            # definition the summary table uses for every entry.
            print(f"  Speedup: {tokens_per_sec/baseline_results['tokens_per_sec']:.2f}x")

            all_results[f"batch_{batch_size}"] = {
                "tokens_per_sec": tokens_per_sec,
                "time_per_prompt": batch_time/batch_size
            }
    finally:
        tokenizer.padding_side = original_padding_side

In [None]:
print("\nOPTIMIZATION SUMMARY")
print("="*60)

# Comparison table: one row per recorded result that has a throughput figure.
print(f"\n{'Method':<25} {'Tokens/sec':<12} {'Speedup':<10} {'Memory (GB)':<12}")
print("-"*60)

baseline_tps = baseline_results['tokens_per_sec']

for name, results in all_results.items():
    if isinstance(results, dict) and 'tokens_per_sec' in results:
        tps = results['tokens_per_sec']
        speedup = tps / baseline_tps
        # Batch entries carry no memory figure and show "N/A".
        memory = results.get('memory_gb', 'N/A')

        if isinstance(memory, float):
            memory_str = f"{memory:.2f}"
        else:
            memory_str = str(memory)

        # Attach the "x" before padding so the memory column stays under its
        # header; padding the number first pushes the "x" past the column.
        speedup_str = f"{speedup:.2f}x"
        print(f"{name:<25} {tps:<12.1f} {speedup_str:<10} {memory_str:<12}")

# Best configuration among the single-prompt results (batch entries excluded).
best_single = max(
    [(k, v) for k, v in all_results.items() if not k.startswith('batch')],
    key=lambda x: x[1].get('tokens_per_sec', 0)
)

print(f"\nBest single-prompt optimization: {best_single[0]}")
print(f"   Speed: {best_single[1]['tokens_per_sec']:.1f} tokens/sec")
print(f"   Speedup: {best_single[1]['tokens_per_sec']/baseline_tps:.2f}x")