# T4-OPT: Quantize Model

This notebook demonstrates model quantization after training.

## Steps:
1. Load trained model with LoRA
2. Merge LoRA adapters
3. Quantize to INT8, and optionally to AWQ (4-bit)
4. Export quantized model


In [None]:
import sys
sys.path.append('/content/t4opt')

from quant.merge_lora import merge_lora_weights
from quant.quant_int8 import quantize_to_int8
from quant.quant_awq import quantize_to_awq
from utils.memory import MemoryManager
from utils.checkpoint_utils import print_checkpoint_info, check_drive_checkpoints

# Optional: Mount Google Drive if checkpoints are saved there
# from google.colab import drive
# drive.mount('/content/drive')

# Baseline memory report before the merge/quantization cells below run.
# NOTE(review): the exact contents of the summary are defined by
# utils.memory.MemoryManager — confirm there if the output looks unexpected.
MemoryManager.print_memory_summary()


In [None]:
# Paths (adjust based on your training output)
# IMPORTANT: If you saved to Google Drive, use Drive paths!
# Example: lora_path = "/content/drive/MyDrive/t4opt_checkpoints/phi-2-qlora"

# Base model identifier, LoRA adapter directory, and destination for the
# merged model. These three names are read by the cells further down.
base_model_path = "microsoft/phi-2"
lora_path = "./checkpoints/phi-2-qlora"  # ⚠️ Check that this still exists after a session crash!
merged_output = "./merged_models/phi-2-merged"

# Check if checkpoints exist at the configured local path
print("Checking for checkpoints...")
print_checkpoint_info(lora_path)

# Also check Drive if mounted. The helper returns a mapping with at least
# "drive_mounted" and "checkpoints"; "checkpoints" is itself a mapping whose
# keys are used below as directory names under t4opt_checkpoints.
drive_info = check_drive_checkpoints()
if drive_info["drive_mounted"] and drive_info["checkpoints"]:
    print("\n💡 Found checkpoints in Drive! Update lora_path to use Drive path.")
    # Iterating a mapping yields its keys, so no explicit .keys() is needed.
    for name in drive_info["checkpoints"]:
        print(f"   Example: lora_path = '/content/drive/MyDrive/t4opt_checkpoints/{name}'")

# Echo the effective configuration so it is visible in the cell output
print(f"\nBase model: {base_model_path}")
print(f"LoRA path: {lora_path}")
print(f"Merged output: {merged_output}")


In [None]:
# Merge step: combine the LoRA adapter at `lora_path` with the base model.
# `merged_output` is handed to the helper as its output_path; the later
# quantization cells read the merged model from that same location.
merge_kwargs = dict(
    base_model_path=base_model_path,
    lora_path=lora_path,
    output_path=merged_output,
)
merge_result = merge_lora_weights(**merge_kwargs)

# Size figure as reported by the merge helper
print(f"Merged model size: {merge_result['model_size_mb']:.2f} MB")


In [None]:
# INT8 quantization of the merged model produced by the previous cell.
# The destination is passed to the helper through its `context` mapping.
int8_output_path = "./quantized_models/phi-2-int8"
int8_result = quantize_to_int8(model_path=merged_output, context={"output_path": int8_output_path})

# Before/after sizes and the relative reduction, as reported by the helper
print(f"Original size: {int8_result['original_size_mb']:.2f} MB")
print(f"Quantized size: {int8_result['quantized_size_mb']:.2f} MB")
print(f"Size reduction: {int8_result['size_reduction_percent']:.2f}%")


In [None]:
# Optional: Quantize to AWQ (4-bit)
# Note: Requires autoawq library
#
# This step is best-effort: a failure is reported and the notebook carries on.
# `awq_result` is reset to None first so that, after a failure, it is neither
# undefined nor a stale value left over from an earlier run of this cell.
awq_result = None
try:
    awq_result = quantize_to_awq(
        model_path=merged_output,
        context={"output_path": "./quantized_models/phi-2-awq"}
    )
except Exception as e:
    # Deliberately broad: any failure in this optional step (missing library,
    # unsupported model, out-of-memory, ...) should not abort the notebook.
    print(f"AWQ quantization failed: {type(e).__name__}: {e}")
    # The previous message claimed an NF4 fallback was being used, but this
    # cell does not run one; say what actually happened instead.
    print("No fallback quantization is run by this cell; awq_result is None")
else:
    # Only reached when quantize_to_awq returned without raising
    print("AWQ quantization complete")
