# Task 10.7 Solution: Ollama Integration

**Module:** 10 - Large Language Model Fine-Tuning  
**Type:** Solution Notebook

---

This notebook contains complete solutions and working code for deploying a fine-tuned model to Ollama.

## Solution Overview

The complete workflow for deploying a fine-tuned model to Ollama:

1. **Merge LoRA weights** with the base model
2. **Convert to GGUF** format with appropriate quantization
3. **Create Modelfile** with correct chat template
4. **Import to Ollama** and verify
5. **Test and benchmark** the deployed model

In [None]:
# Imports
import os
import json
import subprocess
import time
import gc
from pathlib import Path
from typing import Optional, Dict, List

# Check for optional dependencies
try:
    import torch
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False
    print("PyTorch not available - merge functions will not work")

try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False
    print("Requests not available - Ollama client will not work")

print(f"PyTorch available: {HAS_TORCH}")
print(f"Requests available: {HAS_REQUESTS}")

## Solution 1: Complete LoRA Weight Merging

A production-ready function for merging LoRA adapters with base models.

In [None]:
def merge_lora_complete(
    base_model_path: str,
    adapter_path: str,
    output_path: str,
    torch_dtype: str = "float16",
    device_map: str = "auto",
    safe_serialization: bool = True,
) -> str:
    """
    Merge a LoRA adapter into its base model and save the merged weights.

    The base model and tokenizer are loaded from ``base_model_path``, the
    adapter from ``adapter_path``; the merged model and the tokenizer are
    written to ``output_path`` (created if missing).

    Args:
        base_model_path: HuggingFace model ID or local path
        adapter_path: Path to LoRA adapter directory
        output_path: Where to save the merged model
        torch_dtype: Data type for loading (float16, bfloat16, float32).
            Any other value falls back to float16 with a printed warning.
        device_map: Device mapping strategy
        safe_serialization: Use safetensors format

    Returns:
        Path to merged model

    Raises:
        RuntimeError: If PyTorch is not available.
    """
    if not HAS_TORCH:
        raise RuntimeError("PyTorch required for merging")

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    # Map string to torch dtype
    dtype_map = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }
    if torch_dtype not in dtype_map:
        # Keep the lenient fallback, but do not let it happen silently
        print(f"Warning: unknown torch_dtype '{torch_dtype}', falling back to float16")
    dtype = dtype_map.get(torch_dtype, torch.float16)

    print(f"\n{'='*60}")
    print("LoRA Weight Merging")
    print(f"{'='*60}")
    print(f"Base model: {base_model_path}")
    print(f"Adapter: {adapter_path}")
    print(f"Output: {output_path}")
    print(f"Dtype: {torch_dtype}")

    # Step 1: Load base model
    print("\n[1/5] Loading base model...")
    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code at load time - only use with sources you trust.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=dtype,
        device_map=device_map,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    param_count = sum(p.numel() for p in base_model.parameters())
    print(f"      Loaded {param_count:,} parameters")

    # Step 2: Load tokenizer
    print("\n[2/5] Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    print(f"      Vocab size: {tokenizer.vocab_size:,}")

    # Step 3: Load adapter
    print("\n[3/5] Loading LoRA adapter...")
    model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        torch_dtype=dtype,
    )

    # Count adapter parameters (identified by "lora" in the parameter name)
    adapter_params = sum(
        p.numel() for n, p in model.named_parameters()
        if "lora" in n.lower()
    )
    print(f"      Adapter parameters: {adapter_params:,}")
    if param_count:
        # Guard the ratio so an empty parameter list cannot divide by zero
        print(f"      Adapter ratio: {adapter_params/param_count*100:.2f}%")

    # Step 4: Merge weights
    print("\n[4/5] Merging weights...")
    model = model.merge_and_unload()
    print("      Merge complete")

    # Step 5: Save
    print("\n[5/5] Saving merged model...")
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    model.save_pretrained(
        output_path,
        safe_serialization=safe_serialization,
    )
    tokenizer.save_pretrained(output_path)

    # Calculate total size of everything written under the output directory
    total_size = sum(
        f.stat().st_size for f in output_path.glob('**/*')
        if f.is_file()
    )
    print(f"      Saved to: {output_path}")
    print(f"      Total size: {total_size / 1e9:.2f} GB")

    # Cleanup: drop model references before asking CUDA to free its cache
    del model, base_model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"\n{'='*60}")
    print("Merge complete!")
    print(f"{'='*60}")

    return str(output_path)

# Show a copy-pasteable usage snippet for the merge helper
print("\n".join([
    "Merge function defined. Usage:",
    "  merged_path = merge_lora_complete(",
    "      'meta-llama/Llama-3.1-8B-Instruct',",
    "      './my-lora-adapter',",
    "      './merged-model'",
    "  )",
]))

## Solution 2: GGUF Conversion with Validation

In [None]:
def convert_to_gguf_complete(
    model_path: str,
    output_path: str,
    quantization: str = "Q5_K_M",
    llama_cpp_path: str = "./llama.cpp",
    validate: bool = True,
) -> Optional[str]:
    """
    Convert a HuggingFace-format model to GGUF, optionally quantized.

    Runs llama.cpp's ``convert_hf_to_gguf.py`` to produce an FP16 GGUF, then
    ``llama-quantize`` unless ``quantization`` is F16. The intermediate FP16
    file is removed after a successful quantization.

    Args:
        model_path: Path to HuggingFace format model
        output_path: Output path for GGUF file
        quantization: Quantization level (F16, Q8_0, Q5_K_M, Q4_K_M, etc.)
        llama_cpp_path: Path to llama.cpp repository
        validate: Whether to validate the output file

    Returns:
        Path to the GGUF file, or None on failure. When quantization is F16,
        the quantize binary is missing, or quantization fails, the returned
        path is the FP16 file (``<output stem>-f16<ext>``), not ``output_path``.
    """
    # Local import: sys is not among this file's top-level imports
    import sys

    print(f"\n{'='*60}")
    print("GGUF Conversion")
    print(f"{'='*60}")
    print(f"Input: {model_path}")
    print(f"Output: {output_path}")
    print(f"Quantization: {quantization}")

    # Validate llama.cpp installation
    llama_cpp_dir = Path(llama_cpp_path)
    convert_script = llama_cpp_dir / "convert_hf_to_gguf.py"
    quantize_bin = llama_cpp_dir / "llama-quantize"

    if not convert_script.exists():
        print(f"\nError: Conversion script not found at {convert_script}")
        print("Clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp")
        return None

    # Step 1: Convert to FP16 GGUF
    print("\n[1/3] Converting to FP16 GGUF...")
    # Derive the intermediate name from the final extension only, so it can
    # never equal output_path (which would make the cleanup below delete the
    # final file) and never rewrites ".gguf" inside a directory name.
    stem, ext = os.path.splitext(output_path)
    fp16_path = f"{stem}-f16{ext or '.gguf'}"

    # Use the running interpreter so the script sees this environment's packages
    cmd = [
        sys.executable, str(convert_script),
        model_path,
        "--outfile", fp16_path,
        "--outtype", "f16",
    ]

    print(f"      Running: {' '.join(cmd[:4])}...")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"      Error: {result.stderr[:500]}")
        return None

    if not os.path.exists(fp16_path):
        # The script reported success but produced nothing to work with
        print(f"      Error: conversion produced no file at {fp16_path}")
        return None

    fp16_size = os.path.getsize(fp16_path) / 1e9
    print(f"      FP16 GGUF created: {fp16_size:.2f} GB")

    # Step 2: Quantize (if not FP16)
    if quantization.upper() != "F16":
        print(f"\n[2/3] Quantizing to {quantization}...")

        # Makefile builds leave the binary in the repo root, CMake builds in build/bin
        quantize_candidates = (
            quantize_bin,
            llama_cpp_dir / "build" / "bin" / "llama-quantize",
        )
        quantize_exe = next((c for c in quantize_candidates if c.exists()), None)

        if quantize_exe is None:
            print(f"      Warning: Quantize binary not found at {quantize_bin}")
            print(f"      Returning FP16 version instead")
            return fp16_path

        cmd = [str(quantize_exe), fp16_path, output_path, quantization]
        print(f"      Running: llama-quantize ...")
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"      Error: {result.stderr[:500]}")
            return fp16_path

        if not os.path.exists(output_path):
            # Keep the FP16 file rather than deleting the only usable output
            print(f"      Error: quantization produced no file at {output_path}")
            return fp16_path

        # Cleanup FP16
        os.remove(fp16_path)
        final_path = output_path
    else:
        final_path = fp16_path

    final_size = os.path.getsize(final_path) / 1e9
    print(f"      Final GGUF: {final_size:.2f} GB")

    # Step 3: Validate
    if validate:
        print("\n[3/3] Validating GGUF file...")

        # Check file header: the file must begin with the 4 bytes b'GGUF'
        with open(final_path, 'rb') as f:
            magic = f.read(4)
            if magic == b'GGUF':
                print("      ✓ Valid GGUF header")
            else:
                print(f"      ✗ Invalid header: {magic}")
                return None

        # Check file size is reasonable
        if final_size < 0.1:  # Less than 100MB
            print(f"      ✗ File suspiciously small")
            return None

        print("      ✓ File size reasonable")

    print(f"\n{'='*60}")
    print(f"Conversion complete: {final_path}")
    print(f"{'='*60}")

    return final_path

# Confirmation message shown when this cell has run
print("GGUF conversion function defined.")

## Solution 3: Complete Modelfile Generation

In [None]:
def create_modelfile_complete(
    gguf_path: str,
    model_name: str,
    output_path: str = "./Modelfile",
    model_family: str = "llama3",
    system_prompt: Optional[str] = None,
    temperature: float = 0.7,
    context_length: int = 4096,
) -> str:
    """
    Write an Ollama Modelfile that imports a GGUF file.

    The chat template and the stop tokens are chosen per model family;
    families without their own template use the llama3 template and its
    stop tokens. The parent directory of ``output_path`` is created if needed.

    Args:
        gguf_path: Path to GGUF file (written to the Modelfile as an absolute path)
        model_name: Name for the model in Ollama
        output_path: Where to save Modelfile
        model_family: Template family (llama3, mistral, etc.)
        system_prompt: Custom system prompt; a per-family default is used when None
        temperature: Default temperature
        context_length: Context window size

    Returns:
        Path to created Modelfile
    """
    gguf_abs_path = os.path.abspath(gguf_path)

    # Default system prompts
    default_prompts = {
        "llama3": "You are a helpful AI assistant.",
        "mistral": "You are a helpful assistant.",
        "code": "You are an expert programmer. Provide clear, working code with explanations.",
    }

    if system_prompt is None:
        system_prompt = default_prompts.get(model_family, default_prompts["llama3"])

    # Chat templates by model family
    templates = {
        "llama3": '''{{ if .System }}<|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>''',

        "mistral": '''[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST] {{ .Response }}''',
    }

    # Stop tokens must match the template's own markers; the Llama 3 tokens
    # never appear in Mistral-formatted output.
    stop_tokens = {
        "llama3": ["<|eot_id|>", "<|end_of_text|>"],
        "mistral": ["[INST]", "[/INST]"],
    }

    # Resolve once so template and stop tokens always come from the same family
    template_family = model_family if model_family in templates else "llama3"
    template = templates[template_family]
    stop_lines = "\n".join(
        f'PARAMETER stop "{token}"' for token in stop_tokens[template_family]
    )

    # Build Modelfile
    modelfile_content = f'''# Modelfile for {model_name}
# Generated by Ollama Integration Solution
# Model family: {model_family}

# Base model from GGUF file
FROM {gguf_abs_path}

# Model parameters
PARAMETER temperature {temperature}
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER num_ctx {context_length}
PARAMETER num_predict 512
{stop_lines}

# System prompt
SYSTEM """{system_prompt}"""

# Chat template
TEMPLATE """{template}"""

# License
LICENSE """This model is a fine-tuned version of {model_family.title()}.
Please respect the original model's license terms."""
'''

    # Save (explicit encoding so the result does not depend on the locale)
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(modelfile_content)

    print(f"Modelfile created: {output_path}")
    print(f"\nTo import to Ollama:")
    print(f"  ollama create {model_name} -f {output_path}")

    return output_path

# Example: generate a Modelfile, then actually display what was written
# (the header below promises the content, so print it after creation)
example_modelfile_path = create_modelfile_complete(
    "./model.gguf",
    "my-finetuned-model",
    "./example-Modelfile",
    system_prompt="You are an expert AI educator."
)

print("\nExample Modelfile content:")
print("="*50)
print(Path(example_modelfile_path).read_text(encoding="utf-8"))

## Solution 4: Complete Ollama Client with Error Handling

In [None]:
class OllamaClientComplete:
    """
    Complete Ollama client with robust error handling.

    HTTP calls use the module-level ``requests`` import (guarded by
    ``HAS_REQUESTS``); model import shells out to the ``ollama`` CLI.
    Request methods report failures through their return value instead
    of raising.

    Attributes:
        base_url: Server root URL, stored without a trailing slash.
        timeout: Timeout in seconds applied to API requests.
        connected: Result of the connectivity probe run at construction.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:11434",
        timeout: int = 120,
    ):
        # Strip a trailing slash so f"{base_url}/api/..." never yields "//api"
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.connected = self._verify_connection()

    @staticmethod
    def _failure(error: str, model: str, start_time: float) -> Dict:
        """Build a failure result with the same keys as a successful one."""
        return {
            "response": "",
            "tokens": 0,
            "duration": time.perf_counter() - start_time,
            "model": model,
            "error": error,
            "success": False,
        }

    def _verify_connection(self) -> bool:
        """Verify Ollama server is running; prints a warning and returns False otherwise."""
        if not HAS_REQUESTS:
            print("Warning: requests library not available")
            return False

        try:
            response = requests.get(
                f"{self.base_url}/api/tags",
                timeout=5
            )
        except requests.exceptions.ConnectionError:
            print(f"Warning: Cannot connect to Ollama at {self.base_url}")
            print("Start Ollama with: ollama serve")
        except Exception as e:
            print(f"Warning: {e}")
        else:
            if response.status_code == 200:
                print(f"Connected to Ollama at {self.base_url}")
                return True
            # Something answered, but not with a healthy Ollama response
            print(
                f"Warning: Ollama at {self.base_url} returned "
                f"HTTP {response.status_code}"
            )
        return False

    def list_models(self) -> List[Dict]:
        """List all available models; returns an empty list on any error."""
        try:
            response = requests.get(
                f"{self.base_url}/api/tags",
                timeout=self.timeout
            )
            response.raise_for_status()
            return response.json().get('models', [])
        except Exception as e:
            print(f"Error listing models: {e}")
            return []

    def generate(
        self,
        model: str,
        prompt: str,
        system: Optional[str] = None,
        **kwargs,
    ) -> Dict:
        """
        Generate a response.

        Args:
            model: Name of the model to query
            prompt: Prompt text
            system: Optional system prompt (omitted from the request when empty)
            **kwargs: Extra fields merged into the request payload as-is

        Returns:
            Dict with 'response', 'tokens', 'duration', 'model' and 'success'
            keys; failures additionally carry 'error' and have success=False.
        """
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            **kwargs,
        }
        if system:
            payload["system"] = system

        # perf_counter is monotonic, so durations are immune to clock changes
        start_time = time.perf_counter()

        if not HAS_REQUESTS:
            # Without this guard the except clause below would itself raise
            # NameError while looking up requests.exceptions
            return self._failure("requests library not available", model, start_time)

        try:
            response = requests.post(
                f"{self.base_url}/api/generate",
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()

            return {
                "response": data.get('response', ''),
                "tokens": data.get('eval_count', 0),
                "duration": time.perf_counter() - start_time,
                "model": model,
                "success": True,
            }
        except requests.exceptions.Timeout:
            return self._failure(f"Timeout after {self.timeout}s", model, start_time)
        except Exception as e:
            return self._failure(str(e), model, start_time)

    def chat(
        self,
        model: str,
        messages: List[Dict],
        **kwargs,
    ) -> Dict:
        """
        Chat with conversation history.

        Args:
            model: Name of the model to query
            messages: Conversation messages sent as the request's "messages" field
            **kwargs: Extra fields merged into the request payload as-is

        Returns:
            Dict with the same keys as ``generate``.
        """
        payload = {
            "model": model,
            "messages": messages,
            "stream": False,
            **kwargs,
        }

        start_time = time.perf_counter()

        if not HAS_REQUESTS:
            return self._failure("requests library not available", model, start_time)

        try:
            response = requests.post(
                f"{self.base_url}/api/chat",
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()

            return {
                "response": data.get('message', {}).get('content', ''),
                "tokens": data.get('eval_count', 0),
                "duration": time.perf_counter() - start_time,
                "model": model,
                "success": True,
            }
        except Exception as e:
            return self._failure(str(e), model, start_time)

    def import_model(
        self,
        model_name: str,
        modelfile_path: str,
    ) -> bool:
        """Import a model from Modelfile via ``ollama create``; True on success."""
        cmd = ["ollama", "create", model_name, "-f", modelfile_path]
        print(f"Running: {' '.join(cmd)}")

        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except FileNotFoundError:
            # The CLI is not installed or not on PATH: report it, don't crash
            print("Error: 'ollama' executable not found on PATH")
            return False

        if result.returncode == 0:
            print(f"Successfully imported: {model_name}")
            return True
        else:
            print(f"Error: {result.stderr}")
            return False

# Instantiate the client only when its HTTP dependency is importable
# (the constructor prints a warning itself if the server is unreachable)
if not HAS_REQUESTS:
    print("Ollama client requires 'requests' package")
else:
    client = OllamaClientComplete()

    # Show at most the first five models the server reports
    models = client.list_models()
    if models:
        print(f"\nAvailable models: {len(models)}")
        for m in models[:5]:
            print(f"  - {m['name']}")

## Solution 5: Complete Benchmarking Suite

In [None]:
def benchmark_model_complete(
    model_name: str,
    client: 'OllamaClientComplete',
    prompts: Optional[List[str]] = None,
    num_runs: int = 3,
    warmup: bool = True,
) -> Dict:
    """
    Benchmark a model by timing repeated generations and summarising them.

    Every prompt is sent ``num_runs`` times through ``client.generate``.
    Successful calls contribute to the timing statistics; failed calls are
    counted and their error messages kept, so a partly failing benchmark is
    visible in the output instead of being silently averaged away.

    Args:
        model_name: Model to benchmark
        client: OllamaClient instance (anything with a compatible ``generate``)
        prompts: Test prompts (uses defaults if None)
        num_runs: Number of runs per prompt
        warmup: Whether to do a warmup run first

    Returns:
        Dictionary with 'model', 'prompts', 'runs', the per-call lists
        'times', 'tokens' and 'tps', plus 'failures' and 'errors'. A 'stats'
        entry is added only when at least one call succeeded.
    """
    import statistics

    if prompts is None:
        prompts = [
            "What is machine learning?",
            "Explain the difference between AI and ML.",
            "Write a Python function to sort a list.",
            "What are the benefits of neural networks?",
        ]

    print(f"\n{'='*60}")
    print(f"Benchmarking: {model_name}")
    print(f"{'='*60}")
    print(f"Prompts: {len(prompts)}")
    print(f"Runs per prompt: {num_runs}")

    # Warmup (result is not measured, but a failure here is worth reporting)
    if warmup:
        print("\nWarmup run...")
        warmup_result = client.generate(model_name, "Hello")
        if not warmup_result.get("success"):
            print(f"  Warmup failed: {warmup_result.get('error', 'unknown error')}")

    # Collect results
    results = {
        "model": model_name,
        "prompts": len(prompts),
        "runs": num_runs,
        "times": [],
        "tokens": [],
        "tps": [],  # tokens per second
        "failures": 0,
        "errors": [],
    }

    print("\nRunning benchmark...")
    for run in range(num_runs):
        print(f"  Run {run + 1}/{num_runs}")
        for prompt in prompts:
            result = client.generate(model_name, prompt)

            if not result.get("success"):
                results["failures"] += 1
                results["errors"].append(result.get("error", "unknown error"))
                continue

            results["times"].append(result["duration"])
            results["tokens"].append(result["tokens"])
            # Skip the rate for zero-length durations to avoid dividing by zero
            if result["duration"] > 0:
                results["tps"].append(
                    result["tokens"] / result["duration"]
                )

    if results["failures"]:
        total_calls = results["failures"] + len(results["times"])
        print(f"\nFailed calls: {results['failures']}/{total_calls}")

    # Calculate statistics
    if results["times"]:
        results["stats"] = {
            "avg_time": statistics.mean(results["times"]),
            # stdev needs at least two samples
            "std_time": statistics.stdev(results["times"]) if len(results["times"]) > 1 else 0,
            "min_time": min(results["times"]),
            "max_time": max(results["times"]),
            "avg_tokens": statistics.mean(results["tokens"]),
            "avg_tps": statistics.mean(results["tps"]) if results["tps"] else 0,
        }

        print(f"\n{'='*60}")
        print("Results")
        print(f"{'='*60}")
        print(f"Average response time: {results['stats']['avg_time']:.2f}s")
        print(f"Std deviation: {results['stats']['std_time']:.2f}s")
        print(f"Range: {results['stats']['min_time']:.2f}s - {results['stats']['max_time']:.2f}s")
        print(f"Average tokens: {results['stats']['avg_tokens']:.0f}")
        print(f"Tokens/second: {results['stats']['avg_tps']:.1f}")
    else:
        print("\nNo successful runs - check model and connection")

    return results

# Confirm the definition and show a one-line usage example
print("\n".join([
    "Benchmark function defined.",
    "",
    "Usage:",
    "  results = benchmark_model_complete('my-model', client)",
]))

## Solution 6: Complete Deployment Pipeline

In [None]:
def deploy_to_ollama(
    base_model: str,
    adapter_path: str,
    model_name: str,
    quantization: str = "Q5_K_M",
    work_dir: str = "./ollama_deploy",
    llama_cpp_path: str = "./llama.cpp",
    cleanup_intermediate: bool = True,
    model_family: str = "llama3",
    system_prompt: Optional[str] = None,
) -> bool:
    """
    Complete end-to-end deployment pipeline.

    Runs, in order: LoRA merge, GGUF conversion, Modelfile creation,
    ``ollama create`` and an ``ollama list`` check. Any exception raised by a
    step is caught, reported, and turned into a False return value.

    Args:
        base_model: HuggingFace model ID
        adapter_path: Path to LoRA adapter
        model_name: Name for Ollama model
        quantization: GGUF quantization level
        work_dir: Working directory for intermediate files
        llama_cpp_path: Path to llama.cpp
        cleanup_intermediate: Delete the merged-model directory after a
            successful deployment
        model_family: Chat template family passed to the Modelfile generator
        system_prompt: Custom system prompt for the Modelfile (family default
            when None)

    Returns:
        True if successful, False otherwise
    """
    work_dir = Path(work_dir)
    work_dir.mkdir(parents=True, exist_ok=True)

    # Ollama names may contain "/" (namespace) and ":" (tag), neither of
    # which is safe inside a single file name.
    safe_file_stem = model_name.replace("/", "_").replace(":", "_")

    merged_path = work_dir / "merged"
    gguf_path = work_dir / f"{safe_file_stem}.gguf"
    modelfile_path = work_dir / "Modelfile"

    print(f"\n{'#'*60}")
    print(f"# Deploying {model_name} to Ollama")
    print(f"{'#'*60}")

    try:
        # Step 1: Merge
        print("\n" + "="*60)
        print("STEP 1: Merge LoRA weights")
        print("="*60)

        merged = merge_lora_complete(
            base_model,
            adapter_path,
            str(merged_path)
        )

        # Step 2: Convert to GGUF
        print("\n" + "="*60)
        print("STEP 2: Convert to GGUF")
        print("="*60)

        gguf = convert_to_gguf_complete(
            merged,
            str(gguf_path),
            quantization,
            llama_cpp_path
        )

        if gguf is None:
            print("GGUF conversion failed")
            return False

        # Step 3: Create Modelfile
        print("\n" + "="*60)
        print("STEP 3: Create Modelfile")
        print("="*60)

        # Use the path the converter actually returned: it may be the FP16
        # file rather than gguf_path.
        create_modelfile_complete(
            gguf,
            model_name,
            str(modelfile_path),
            model_family=model_family,
            system_prompt=system_prompt,
        )

        # Step 4: Import to Ollama
        print("\n" + "="*60)
        print("STEP 4: Import to Ollama")
        print("="*60)

        cmd = ["ollama", "create", model_name, "-f", str(modelfile_path)]
        print(f"Running: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"Error: {result.stderr}")
            return False

        # Step 5: Verify
        print("\n" + "="*60)
        print("STEP 5: Verify deployment")
        print("="*60)

        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True
        )

        # Substring match on the listing; a miss only warns because the
        # create step above already reported success.
        if model_name in result.stdout:
            print(f"✓ Model '{model_name}' successfully deployed!")
        else:
            print(f"Warning: Model not found in ollama list")

        # Cleanup
        if cleanup_intermediate:
            print("\nCleaning up intermediate files...")
            import shutil
            if merged_path.exists():
                shutil.rmtree(merged_path)
            print("Cleanup complete")

        print(f"\n{'#'*60}")
        print(f"# Deployment complete!")
        print(f"# Test with: ollama run {model_name}")
        print(f"{'#'*60}")

        return True

    except Exception as e:
        # Top-level boundary of the pipeline: report and signal failure
        print(f"\nDeployment failed: {e}")
        return False

# Confirm the definition and show a copy-pasteable usage snippet
print("\n".join([
    "Complete deployment function defined.",
    "",
    "Usage:",
    "  deploy_to_ollama(",
    "      'meta-llama/Llama-3.1-8B-Instruct',",
    "      './my-lora-adapter',",
    "      'my-finetuned-llama'",
    "  )",
]))

## Summary

This solution notebook provides:

1. **Complete LoRA Merging** - Production-ready weight merging with progress reporting
2. **GGUF Conversion** - With validation and error handling
3. **Modelfile Generation** - Support for multiple model families
4. **Ollama Client** - Robust API client with timeout handling
5. **Benchmarking Suite** - Statistical analysis of model performance
6. **End-to-End Pipeline** - Single function to deploy from adapter to Ollama

All functions include proper error handling, progress reporting, and documentation.

In [None]:
# Final cleanup: collect Python garbage first, then empty the CUDA cache
gc.collect()

# Only touch torch when the optional import succeeded
if HAS_TORCH:
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("Solution notebook complete!")