# Lab 3.1.10: Ollama Integration - Solutions

Complete solutions for deploying fine-tuned models with Ollama.

## Exercise 1: Complete Deployment Pipeline

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from pathlib import Path
import subprocess
import shutil

def complete_deployment_pipeline(
    base_model_id: str,
    adapter_path: str,
    ollama_model_name: str,
    system_prompt: str,
    quantization: str = "q4_k_m",
    work_dir: str = "./deployment"
) -> bool:
    """
    Complete pipeline from LoRA adapter to running Ollama model.

    Steps:
    1. Merge LoRA adapter into base model
    2. Convert to GGUF format
    3. Create Modelfile
    4. Deploy with Ollama

    Args:
        base_model_id: Model id or path handed to ``from_pretrained`` for
            both the base model and its tokenizer.
        adapter_path: Location of the LoRA adapter to merge.
        ollama_model_name: Name passed to ``ollama create``.
        system_prompt: Text embedded in the Modelfile ``SYSTEM`` block.
        quantization: llama.cpp quantization type (upper-cased before being
            passed to ``llama-quantize``). ``"f16"`` keeps the unquantized
            FP16 GGUF instead of quantizing.
        work_dir: Directory that receives ``merged/``, ``gguf/`` and the
            ``Modelfile``; created if missing.

    Returns:
        True if ``ollama create`` exited with status 0, otherwise False
        (its stderr is printed and the merged model is left on disk).

    Raises:
        subprocess.CalledProcessError: If cloning/building llama.cpp, the
            GGUF conversion, or the quantization step fails.
    """
    import sys  # local import: only needed to locate the running interpreter

    work_path = Path(work_dir)
    work_path.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("COMPLETE DEPLOYMENT PIPELINE")
    print("=" * 60)

    # Step 1: Merge LoRA
    print("\n[1/4] Merging LoRA adapter...")
    merged_path = work_path / "merged"

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    model = PeftModel.from_pretrained(base_model, adapter_path)
    merged_model = model.merge_and_unload()

    merged_model.save_pretrained(str(merged_path))
    tokenizer.save_pretrained(str(merged_path))
    print(f"   Merged model saved to: {merged_path}")

    # Step 2: Convert to GGUF
    print("\n[2/4] Converting to GGUF...")
    llama_cpp_path = Path("./llama.cpp")

    if not llama_cpp_path.exists():
        subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True)
        # NOTE(review): recent llama.cpp checkouts may require a CMake build
        # (binaries under build/bin) instead of `make` - confirm against the
        # revision you clone.
        subprocess.run(["make", "-j"], cwd=llama_cpp_path, check=True)

    gguf_path = work_path / "gguf"
    gguf_path.mkdir(exist_ok=True)

    # Convert to FP16. Use the running interpreter so the converter sees the
    # same installed packages as this process rather than whatever "python"
    # happens to be first on PATH.
    fp16_file = gguf_path / "model-f16.gguf"
    subprocess.run([
        sys.executable, str(llama_cpp_path / "convert_hf_to_gguf.py"),
        str(merged_path),
        "--outfile", str(fp16_file),
        "--outtype", "f16"
    ], check=True)

    if quantization.lower() == "f16":
        # The FP16 conversion already is the requested artifact. Quantizing
        # it onto the same filename and then unlinking the "intermediate"
        # would delete the only output.
        quantized_file = fp16_file
    else:
        # Quantize
        quantized_file = gguf_path / f"model-{quantization}.gguf"
        subprocess.run([
            str(llama_cpp_path / "llama-quantize"),
            str(fp16_file),
            str(quantized_file),
            quantization.upper()
        ], check=True)

        fp16_file.unlink()  # Remove FP16 to save space
    print(f"   GGUF saved to: {quantized_file}")

    # Step 3: Create Modelfile
    print("\n[3/4] Creating Modelfile...")
    # Ollama resolves a relative FROM path against the Modelfile's directory,
    # not the current working directory. quantized_file is relative to the
    # CWD, so write it as an absolute path to keep it valid.
    modelfile_content = f'''FROM {quantized_file.resolve()}

SYSTEM """{system_prompt}"""

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER num_ctx 4096
'''

    modelfile_path = work_path / "Modelfile"
    modelfile_path.write_text(modelfile_content)
    print(f"   Modelfile saved to: {modelfile_path}")

    # Step 4: Deploy with Ollama
    print("\n[4/4] Creating Ollama model...")
    result = subprocess.run(
        ["ollama", "create", ollama_model_name, "-f", str(modelfile_path)],
        capture_output=True,
        text=True
    )

    if result.returncode == 0:
        print(f"   Model created: {ollama_model_name}")
    else:
        print(f"   Error: {result.stderr}")
        return False

    # Cleanup merged model (keep GGUF for backup)
    shutil.rmtree(merged_path)

    print("\n" + "=" * 60)
    print("SUCCESS! Run your model with:")
    print(f"   ollama run {ollama_model_name}")
    print("=" * 60)

    return True

# Example usage:
# complete_deployment_pipeline(
#     base_model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     adapter_path="./my-lora-adapter",
#     ollama_model_name="my-assistant",
#     system_prompt="You are a helpful AI assistant.",
#     quantization="q4_k_m"
# )

## Exercise 2: Custom Modelfile Generator

In [None]:
from dataclasses import dataclass, field
from typing import Dict, Any, Optional

@dataclass
class ModelfileBuilder:
    """
    Build Ollama Modelfiles with proper configuration.

    The ``use_*`` / ``set_parameter`` methods mutate the builder and return
    it, so calls can be chained; ``build()`` renders the Modelfile text and
    ``save()`` writes it to disk.
    """
    base_model: str  # Path to GGUF or Ollama model name
    system_prompt: str = "You are a helpful AI assistant."
    template: Optional[str] = None
    parameters: Dict[str, Any] = field(default_factory=dict)
    license_text: Optional[str] = None

    # Common templates (unannotated on purpose: plain class attributes,
    # not dataclass fields)
    TEMPLATES = {
        "llama3": """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|><|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>""",
        "chatml": """<|im_start|>system
{{ .System }}<|im_end|>
<|im_start|>user
{{ .Prompt }}<|im_end|>
<|im_start|>assistant
{{ .Response }}<|im_end|>""",
        "alpaca": """### Instruction:
{{ .System }}

{{ .Prompt }}

### Response:
{{ .Response }}"""
    }

    # Default parameters by use case
    PRESETS = {
        "creative": {
            "temperature": 0.9,
            "top_p": 0.95,
            "top_k": 50,
            "num_ctx": 4096
        },
        "precise": {
            "temperature": 0.3,
            "top_p": 0.7,
            "top_k": 20,
            "num_ctx": 4096
        },
        "balanced": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40,
            "num_ctx": 4096
        },
        "code": {
            "temperature": 0.2,
            "top_p": 0.6,
            "top_k": 10,
            "num_ctx": 8192,
            "stop": ["```\n", "</code>"]
        }
    }

    def use_template(self, name: str) -> 'ModelfileBuilder':
        """Use a predefined template; unknown names leave the builder unchanged."""
        if name in self.TEMPLATES:
            self.template = self.TEMPLATES[name]
        return self

    def use_preset(self, name: str) -> 'ModelfileBuilder':
        """
        Merge a parameter preset into ``parameters``.

        Unknown names leave the builder unchanged. List values are copied so
        that later edits to ``self.parameters`` cannot alter the shared
        class-level ``PRESETS`` table.
        """
        preset = self.PRESETS.get(name)
        if preset is not None:
            self.parameters.update({
                key: list(value) if isinstance(value, list) else value
                for key, value in preset.items()
            })
        return self

    def set_parameter(self, key: str, value: Any) -> 'ModelfileBuilder':
        """Set a single parameter, overriding any preset value for ``key``."""
        self.parameters[key] = value
        return self

    def build(self) -> str:
        """
        Generate the Modelfile content.

        Emits FROM and SYSTEM, then TEMPLATE (if set), one PARAMETER line per
        scalar parameter (one quoted line per element for list values), and
        LICENSE (if set), joined with newlines.
        """
        lines = [f"FROM {self.base_model}", ""]

        # System prompt
        lines.append(f'SYSTEM """{self.system_prompt}"""')
        lines.append("")

        # Template
        if self.template:
            lines.append(f'TEMPLATE """{self.template}"""')
            lines.append("")

        # Parameters: a list value (e.g. "stop") expands to repeated lines
        for key, value in self.parameters.items():
            if isinstance(value, list):
                lines.extend(f'PARAMETER {key} "{item}"' for item in value)
            else:
                lines.append(f"PARAMETER {key} {value}")
        lines.append("")

        # License
        if self.license_text:
            lines.append(f'LICENSE """{self.license_text}"""')

        return "\n".join(lines)

    def save(self, path: str):
        """Save Modelfile to disk at ``path`` and print where it was written."""
        Path(path).write_text(self.build())
        print(f"Saved Modelfile to: {path}")

# Example: render a Modelfile for a code assistant (llama3 chat template
# combined with the low-temperature "code" preset) and print it.
print("Code Assistant Modelfile:", "-" * 40, sep="\n")
modelfile = (
    ModelfileBuilder(
        system_prompt="You are an expert Python programmer. Write clean, efficient code with comments.",
        base_model="./model-q4_k_m.gguf",
    )
    .use_template("llama3")
    .use_preset("code")
    .build()
)
print(modelfile)

## Exercise 3: Ollama API Client

In [None]:
import requests
import json
from typing import Iterator, List, Dict, Optional

class OllamaClient:
    """
    Client for the Ollama REST API.

    Wraps the tags, generate, chat, embeddings, create and delete endpoints
    under ``<host>/api``. Methods that return parsed response data raise
    ``requests.HTTPError`` on a non-2xx reply instead of failing later with
    an opaque ``KeyError`` on the error body.
    """

    def __init__(self, host: str = "http://localhost:11434"):
        """
        Args:
            host: Base URL of the Ollama server. Trailing slashes are
                stripped so endpoint URLs never contain ``//api``.
        """
        self.host = host.rstrip("/")
        self.api_url = f"{self.host}/api"

    def _check_running(self) -> bool:
        """Return True if the server answers a tags request within 2 seconds."""
        try:
            requests.get(f"{self.api_url}/tags", timeout=2)
        except requests.RequestException:
            # Only network-level failures mean "not running"; a bare except
            # here would also swallow KeyboardInterrupt / SystemExit.
            return False
        return True

    def list_models(self) -> List[Dict]:
        """List all available models (the ``models`` array of ``/api/tags``)."""
        response = requests.get(f"{self.api_url}/tags")
        response.raise_for_status()
        return response.json().get("models", [])

    def generate(
        self,
        model: str,
        prompt: str,
        system: Optional[str] = None,
        **kwargs
    ) -> str:
        """
        Generate a completion (non-streaming).

        Args:
            model: Name of the model to run.
            prompt: Prompt text.
            system: Optional system prompt; sent only when truthy.
            **kwargs: Extra fields merged into the request payload.

        Returns:
            The ``response`` field of the reply.

        Raises:
            requests.HTTPError: If the server replies with an error status.
        """
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            **kwargs
        }
        if system:
            payload["system"] = system

        response = requests.post(f"{self.api_url}/generate", json=payload)
        response.raise_for_status()
        return response.json()["response"]

    def generate_stream(
        self,
        model: str,
        prompt: str,
        **kwargs
    ) -> Iterator[str]:
        """
        Stream a completion, yielding the ``response`` text of each chunk.

        The HTTP connection is released when the generator finishes or is
        closed early.

        Raises:
            requests.HTTPError: On the first iteration, if the server replies
                with an error status.
        """
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": True,
            **kwargs
        }

        with requests.post(
            f"{self.api_url}/generate",
            json=payload,
            stream=True
        ) as response:
            response.raise_for_status()
            for line in response.iter_lines():
                if line:  # skip keep-alive blank lines
                    chunk = json.loads(line)
                    yield chunk.get("response", "")

    def chat(
        self,
        model: str,
        messages: List[Dict[str, str]],
        **kwargs
    ) -> str:
        """
        Chat completion via ``/api/chat`` (non-streaming).

        Args:
            model: Name of the model to run.
            messages: Conversation as ``{"role": ..., "content": ...}`` dicts.
            **kwargs: Extra fields merged into the request payload.

        Returns:
            The assistant message content of the reply.

        Raises:
            requests.HTTPError: If the server replies with an error status.
        """
        payload = {
            "model": model,
            "messages": messages,
            "stream": False,
            **kwargs
        }

        response = requests.post(f"{self.api_url}/chat", json=payload)
        response.raise_for_status()
        return response.json()["message"]["content"]

    def embeddings(
        self,
        model: str,
        text: str
    ) -> List[float]:
        """
        Get embeddings for text.

        Raises:
            requests.HTTPError: If the server replies with an error status.
        """
        response = requests.post(
            f"{self.api_url}/embeddings",
            json={"model": model, "prompt": text}
        )
        response.raise_for_status()
        return response.json()["embedding"]

    def create_model(
        self,
        name: str,
        modelfile: str
    ) -> bool:
        """
        Create a model from Modelfile content, printing each status line.

        Returns:
            True if the server answered with HTTP 200, else False.
        """
        # NOTE(review): newer Ollama releases may no longer accept a raw
        # "modelfile" field on /api/create - confirm against the API docs
        # for the server version in use.
        with requests.post(
            f"{self.api_url}/create",
            json={"name": name, "modelfile": modelfile},
            stream=True
        ) as response:
            for line in response.iter_lines():
                if line:
                    status = json.loads(line)
                    print(f"   {status.get('status', '')}")

            return response.status_code == 200

    def delete_model(self, name: str) -> bool:
        """Delete a model; returns True if the server answered with HTTP 200."""
        response = requests.delete(
            f"{self.api_url}/delete",
            json={"name": name}
        )
        return response.status_code == 200

# Demo: if a local Ollama server responds, show how many models it has and
# the names of up to five of them; otherwise print how to start the server.
client = OllamaClient()

if not client._check_running():
    print("Ollama not running. Start with: ollama serve")
else:
    models = client.list_models()
    print(f"Available models: {len(models)}")
    for m in models[:5]:
        print(f"  - {m['name']}")

## Exercise 4: OpenAI Drop-in Replacement

In [None]:
from openai import OpenAI

def demonstrate_openai_compatibility():
    """
    Demo: drive a local Ollama server through the OpenAI Python client.

    Builds an ``OpenAI`` client whose ``base_url`` points at Ollama's ``/v1``
    endpoint, sends one short chat request, and prints the model name, the
    reply and the token usage. Any exception raised by the request or while
    printing the result is reported instead of propagated.
    """
    rule = "=" * 50
    print("OpenAI API Compatibility Demo")
    print(rule)

    # Stock OpenAI usage would be:
    # client = OpenAI(api_key="sk-...")

    # For Ollama only the base_url differs.
    ollama_client = OpenAI(
        api_key="ollama",  # Ollama ignores this, but OpenAI client requires it
        base_url="http://localhost:11434/v1",
    )

    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 2+2?"},
    ]

    # From here on the calls are the same as against OpenAI itself.
    try:
        completion = ollama_client.chat.completions.create(
            model="llama3.2:1b",  # or your custom model
            messages=conversation,
            temperature=0.7,
        )

        print(f"\nModel: {completion.model}")
        print(f"Response: {completion.choices[0].message.content}")
        print(f"Tokens: {completion.usage}")

    except Exception as exc:
        print(f"Error: {exc}")
        print("\nMake sure Ollama is running and has a model loaded.")

    print("\n" + rule)
    print("Key advantage: Your existing OpenAI code works with Ollama!")

# Run demo
# demonstrate_openai_compatibility()

## Exercise 5: Simple Chat Application

In [None]:
class SimpleChat:
    """
    Simple chat application using Ollama.

    Keeps the conversation in ``self.history`` as role/content dicts and
    sends it, prefixed with the system prompt, to ``<host>/api/chat`` on
    every turn.
    """

    def __init__(
        self,
        model: str,
        system_prompt: str = "You are a helpful AI assistant.",
        host: str = "http://localhost:11434"
    ):
        """
        Args:
            model: Name of the Ollama model to chat with.
            system_prompt: System message prepended to every request.
            host: Base URL of the Ollama server; trailing slashes are
                stripped so the endpoint URL never contains ``//api``.
        """
        self.model = model
        self.system_prompt = system_prompt
        self.host = host.rstrip("/")
        self.history = []

    def chat(self, user_message: str) -> str:
        """
        Send a message and get a response.

        The user and assistant turns are added to ``self.history`` only after
        the request succeeds, so a failed call does not leave an unanswered
        user message that would be re-sent on the next turn.

        Raises:
            requests.RequestException: If the request fails or the server
                replies with an error status.
        """
        user_turn = {"role": "user", "content": user_message}

        # System prompt first, then prior turns, then the new message.
        messages = [
            {"role": "system", "content": self.system_prompt}
        ] + self.history + [user_turn]

        # Call API
        response = requests.post(
            f"{self.host}/api/chat",
            json={
                "model": self.model,
                "messages": messages,
                "stream": False
            }
        )
        response.raise_for_status()

        assistant_message = response.json()["message"]["content"]

        # Record the completed exchange.
        self.history.append(user_turn)
        self.history.append({"role": "assistant", "content": assistant_message})

        return assistant_message

    def clear_history(self):
        """Clear conversation history."""
        self.history = []
        print("History cleared.")

    def run_interactive(self):
        """
        Run interactive chat session.

        Reads lines from stdin until the user types 'quit', presses Ctrl-C,
        or stdin reaches end-of-file. 'clear' resets the history and blank
        input is ignored. A failed request is reported and the loop
        continues.
        """
        print(f"Chat with {self.model}")
        print("Type 'quit' to exit, 'clear' to reset history")
        print("-" * 50)

        while True:
            try:
                user_input = input("You: ").strip()
                command = user_input.lower()

                if command == 'quit':
                    print("Goodbye!")
                    break
                elif command == 'clear':
                    self.clear_history()
                    continue
                elif not user_input:
                    continue

                response = self.chat(user_input)
                print(f"AI: {response}\n")

            except (KeyboardInterrupt, EOFError):
                # EOFError: stdin closed (Ctrl-D / piped input exhausted).
                print("\nGoodbye!")
                break
            except requests.RequestException as exc:
                # Keep the session alive; history was not modified by the
                # failed turn.
                print(f"Error: {exc}\n")

# Usage:
# chat = SimpleChat("my-assistant", "You are a coding expert.")
# chat.run_interactive()

## Key Takeaways

1. **Deployment Pipeline**: Merge → GGUF → Modelfile → Ollama
2. **Modelfile**: Configure system prompt, template, and parameters
3. **API Client**: REST API for generate, chat, and embeddings
4. **OpenAI Compatible**: Point the OpenAI client's `base_url` at `http://localhost:11434/v1` to use Ollama
5. **Chat Apps**: Build interactive apps with conversation history