# Local LLM Chat Demo - Phi-3-mini GGUF

This notebook demonstrates a chat application using Microsoft's Phi-3-mini model with GGUF quantization for optimal CPU performance.

In [1]:
# Import required libraries (install them first: pip install llama-cpp-python ipywidgets huggingface_hub)
from llama_cpp import Llama
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
from huggingface_hub import hf_hub_download
import warnings
import os
# Suppress every Python warning for the rest of this kernel session
# (presumably to keep the demo output uncluttered; this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Banner: name the backend in use and warn about the size of the first download.
print("Using llama-cpp-python for GGUF model support")
print("\nNote: First model load will download ~2.4GB. Subsequent runs will use cache.")

Using llama-cpp-python for GGUF model support

Note: First model load will download ~2.4GB. Subsequent runs will use cache.


## Load the Model

We're using Phi-3-mini-4k-instruct Q4_K_M quantization for optimal CPU performance.

In [2]:
# Download the GGUF model (hf_hub_download returns the cached file on later runs)
model_name = "microsoft/Phi-3-mini-4k-instruct-gguf"
model_file = "Phi-3-mini-4k-instruct-q4.gguf"  # Q4_K_M quantized version

print(f"Downloading {model_file} from Hugging Face...")
# `resume_download` is deprecated in recent huggingface_hub releases
# (interrupted downloads resume automatically), so it is no longer passed.
model_path = hf_hub_download(
    repo_id=model_name,
    filename=model_file,
)
print(f"Model downloaded to: {model_path}")

# Leave one CPU free for the notebook/UI, but never go below one thread:
# os.cpu_count() can return None, and on a single-core machine "count - 1" is 0.
n_threads = max(1, (os.cpu_count() or 1) - 1)

# Load the model
print("\nLoading model... This may take a moment.")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,  # Context window
    n_threads=n_threads,  # All CPU threads minus one (minimum 1)
    n_gpu_layers=0,  # CPU only
    verbose=False
)
print("Model loaded: Phi-3-mini-4k-instruct (Q4_K_M)")
print(f"Using {n_threads} CPU threads")

Downloading Phi-3-mini-4k-instruct-q4.gguf from Hugging Face...


Phi-3-mini-4k-instruct-q4.gguf:   0%|          | 0.00/2.39G [00:00<?, ?B/s]

Model downloaded to: /home/randall/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct-gguf/snapshots/999f761fe19e26cf1a339a5ec5f9f201301cbb83/Phi-3-mini-4k-instruct-q4.gguf

Loading model... This may take a moment.
Model loaded: Phi-3-mini-4k-instruct (Q4_K_M)
Using 7 CPU threads


## Chat Interface Setup

In [3]:
# Initialize chat history
chat_history = []

def format_chat_prompt(message, history):
    """Assemble a Phi-3 style prompt from earlier turns plus the new user message.

    Args:
        message: The new user message to append as the final user turn.
        history: Iterable of ``(user_msg, assistant_msg)`` pairs, oldest first.

    Returns:
        A single prompt string: the fixed system turn, every past user/assistant
        exchange, the new user turn, and an open assistant tag for the model
        to continue from.
    """
    segments = ["<|system|>\nYou are a helpful AI assistant.<|end|>\n"]

    # One user turn followed by one assistant turn per recorded exchange.
    segments.extend(
        f"<|user|>\n{user_msg}<|end|>\n<|assistant|>\n{assistant_msg}<|end|>\n"
        for user_msg, assistant_msg in history
    )

    # The new message, then an unterminated assistant tag.
    segments.append(f"<|user|>\n{message}<|end|>\n<|assistant|>\n")
    return "".join(segments)

def generate_response(message, history, max_tokens=200, temperature=0.7):
    """Run the loaded model on the formatted conversation and return its reply.

    Args:
        message: The new user message.
        history: Earlier ``(user_msg, assistant_msg)`` pairs passed to the prompt formatter.
        max_tokens: Forwarded to the model call as ``max_tokens``.
        temperature: Forwarded to the model call as ``temperature``.

    Returns:
        The text of the first returned choice, with surrounding whitespace stripped.
    """
    # Chat-template markers at which generation is cut off.
    stop_markers = ["<|end|>", "<|user|>", "<|assistant|>"]

    completion = llm(
        format_chat_prompt(message, history),
        max_tokens=max_tokens,
        temperature=temperature,
        stop=stop_markers,
        echo=False,
    )

    first_choice = completion['choices'][0]
    return first_choice['text'].strip()

## Interactive Chat Widget

In [4]:
# Transcript area that all chat messages are rendered into
chat_output = widgets.Output()

# Message entry row: text box plus the two action buttons
input_text = widgets.Text(
    description='You:',
    placeholder='Type your message here...',
    layout=widgets.Layout(width='70%'),
)
send_button = widgets.Button(description='Send', button_style='primary')
clear_button = widgets.Button(description='Clear Chat', button_style='warning')

# Generation settings, read when a message is sent
temp_slider = widgets.FloatSlider(
    description='Temperature:',
    tooltip='Higher = more creative, Lower = more focused',
    value=0.7,
    min=0.1,
    max=1.0,
    step=0.1,
)
max_tokens_slider = widgets.IntSlider(
    description='Max Tokens:',
    tooltip='Maximum length of response',
    value=200,
    min=50,
    max=500,
    step=50,
)

def display_message(role, message):
    """Render one chat bubble into the ``chat_output`` widget.

    Args:
        role: ``"user"`` renders a "You" bubble; any other value renders an
            "Assistant" bubble.
        message: The text to show. It is HTML-escaped before rendering, so
            characters such as ``<``, ``>`` and ``&`` (common in code snippets
            and model output) are shown literally instead of being interpreted
            as markup.
    """
    import html  # local import keeps this cell self-contained

    if role == "user":
        label, background, foreground = "You", "#e3f2fd", "#1e3a5f"
    else:
        label, background, foreground = "Assistant", "#f0f0f0", "#333333"

    # Escape so neither user input nor model output can inject or break HTML.
    safe_message = html.escape(str(message))

    with chat_output:
        # white-space:pre-wrap keeps the line breaks of multi-line replies.
        display(HTML(
            f'<div style="background-color:{background}; color:{foreground}; padding:10px; '
            f'margin:5px; border-radius:10px; white-space:pre-wrap;">'
            f'<b>{label}:</b> {safe_message}</div>'
        ))

def on_send_click(b):
    """Handle a send request from the Send button or Enter in the text box.

    Reads the text box, shows the user's message and a temporary "thinking"
    line, asks the model for a reply using the current slider settings, then
    redraws the whole transcript with the new exchange appended.

    Blank or whitespace-only input is ignored. If generation raises, the
    transcript is redrawn with an error line instead of a reply, the failed
    turn is not added to ``chat_history``, and the text box keeps its content
    so the user can retry.

    Args:
        b: The widget that triggered the callback (unused).
    """
    import html  # local import; used to escape the error text

    global chat_history

    user_message = input_text.value.strip()
    if not user_message:
        return

    # Display user message
    display_message("user", user_message)

    # Show thinking indicator
    with chat_output:
        display(HTML('<div style="color:#666; font-style:italic;">Assistant is thinking...</div>'))

    # Generate response; capture failures so the indicator never gets stuck
    response = None
    error = None
    try:
        response = generate_response(
            user_message,
            chat_history,
            max_tokens=max_tokens_slider.value,
            temperature=temp_slider.value
        )
    except Exception as exc:  # surfaced to the user below, not swallowed
        error = exc

    # Clear thinking indicator and redraw the transcript
    with chat_output:
        clear_output()
        for user_msg, assistant_msg in chat_history:
            display_message("user", user_msg)
            display_message("assistant", assistant_msg)
        display_message("user", user_message)
        if error is None:
            display_message("assistant", response)
        else:
            display(HTML(
                '<div style="color:#b71c1c; font-style:italic;">'
                f'Error generating response: {html.escape(str(error))}</div>'
            ))

    if error is not None:
        return

    # Update chat history
    chat_history.append((user_message, response))

    # Clear input
    input_text.value = ""

def on_clear_click(b):
    """Reset the conversation: drop the stored history and wipe the transcript.

    Args:
        b: The button that triggered the callback (unused).
    """
    global chat_history

    # Rebind to a fresh list so the next prompt starts with no prior turns.
    chat_history = []

    cleared_notice = HTML(
        '<div style="color:#666; font-style:italic;">Chat cleared. Start a new conversation!</div>'
    )
    with chat_output:
        clear_output()
        display(cleared_notice)

# Register callbacks: both buttons, plus submit (Enter) in the text box
send_button.on_click(on_send_click)
clear_button.on_click(on_clear_click)
input_text.on_submit(on_send_click)

# Build the layout rows, then show the heading and the assembled interface
settings_row = widgets.HBox([temp_slider, max_tokens_slider])
input_row = widgets.HBox([input_text, send_button, clear_button])

display(HTML('<h3 style="color:#333;">Chat with Phi-3-mini (GGUF)</h3>'))
display(widgets.VBox([settings_row, chat_output, input_row]))

# Seed the transcript area with a greeting
with chat_output:
    display(HTML('<div style="color:#666; font-style:italic;">Hello! I\'m Phi-3-mini running locally with GGUF quantization. Type a message to start chatting!</div>'))

VBox(children=(HBox(children=(FloatSlider(value=0.7, description='Temperature:', max=1.0, min=0.1, tooltip='Hi…

## Simple Text-Based Chat (Alternative)

If you prefer a simpler interface without widgets:

In [None]:
def simple_chat():
    """Run a blocking, text-only chat loop using ``input()`` and ``print()``.

    The loop keeps its own conversation history, separate from the widget UI.
    It ends when the user types ``quit`` (any letter case, surrounding
    whitespace ignored) or when input ends or is interrupted
    (``EOFError`` / ``KeyboardInterrupt`` raised by ``input()``).
    Blank lines are skipped rather than sent to the model.

    Returns:
        None.
    """
    print("Simple Chat with Phi-3-mini")
    print("Type 'quit' to exit")
    print("-" * 50)

    history = []

    while True:
        try:
            user_input = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt: leave cleanly, no traceback.
            print("\nGoodbye!")
            break

        if not user_input:
            # Nothing typed; prompt again without calling the model.
            continue

        if user_input.lower() == 'quit':
            print("Goodbye!")
            break

        print("Assistant is thinking...")
        response = generate_response(user_input, history)
        print(f"\nAssistant: {response}")

        history.append((user_input, response))

# Uncomment to run the simple chat
# simple_chat()

## Performance Tips

1. **CPU Threads**: The model is configured to use all available CPU threads minus one
2. **Context Length**: Set to 4096 tokens (the native context window of the Phi-3-mini-4k variant)
3. **Quantization**: Q4_K_M (4-bit) offers a good balance of output quality, memory use, and speed on CPU
4. **Temperature**: Lower values (0.3-0.5) for more focused responses, higher (0.7-0.9) for creativity

Compared with the previous version of this demo, which ran Phi-2 in float32, this quantized model should run roughly 2-3x faster on CPU.