In [2]:
from llama_cpp import Llama
import os
import sys


In [4]:
def load_model(model_path, gpu_layers=-1, context_size=2048, n_batch=512, verbose=True):
    """Load a GGUF model with GPU acceleration.

    Args:
        model_path: Filesystem path to the model file.
        gpu_layers: Forwarded to ``Llama`` as ``n_gpu_layers``
            (-1 = all layers on GPU).
        context_size: Forwarded to ``Llama`` as ``n_ctx``.
        n_batch: Forwarded to ``Llama`` as ``n_batch``. Defaults to 512,
            the value that used to be hard-coded here.
        verbose: Forwarded to ``Llama`` as ``verbose``. Defaults to True,
            the value that used to be hard-coded here.

    Returns:
        The constructed ``Llama`` instance, or ``None`` when ``model_path``
        is missing / not a regular file or when constructing the model
        raised. Failures are reported with ``print`` rather than raised so
        the notebook keeps running.
    """
    # Use isfile (not exists): a directory "exists" but is not a loadable
    # model, and would otherwise only fail later inside the loader.
    # The falsy check keeps None / "" from raising before we can report it.
    if not model_path or not os.path.isfile(model_path):
        print(f"❌ Model not found: {model_path}")
        return None

    print(f"🚀 Loading model: {os.path.basename(model_path)}")
    print(f"📍 Path: {model_path}")
    print(f"🎮 GPU layers: {gpu_layers}")
    print(f"📝 Context size: {context_size}")

    try:
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=gpu_layers,  # -1 = all layers on GPU
            n_ctx=context_size,
            n_batch=n_batch,
            verbose=verbose,
        )
    except Exception as e:
        # Deliberately best-effort: report and hand back None.
        print(f"❌ Failed to load model: {e}")
        return None

    print("✅ Model loaded successfully!")
    return llm

def chat_with_model(llm, prompt, max_tokens=256, temperature=0.7, stop=None):
    """Send a single prompt to the loaded model and print the exchange.

    Args:
        llm: Callable model object (as returned by ``load_model``). It is
            invoked as ``llm(prompt, max_tokens=..., temperature=..., stop=...)``
            and is expected to return a mapping with a ``'choices'`` list
            whose first item has a ``'text'`` entry.
        prompt: Text passed to the model unchanged.
        max_tokens: Forwarded to the model call.
        temperature: Forwarded to the model call.
        stop: Stop sequences forwarded to the model call. ``None`` (default)
            uses ``["User:", "\\n\\n"]``, the values this function always used.

    Returns:
        The stripped completion text (possibly ``""``), or ``None`` when no
        model was supplied, the response has no choices, or the call raised.
    """
    if not llm:
        print("❌ No model loaded!")
        return None

    stop_sequences = ["User:", "\n\n"] if stop is None else stop

    print(f"\n💬 You: {prompt}")
    print("🤖 AI: ", end="", flush=True)

    try:
        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop_sequences,
        )

        # Treat a missing or empty 'choices' list as "nothing generated"
        # instead of letting it surface as an IndexError below.
        if not response or 'choices' not in response or not response['choices']:
            print("❌ No response generated")
            return None

        ai_response = response['choices'][0]['text'].strip()

        if not ai_response:
            # Terminate the "AI: " line, then say explicitly that the model
            # produced nothing rather than leaving a silent blank line.
            print()
            print(
                f"⚠️ The model returned an empty response ({ai_response!r}). "
                "This may indicate an issue with the prompt, model, or configuration."
            )
            return ai_response

        print(ai_response)
        return ai_response

    except Exception as e:
        # Best-effort helper: report the failure and keep the notebook alive.
        print(f"❌ Error generating response: {e}")
        return None

In [9]:
# Model path: taken from the LLM_MODEL_PATH environment variable when it is
# set to a non-empty value, otherwise the local default below is used. This
# keeps the notebook runnable on machines with a different directory layout.
model_path = os.environ.get("LLM_MODEL_PATH") or (
    r"/home/parhamhard/projects/llm-web/text-generation-webui/user_data/models/TheBloke/dolphin-2_6-phi-2-GGUF/dolphin-2_6-phi-2.Q5_K_S.gguf"
)

# Load the model (load_model prints a diagnostic and returns None on failure)
llm = load_model(model_path)


🚀 Loading model: dolphin-2_6-phi-2.Q5_K_S.gguf
📍 Path: /home/parhamhard/projects/llm-web/text-generation-webui/user_data/models/TheBloke/dolphin-2_6-phi-2-GGUF/dolphin-2_6-phi-2.Q5_K_S.gguf
🎮 GPU layers: -1
📝 Context size: 2048


llama_model_loader: loaded meta data with 22 key-value pairs and 325 tensors from /home/parhamhard/projects/llm-web/text-generation-webui/user_data/models/TheBloke/dolphin-2_6-phi-2-GGUF/dolphin-2_6-phi-2.Q5_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi2
llama_model_loader: - kv   1:                               general.name str              = Phi2
llama_model_loader: - kv   2:                        phi2.context_length u32              = 2048
llama_model_loader: - kv   3:                      phi2.embedding_length u32              = 2560
llama_model_loader: - kv   4:                   phi2.feed_forward_length u32              = 10240
llama_model_loader: - kv   5:                           phi2.block_count u32              = 32
llama_model_loader: - kv   6:                  phi2.attention.head_count u32 

✅ Model loaded successfully!


llama_new_context_with_model: KV self size  =  640.00 MiB, K (f16):  320.00 MiB, V (f16):  320.00 MiB
llama_build_graph: non-view tensors processed: 774/774
llama_new_context_with_model: compute buffer total size = 165.19 MiB
llama_new_context_with_model: VRAM scratch buffer: 162.00 MiB
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
llama_new_context_with_model: total VRAM used: 1918.20 MiB (model: 1756.19 MiB, context: 162.00 MiB)


In [None]:
# Example prompt: passed to the model exactly as written (chat_with_model
# forwards it unchanged, with no chat template applied).
# NOTE(review): the recorded output for this cell shows an empty completion
# after only 2 sampled tokens. Likely causes are the "\n\n" stop sequence used
# by chat_with_model and/or the model expecting a chat-formatted prompt —
# confirm against the model card.
prompt = "What are the main differences between artificial intelligence and machine learning?"

# chat_with_model prints the exchange and returns the stripped completion
# text, or None when no model is loaded or generation fails.
chat_with_model(llm, prompt)



💬 You: What are the main differences between artificial intelligence and machine learning?
🤖 AI: 
⚠️ The model returned an empty response (''). This may indicate an issue with the prompt, model, or configuration.


Llama.generate: prefix-match hit

llama_print_timings:        load time =      66.41 ms
llama_print_timings:      sample time =       0.45 ms /     2 runs   (    0.22 ms per token,  4494.38 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =      27.89 ms /     2 runs   (   13.94 ms per token,    71.72 tokens per second)
llama_print_timings:       total time =      31.50 ms
