In [1]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os

In [2]:
# --- Model Download Setup ---
# 1. Define where you want to save the model
local_model_dir = "./pretrained_language_model"
os.makedirs(local_model_dir, exist_ok=True)

In [3]:
# 2. Specify the model repo and the specific GGUF file
model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
# Using Q4_K_M is a good balance for 16GB RAM for CPU inference.
# It's about 4.6GB.
filename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

In [4]:
# 3. Download the GGUF file
print(f"Downloading {filename} from {model_id} to {local_model_dir}...")
model_path = hf_hub_download(
    repo_id=model_id,
    filename=filename,
    local_dir=local_model_dir,
    local_dir_use_symlinks=False # Ensure it's copied
)
print(f"Model downloaded to: {model_path}")

Downloading mistral-7b-instruct-v0.2.Q4_K_M.gguf from TheBloke/Mistral-7B-Instruct-v0.2-GGUF to ./pretrained_language_model...


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


mistral-7b-instruct-v0.2.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

Model downloaded to: pretrained_language_model\mistral-7b-instruct-v0.2.Q4_K_M.gguf


In [5]:
# --- Model Loading and Inference ---
# 4. Load the model using llama-cpp-python
print("Loading the model...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # Context window size. You can increase this if you need longer conversations,
                 # but it will use more RAM.
    n_gpu_layers=10, # <--- THIS IS THE KEY CHANGE for your MX550
                     # Start with a small number like 10-15 for a 7B model on 2GB VRAM.
                     # If it gives a "CUDA out of memory" error, reduce this number.
                     # If it runs, you can try increasing it slightly (e.g., 20, 25)
                     # to find the maximum that fits.
    verbose=True # Set to True to see more details during loading, including GPU layer offloading
)
print("Model loaded successfully!")

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from pretrained_language_model\mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: -

Loading the model...


load_tensors:  CPU_AARCH64 model buffer size =  3204.00 MiB
load_tensors:   CPU_Mapped model buffer size =  4165.37 MiB
repack: repack tensor blk.0.attn_q.weight with q4_K_8x8
repack: repack tensor blk.0.attn_k.weight with q4_K_8x8
repack: repack tensor blk.0.attn_output.weight with q4_K_8x8
repack: repack tensor blk.0.ffn_gate.weight with q4_K_8x8
.repack: repack tensor blk.0.ffn_up.weight with q4_K_8x8
repack: repack tensor blk.1.attn_q.weight with q4_K_8x8
.repack: repack tensor blk.1.attn_k.weight with q4_K_8x8
repack: repack tensor blk.1.attn_output.weight with q4_K_8x8
repack: repack tensor blk.1.ffn_gate.weight with q4_K_8x8
.repack: repack tensor blk.1.ffn_up.weight with q4_K_8x8
repack: repack tensor blk.2.attn_q.weight with q4_K_8x8
.repack: repack tensor blk.2.attn_k.weight with q4_K_8x8
repack: repack tensor blk.2.attn_output.weight with q4_K_8x8
repack: repack tensor blk.2.ffn_gate.weight with q4_K_8x8
.repack: repack tensor blk.2.ffn_up.weight with q4_K_8x8
repack: repack

Model loaded successfully!


CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 | 
Model metadata: {'general.name': 'mistralai_mistral-7b-instruct-v0.2', 'general.architecture': 'llama', 'llama.context_length': '32768', 'llama.rope.dimension_count': '128', 'llama.embedding_length': '4096', 'llama.block_count': '32', 'llama.feed_forward_length': '14336', 'llama.attention.head_count': '32', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '15', 'llama.attention.head_count_kv': '8', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.freq_base': '1000000.000000', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user')

In [6]:
# 5. Generate text for tagline
print("\n--- Generating Tagline ---")
prompt_tagline = "Write a creative tagline for a new coffee shop called 'The Daily Grind'."
output_tagline = llm(
    prompt_tagline,
    max_tokens=50,
    stop=["\n", "Q:"], # Stop generation at a newline or "Q:"
    echo=False # Don't echo the prompt back
)
print("Generated tagline:")
print(output_tagline["choices"][0]["text"].strip()) # Use .strip() to clean up whitespace

# Example for a simple chat interaction
print("\n--- Chat Example ---")

# Define a function for cleaner chat interaction
def generate_chat_response(llm_instance, user_input_text, max_tokens=150):
    # This format is common for Mistral-like models for chat
    formatted_prompt = f"<s>[INST] {user_input_text} [/INST]"
    chat_output = llm_instance(
        formatted_prompt,
        max_tokens=max_tokens,
        stop=["</s>"], # Stop at the end of response marker for Mistral
        echo=False
    )
    return chat_output["choices"][0]["text"].strip()

user_query = "What are the benefits of learning Python?"
print(f"User: {user_query}")
chat_response_text = generate_chat_response(llm, user_query)
print(f"LLM: {chat_response_text}")

user_query_2 = "Can you give me a simple example of a Python for loop?"
print(f"User: {user_query_2}")
chat_response_text_2 = generate_chat_response(llm, user_query_2, max_tokens=100)
print(f"LLM: {chat_response_text_2}")


--- Generating Tagline ---


llama_perf_context_print:        load time =    1550.20 ms
llama_perf_context_print: prompt eval time =    1548.35 ms /    18 tokens (   86.02 ms per token,    11.63 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    1550.93 ms /    19 tokens
Llama.generate: 1 prefix-match hit, remaining 16 prompt tokens to eval


Generated tagline:


--- Chat Example ---
User: What are the benefits of learning Python?


llama_perf_context_print:        load time =    1550.20 ms
llama_perf_context_print: prompt eval time =     917.37 ms /    16 tokens (   57.34 ms per token,    17.44 tokens per second)
llama_perf_context_print:        eval time =   28705.57 ms /   149 runs   (  192.65 ms per token,     5.19 tokens per second)
llama_perf_context_print:       total time =   29725.47 ms /   165 tokens
Llama.generate: 5 prefix-match hit, remaining 17 prompt tokens to eval


LLM: Python is a versatile, high-level, and popular programming language that offers numerous benefits for learners, especially for those new to programming. Here are some advantages of learning Python:

1. Easy to Learn: Python has a clean and simple syntax that is easy to learn, even for those without a background in programming. This makes it an excellent choice for beginners.

2. Versatile: Python is used extensively in various fields such as web development, data analysis, machine learning, artificial intelligence, scientific computing, and automation, to name a few. Learning Python can open up a wide range of opportunities for career advancement.

3. Large Community and Support: Python has a large and active community,
User: Can you give me a simple example of a Python for loop?


llama_perf_context_print:        load time =    1550.20 ms
llama_perf_context_print: prompt eval time =    1206.36 ms /    17 tokens (   70.96 ms per token,    14.09 tokens per second)
llama_perf_context_print:        eval time =   19415.94 ms /    99 runs   (  196.12 ms per token,     5.10 tokens per second)
llama_perf_context_print:       total time =   20682.91 ms /   116 tokens


LLM: Certainly! Here's a simple example of a `for` loop in Python that prints the numbers from 1 to 5:

```python
# Using a for loop to iterate through a list of numbers
numbers = [1, 2, 3, 4, 5]

for number in numbers:
    print(number)
```

When you run this code, it will display each number on a new line.
