# Installing Required Packages

In [8]:
!pip install llama-cpp-python



# Loading the Model

In [11]:
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="microsoft/phi-4-gguf",
    filename="phi-4-q4.gguf",
    n_gpu_layers=-1,  # Offload all layers to GPU
    n_ctx=16384,        # Context size is set to 512 based on the logs
    verbose=True      # Optional: to see detailed loading info
)

llama_model_loader: loaded meta data with 33 key-value pairs and 243 tensors from /root/.cache/huggingface/hub/models--microsoft--phi-4-gguf/snapshots/b1e764cfdbdd0a3ed824d6a8424129eb0a2232ff/./phi-4-q4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Phi 4
llama_model_loader: - kv   3:                            general.version str              = 4
llama_model_loader: - kv   4:                       general.organization str              = Microsoft
llama_model_loader: - kv   5:                           general.basename str              = phi
llama_model_loader: - kv   6:                         general.size_label str      

# Defining Prompts

In [12]:
system_prompt = """
You are a JSON-only response system. Follow these rules absolutely:
1. ONLY output valid, parseable JSON
2. NEVER include text before or after the JSON
3. NEVER include markdown code blocks or formatting
4. NEVER include explanations
5. If you can't fulfill a request, return {"error": "error message"}
6. Output should always be a single JSON object

For address requests, use this format:
{
    "address": {
        "license": "B1231241",
        "Address": "X City",
        "Sex": "Male",
        "Weight": "X",
        "Height": "X"
    }
}
"""

In [4]:
test_instruction = '''extract NER:
        California
        DRIVER LICENSe
        dl 11234568
        CLASS C
        EXP 08/31/2014
        END NONE
        LNCARDHOLDER FNIMA
        2570 24TH STREET ANYTOWN, CA 95818
        doB 08/31/1977 RSTR NONE
        08311977
        VETERAN
        Cordhslde
        SEX F HGT 5'-05"
        HAIR BRN WGT 125 lb
        EYES BRN
        DD 00/00/0000NNNAN/ANFD/YY
        ISS 08/31/2009
'''

# Inference

In [5]:
import time  # Import the time module

# --- Assuming generate_text, model, tokenizer, test_instruction are defined elsewhere ---

print("Starting text generation...") # Optional: Indicate start

start_time = time.perf_counter()  # Get the time just before the call

####################



for chunk in llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": test_instruction
        }
    ],
    stream=True
):
    print(chunk['choices'][0]['delta'].get('content', ''), end='', flush=True)




#########################################

end_time = time.perf_counter()    # Get the time just after the call

elapsed_time = end_time - start_time # Calculate the difference

# Print the elapsed time
print(f"\nTime taken: {elapsed_time:.4f} seconds")

Starting text generation...
```json
{
    "address": {
        "license": "dl 11234568",
        "Address": "2570 24th Street, Anytown, CA 95818",
        "Sex": "F",
        "Weight": "125 lb",
        "Height": "5'-05\""
    },
    "additional_info": {
        "state": "California",
        "license_class": "CLASS C",
        "expiration_date": "08/31/2014",
        "license_holder_name": "LNCARDHOLDER FNIMA",
        "date_of_birth": "08/31/1977",
        "veteran_status": "VETERAN",
        "hair_color": "BRN",
        "eye_color": "BRN"
    }
}
```

llama_perf_context_print:        load time =  205192.00 ms
llama_perf_context_print: prompt eval time =  205191.39 ms /   296 tokens (  693.21 ms per token,     1.44 tokens per second)
llama_perf_context_print:        eval time =  204388.84 ms /   164 runs   ( 1246.27 ms per token,     0.80 tokens per second)
llama_perf_context_print:       total time =  409932.77 ms /   460 tokens



Time taken: 409.9410 seconds


# Phi-4-mini-instruct-Q4_K_M.gguf

# Loading the Model

In [9]:
from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="unsloth/Phi-4-mini-instruct-GGUF",
	filename="Phi-4-mini-instruct-Q4_K_M.gguf",
 	n_gpu_layers=-1  # Use -1 to offload all layers to GPU
)

llama_model_loader: loaded meta data with 35 key-value pairs and 196 tensors from /root/.cache/huggingface/hub/models--unsloth--Phi-4-mini-instruct-GGUF/snapshots/78eb92a46fc37e6b524df991ed9aca9bc6aa7b80/./Phi-4-mini-instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:              phi3.rope.scaling.attn_factor f32              = 1.190238
llama_model_loader: - kv   2:                               general.type str              = model
llama_model_loader: - kv   3:                               general.name str              = Phi 4 Mini Instruct
llama_model_loader: - kv   4:                       general.organization str              = Microsoft
llama_model_loader: - kv   5:                           general.finetune str              = instruct
llama_model_loader: - kv  

# Inference

In [10]:
import time  # Import the time module

# --- Assuming generate_text, model, tokenizer, test_instruction are defined elsewhere ---

print("Starting text generation...") # Optional: Indicate start

start_time = time.perf_counter()  # Get the time just before the call

####################



for chunk in llm.create_chat_completion(
	messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": test_instruction
        }
	],
    stream=True
):
    print(chunk['choices'][0]['delta'].get('content', ''), end='', flush=True)




#########################################

end_time = time.perf_counter()    # Get the time just after the call

elapsed_time = end_time - start_time # Calculate the difference

# Print the elapsed time
print(f"\nTime taken: {elapsed_time:.4f} seconds")

Starting text generation...
```json
{
    "entities": {
        "Location": ["California"],
        "License": ["DRIVER LICENSe"],
        "LicenseNumber": ["dl 11234568"],
        "LicenseClass": ["CLASS C"],
        "ExpirationDate": ["EXP 08/31/2014"],
        "LicenseType": ["END NONE"],
        "LicenseHolderName": ["LNCARDHOLDER FNIMA"],
        "Address": ["2570 24TH STREET ANYTOWN, CA 95818"],
        "BirthDate": ["doB 08/31/1977 RSTR NONE"],
        "BirthYear": ["08311977"],
        "MilitaryStatus": ["VETERAN"],
        "Name": ["Cordhslde"],
        "Gender": ["SEX F"],
        "Height": ["HGT 5'-05\""],
        "HairColor": ["HAIR BRN"],
        "HairWeight": ["WGT 125 lb"],
        "EyeColor": ["EYES BRN"],
        "DateOfBirth": ["DD 00/00/

llama_perf_context_print:        load time =   71097.75 ms
llama_perf_context_print: prompt eval time =   71093.74 ms /   289 tokens (  246.00 ms per token,     4.07 tokens per second)
llama_perf_context_print:        eval time =  115361.40 ms /   222 runs   (  519.65 ms per token,     1.92 tokens per second)
llama_perf_context_print:       total time =  187415.79 ms /   511 tokens



Time taken: 187.4341 seconds


# phi-4-Q2_K.gguf (Slow for unknown reason)

# Loading the Model

In [11]:
from llama_cpp import Llama

llm_Q2 = Llama.from_pretrained(
	repo_id="unsloth/phi-4-GGUF",
	filename="phi-4-Q2_K.gguf",
 	n_gpu_layers=-1  # Use -1 to offload all layers to GPU
)

llama_model_loader: loaded meta data with 40 key-value pairs and 363 tensors from /root/.cache/huggingface/hub/models--unsloth--phi-4-GGUF/snapshots/5110b7771e8166d5530e73346a15aea096a8cb99/./phi-4-Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Phi 4
llama_model_loader: - kv   3:                            general.version str              = 4
llama_model_loader: - kv   4:                           general.basename str              = phi
llama_model_loader: - kv   5:                         general.size_label str              = 15B
llama_model_loader: - kv   6:                            general.license str           

# Inference

In [12]:
import time  # Import the time module

# --- Assuming generate_text, model, tokenizer, test_instruction are defined elsewhere ---

print("Starting text generation...") # Optional: Indicate start

start_time = time.perf_counter()  # Get the time just before the call

####################



for chunk in llm_Q2.create_chat_completion(
	messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": test_instruction
        }
	],
    stream=True
):
    print(chunk['choices'][0]['delta'].get('content', ''), end='', flush=True)




#########################################

end_time = time.perf_counter()    # Get the time just after the call

elapsed_time = end_time - start_time # Calculate the difference

# Print the elapsed time
print(f"\nTime taken: {elapsed_time:.4f} seconds")

Starting text generation...
```json
{
    "address": {
        "license": "dl 11234568",
        "Address": "2570 24TH STREET ANYTOWN, CA 95818",
        "Sex": "F",
        "Weight": "125",
        "Height": "5'-05\""
    }
}
```

llama_perf_context_print:        load time =  251872.46 ms
llama_perf_context_print: prompt eval time =  251871.66 ms /   296 tokens (  850.92 ms per token,     1.18 tokens per second)
llama_perf_context_print:        eval time =   97491.32 ms /    68 runs   ( 1433.70 ms per token,     0.70 tokens per second)
llama_perf_context_print:       total time =  349588.50 ms /   364 tokens



Time taken: 349.6093 seconds
