In [None]:
!pip install openvino-genai

In [None]:
!pip install optimum-intel nncf==2.11 onnx==1.16.1
!pip install --pre openvino==2024.3.0.dev20240807 openvino-tokenizers==2024.3.0.0.dev20240807 openvino-genai==2024.3.0.0.dev20240807 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release

In [None]:
# Export microsoft/phi-2 to OpenVINO format with int4 weights into ./phi_2.
# NOTE(review): no cell visible in this notebook loads "phi_2"; every pipeline
# below is built from "TinyLlama" - confirm whether this export is still needed.
!optimum-cli export openvino -m microsoft/phi-2 --weight-format int4 --sym --group-size 128 --ratio 1.0 phi_2

In [None]:
# Export TinyLlama/TinyLlama-1.1B-Chat-v1.0 to OpenVINO format with int4 weights
# into ./TinyLlama. This is the directory every LLMPipeline below is built from.
!optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-Chat-v1.0 --weight-format int4 --sym --group-size 128 --ratio 1.0 TinyLlama

In [None]:
import openvino as ov

In [None]:
# Show the devices OpenVINO reports on this machine. The cells below request
# "NPU", "GPU" and "CPU" by name, so all three are expected to be listed here.
print(ov.Core().available_devices)

In [None]:
import openvino_genai as ov_genai
import time
import psutil
import statistics

In [None]:
# psutil handle used by every memory reading in this notebook
# (process.memory_info().rss, converted from bytes to MB).
process = psutil.Process()
# Baseline reading; note that the next cell re-samples start_memory_mb before using it.
start_memory_mb = process.memory_info().rss / (1024 * 1024)

In [None]:
# Build the NPU pipeline from the exported "TinyLlama" directory and report how
# much the process RSS grew across the constructor call.
# NOTE(review): RSS is host-process memory; device-side allocations may not be
# reflected in this number - confirm before comparing devices.
start_memory_mb = process.memory_info().rss / (1024 * 1024)
pipe_npu = ov_genai.LLMPipeline("TinyLlama", "NPU")
end_memory_mb = process.memory_info().rss / (1024 * 1024)
memory_used_mb = end_memory_mb - start_memory_mb
print(f"Memory used during compilation on NPU: {memory_used_mb:.2f} MB")

In [None]:
# Build the GPU pipeline from the exported "TinyLlama" directory and report how
# much the process RSS grew across the constructor call.
# NOTE(review): RSS is host-process memory; device-side allocations may not be
# reflected in this number - confirm before comparing devices.
start_memory_mb = process.memory_info().rss / (1024 * 1024)
pipe_gpu = ov_genai.LLMPipeline("TinyLlama", "GPU")
end_memory_mb = process.memory_info().rss / (1024 * 1024)
memory_used_mb = end_memory_mb - start_memory_mb
print(f"Memory used during compilation on GPU: {memory_used_mb:.2f} MB")

In [None]:
# Build the CPU pipeline from the exported "TinyLlama" directory and report how
# much the process RSS grew across the constructor call.
start_memory_mb = process.memory_info().rss / (1024 * 1024)
pipe_cpu = ov_genai.LLMPipeline("TinyLlama", "CPU")
end_memory_mb = process.memory_info().rss / (1024 * 1024)
memory_used_mb = end_memory_mb - start_memory_mb
print(f"Memory used during compilation on CPU: {memory_used_mb:.2f} MB")

In [None]:
"""
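LLMPipeline - takes care of device-specific compilation requirements, especially the static-size requirement of the NPU.
start_chat - used here to transfer context from one device to the other. The context is meant to include all information, including the KV cache, etc. (NOTE: the code below passes only the generated text, as start_chat's system_message - confirm what the pipeline actually carries over.)
Generation on the CPU, GPU, or NPU individually does not require start_chat; it is used there anyway so that prefill and decode times can be measured separately.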
"""

In [None]:
def _timed_call(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return ``(result, elapsed_seconds)``.

    ``time.perf_counter`` is used instead of ``time.time`` because it is a
    monotonic, high-resolution clock: an interval cannot come out negative or
    be skewed by a wall-clock adjustment made while the call is running.
    """
    started = time.perf_counter()
    result = func(*args, **kwargs)
    return result, time.perf_counter() - started


def _prefill_then_decode(input_text, prefill_pipe, decode_pipe, tokens_to_generate):
    """Shared implementation behind every public helper in this cell.

    Steps, each timed separately:

    1. "Prefill": ``prefill_pipe.generate(input_text, max_new_tokens=1)``.
    2. "Transfer": ``decode_pipe.start_chat(system_message=<prefill text>)``.
       Only that text string is handed over by this code.
    3. "Decode": ``decode_pipe.generate(input_text, max_new_tokens=tokens_to_generate)``.
       The decode pipeline receives ``input_text`` again, so the decode time
       covers that whole ``generate`` call.

    The chat session opened in step 2 is always closed with ``finish_chat()``,
    even if step 3 raises. Without this the pipeline would stay in chat mode
    and, when the same pipeline is reused for the next measurement, its
    "prefill" call would run inside the previous session instead of from a
    clean state, making repeated runs non-comparable.

    ``prefill_pipe`` and ``decode_pipe`` may be the same object.

    Returns:
        ``(prefill_text, decode_text, prefill_time, transfer_time, decode_time)``
        with all times in seconds.
    """
    prefill_text, prefill_time = _timed_call(
        prefill_pipe.generate, input_text, max_new_tokens=1
    )
    _, transfer_time = _timed_call(decode_pipe.start_chat, system_message=prefill_text)
    try:
        decode_text, decode_time = _timed_call(
            decode_pipe.generate, input_text, max_new_tokens=tokens_to_generate
        )
    finally:
        # Pair every start_chat() with finish_chat() so no chat state leaks
        # into the next call that reuses this pipeline.
        decode_pipe.finish_chat()
    return prefill_text, decode_text, prefill_time, transfer_time, decode_time


def generate_text_with_context(input_text, gpu_pipe, npu_pipe, tokens_to_generate=50):
    """Generate one token on ``gpu_pipe``, then continue on ``npu_pipe``.

    Despite the parameter names, any two pipelines can be passed: ``gpu_pipe``
    is used for the single-token first step and ``npu_pipe`` for the rest.

    Args:
        input_text: Prompt passed to both pipelines.
        gpu_pipe: Pipeline used for the first ``generate`` call (1 new token).
        npu_pipe: Pipeline that receives the first result via ``start_chat``
            and then generates ``tokens_to_generate`` new tokens.
        tokens_to_generate: ``max_new_tokens`` for the second ``generate`` call.

    Returns:
        ``(gpu_generated_text, npu_generated_text)``.
    """
    first_text, second_text, _, _, _ = _prefill_then_decode(
        input_text, gpu_pipe, npu_pipe, tokens_to_generate
    )
    return first_text, second_text


def perform_prefill_and_decode(input_text, gpu_pipe, npu_pipe, tokens_to_generate=25):
    """Time a two-pipeline run: prefill on ``gpu_pipe``, decode on ``npu_pipe``.

    Despite the parameter names, any two pipelines can be passed (the notebook
    also calls this with NPU/CPU and GPU/CPU pairs): the first pipeline does
    the prefill step and the second one the decode step.

    Args:
        input_text: Prompt passed to both pipelines.
        gpu_pipe: Pipeline used for the prefill step (1 new token).
        npu_pipe: Pipeline used for ``start_chat`` and the decode step.
        tokens_to_generate: ``max_new_tokens`` for the decode step.

    Returns:
        ``(gpu_generated_text, npu_generated_text, prefill_time,
        transfer_time, decode_time)`` with times in seconds.
    """
    return _prefill_then_decode(input_text, gpu_pipe, npu_pipe, tokens_to_generate)


def perform_cpu(input_text, cpu_pipe, tokens_to_generate=25):
    """Time prefill, ``start_chat`` and decode on a single (CPU) pipeline.

    Args:
        input_text: Prompt passed to ``cpu_pipe``.
        cpu_pipe: Pipeline used for every step.
        tokens_to_generate: ``max_new_tokens`` for the decode step.

    Returns:
        ``(cpu_generated_text, prefill_time, transfer_time, decode_time)``
        where the text is the decode-step output and times are in seconds.
    """
    _, decoded_text, prefill_time, transfer_time, decode_time = _prefill_then_decode(
        input_text, cpu_pipe, cpu_pipe, tokens_to_generate
    )
    return decoded_text, prefill_time, transfer_time, decode_time


def perform_gpu(input_text, gpu_pipe, tokens_to_generate=25):
    """Time prefill, ``start_chat`` and decode on a single (GPU) pipeline.

    Args:
        input_text: Prompt passed to ``gpu_pipe``.
        gpu_pipe: Pipeline used for every step.
        tokens_to_generate: ``max_new_tokens`` for the decode step.

    Returns:
        ``(gpu_generated_text, prefill_time, transfer_time, decode_time)``
        where the text is the decode-step output and times are in seconds.
    """
    _, decoded_text, prefill_time, transfer_time, decode_time = _prefill_then_decode(
        input_text, gpu_pipe, gpu_pipe, tokens_to_generate
    )
    return decoded_text, prefill_time, transfer_time, decode_time


def perform_npu(input_text, npu_pipe, tokens_to_generate=25):
    """Time prefill, ``start_chat`` and decode on a single (NPU) pipeline.

    Args:
        input_text: Prompt passed to ``npu_pipe``.
        npu_pipe: Pipeline used for every step.
        tokens_to_generate: ``max_new_tokens`` for the decode step.

    Returns:
        ``(npu_generated_text, prefill_time, transfer_time, decode_time)``
        where the text is the decode-step output and times are in seconds.
    """
    _, decoded_text, prefill_time, transfer_time, decode_time = _prefill_then_decode(
        input_text, npu_pipe, npu_pipe, tokens_to_generate
    )
    return decoded_text, prefill_time, transfer_time, decode_time

In [None]:
# NPU-only benchmark: one memory-profiled run followed by ten more timed runs.
input_text = "The Sun is yellow because"

# Only this first run is bracketed by RSS readings (bytes -> MB).
start_memory_mb = process.memory_info().rss / 1024 ** 2
npu_generated_text, prefill_time, transfer_time, decode_time = perform_npu(input_text, pipe_npu)
end_memory_mb = process.memory_info().rss / 1024 ** 2
memory_used_mb = end_memory_mb - start_memory_mb

print("NPU Generated Text: ", npu_generated_text, sep="\n")

# Per-run samples, seeded with the first run. The output size is approximated
# by the number of whitespace-separated words in the generated text.
pf_time = [prefill_time]
dc_time = [decode_time]
tf_time = [transfer_time]
out_tokens = [len(npu_generated_text.split())]

for _ in range(10):
    npu_generated_text, prefill_time, transfer_time, decode_time = perform_npu(input_text, pipe_npu)
    pf_time.append(prefill_time)
    dc_time.append(decode_time)
    tf_time.append(transfer_time)
    out_tokens.append(len(npu_generated_text.split()))




In [None]:
"""
The current counts of input and output tokens are approximate because split() (whitespace-separated words) is used.
TODO: change this to obtain accurate token counts.
"""

In [None]:
# Report for the NPU runs gathered in the NPU metrics cell above: mean stage
# times (seconds), mean output size, and the resulting throughput.
print(f"Memory used {memory_used_mb:.2f} MB")
print("Prefill", statistics.mean(pf_time), sep="\n")
print("Transfer", statistics.mean(tf_time), sep="\n")
print("Decode", statistics.mean(dc_time), sep="\n")
print("Generated tokens", statistics.mean(out_tokens), sep="\n")
# Throughput = mean output size / (mean prefill + mean decode + mean transfer).
avg_tokens_per_second = statistics.mean(out_tokens) / (
    statistics.mean(pf_time) + statistics.mean(dc_time) + statistics.mean(tf_time)
)
print("Average tokens/second", avg_tokens_per_second, sep="\n")

In [None]:
# GPU-only benchmark: one memory-profiled run followed by ten more timed runs,
# then a summary of the mean stage times and throughput.
input_text = "The Sun is yellow because"

# Only this first run is bracketed by RSS readings (bytes -> MB).
start_memory_mb = process.memory_info().rss / 1024 ** 2
gpu_generated_text, prefill_time, transfer_time, decode_time = perform_gpu(input_text, pipe_gpu)
end_memory_mb = process.memory_info().rss / 1024 ** 2
memory_used_mb = end_memory_mb - start_memory_mb

print("GPU Generated Text: ", gpu_generated_text, sep="\n")

# Per-run samples, seeded with the first run. The output size is approximated
# by the number of whitespace-separated words in the generated text.
pfg_time = [prefill_time]
dcg_time = [decode_time]
tfg_time = [transfer_time]
out_tokens = [len(gpu_generated_text.split())]

for _ in range(10):
    gpu_generated_text, prefill_time, transfer_time, decode_time = perform_gpu(input_text, pipe_gpu)
    pfg_time.append(prefill_time)
    dcg_time.append(decode_time)
    tfg_time.append(transfer_time)
    out_tokens.append(len(gpu_generated_text.split()))

print(f"Memory used {memory_used_mb:.2f} MB")
print("Prefill", statistics.mean(pfg_time), sep="\n")
print("Transfer", statistics.mean(tfg_time), sep="\n")
print("Decode", statistics.mean(dcg_time), sep="\n")
print("Generated tokens", statistics.mean(out_tokens), sep="\n")
# Throughput = mean output size / (mean prefill + mean decode + mean transfer).
avg_tokens_per_second = statistics.mean(out_tokens) / (
    statistics.mean(pfg_time) + statistics.mean(dcg_time) + statistics.mean(tfg_time)
)
print("Average tokens/second", avg_tokens_per_second, sep="\n")

In [None]:
# CPU-only benchmark: one memory-profiled run followed by ten more timed runs,
# then a summary of the mean stage times and throughput.
input_text = "The Sun is yellow because"

# Only this first run is bracketed by RSS readings (bytes -> MB).
start_memory_mb = process.memory_info().rss / 1024 ** 2
cpu_generated_text, prefill_time, transfer_time, decode_time = perform_cpu(input_text, pipe_cpu)
end_memory_mb = process.memory_info().rss / 1024 ** 2
memory_used_mb = end_memory_mb - start_memory_mb

print("CPU Generated Text: ", cpu_generated_text, sep="\n")

# Per-run samples, seeded with the first run. The output size is approximated
# by the number of whitespace-separated words in the generated text.
pfc_time = [prefill_time]
dcc_time = [decode_time]
tfc_time = [transfer_time]
out_tokens = [len(cpu_generated_text.split())]

for _ in range(10):
    cpu_generated_text, prefill_time, transfer_time, decode_time = perform_cpu(input_text, pipe_cpu)
    pfc_time.append(prefill_time)
    dcc_time.append(decode_time)
    tfc_time.append(transfer_time)
    out_tokens.append(len(cpu_generated_text.split()))

print(f"Memory used {memory_used_mb:.2f} MB")
print("Prefill", statistics.mean(pfc_time), sep="\n")
print("Transfer", statistics.mean(tfc_time), sep="\n")
print("Decode", statistics.mean(dcc_time), sep="\n")
print("Generated tokens", statistics.mean(out_tokens), sep="\n")
# Throughput = mean output size / (mean prefill + mean decode + mean transfer).
avg_tokens_per_second = statistics.mean(out_tokens) / (
    statistics.mean(pfc_time) + statistics.mean(dcc_time) + statistics.mean(tfc_time)
)
print("Average tokens/second", avg_tokens_per_second, sep="\n")

In [None]:
# Hybrid benchmark, NPU prefill + CPU decode. Both pipelines are rebuilt here,
# replacing the pipe_npu / pipe_cpu objects created earlier in the notebook.
pipe_npu = ov_genai.LLMPipeline("TinyLlama", "NPU")
pipe_cpu = ov_genai.LLMPipeline("TinyLlama", "CPU")
input_text = "The Sun is yellow because"

# Only this first run is bracketed by RSS readings (bytes -> MB); the pipeline
# construction above happens before the first reading.
start_memory_mb = process.memory_info().rss / 1024 ** 2
npu_generated_text, cpu_generated_text, prefill_time, transfer_time, decode_time = perform_prefill_and_decode(
    input_text, pipe_npu, pipe_cpu
)
end_memory_mb = process.memory_info().rss / 1024 ** 2
memory_used_mb = end_memory_mb - start_memory_mb

print("CPU + NPU Generated Text(NPU - Prefill, CPU - Decode): ", cpu_generated_text, sep="\n")

# Per-run samples, seeded with the first run. The output size is approximated
# by the number of whitespace-separated words in the decode-stage text.
pfcn_time = [prefill_time]
dccn_time = [decode_time]
tfcn_time = [transfer_time]
out_tokens = [len(cpu_generated_text.split())]

for _ in range(10):
    npu_generated_text, cpu_generated_text, prefill_time, transfer_time, decode_time = perform_prefill_and_decode(
        input_text, pipe_npu, pipe_cpu
    )
    pfcn_time.append(prefill_time)
    dccn_time.append(decode_time)
    tfcn_time.append(transfer_time)
    out_tokens.append(len(cpu_generated_text.split()))

print(f"Memory used {memory_used_mb:.2f} MB")
print("Prefill", statistics.mean(pfcn_time), sep="\n")
print("Transfer", statistics.mean(tfcn_time), sep="\n")
print("Decode", statistics.mean(dccn_time), sep="\n")
print("Generated tokens", statistics.mean(out_tokens), sep="\n")
# Throughput = mean output size / (mean prefill + mean decode + mean transfer).
avg_tokens_per_second = statistics.mean(out_tokens) / (
    statistics.mean(pfcn_time) + statistics.mean(dccn_time) + statistics.mean(tfcn_time)
)
print("Average tokens/second", avg_tokens_per_second, sep="\n")

In [None]:
# Hybrid benchmark, GPU prefill + CPU decode. Both pipelines are rebuilt here,
# replacing the pipe_gpu / pipe_cpu objects created earlier in the notebook.
pipe_gpu = ov_genai.LLMPipeline("TinyLlama", "GPU")
pipe_cpu = ov_genai.LLMPipeline("TinyLlama", "CPU")
input_text = "The Sun is yellow because"

# Only this first run is bracketed by RSS readings (bytes -> MB); the pipeline
# construction above happens before the first reading.
start_memory_mb = process.memory_info().rss / 1024 ** 2
gpu_generated_text, cpu_generated_text, prefill_time, transfer_time, decode_time = perform_prefill_and_decode(
    input_text, pipe_gpu, pipe_cpu
)
end_memory_mb = process.memory_info().rss / 1024 ** 2
memory_used_mb = end_memory_mb - start_memory_mb

print("CPU + GPU Generated Text(GPU - Prefill, CPU - Decode): ", cpu_generated_text, sep="\n")

# Per-run samples, seeded with the first run. The output size is approximated
# by the number of whitespace-separated words in the decode-stage text.
pfcg_time = [prefill_time]
dccg_time = [decode_time]
tfcg_time = [transfer_time]
out_tokens = [len(cpu_generated_text.split())]

for _ in range(10):
    gpu_generated_text, cpu_generated_text, prefill_time, transfer_time, decode_time = perform_prefill_and_decode(
        input_text, pipe_gpu, pipe_cpu
    )
    pfcg_time.append(prefill_time)
    dccg_time.append(decode_time)
    tfcg_time.append(transfer_time)
    out_tokens.append(len(cpu_generated_text.split()))

print(f"Memory used {memory_used_mb:.2f} MB")
print("Prefill", statistics.mean(pfcg_time), sep="\n")
print("Transfer", statistics.mean(tfcg_time), sep="\n")
print("Decode", statistics.mean(dccg_time), sep="\n")
print("Generated tokens", statistics.mean(out_tokens), sep="\n")
# Throughput = mean output size / (mean prefill + mean decode + mean transfer).
avg_tokens_per_second = statistics.mean(out_tokens) / (
    statistics.mean(pfcg_time) + statistics.mean(dccg_time) + statistics.mean(tfcg_time)
)
print("Average tokens/second", avg_tokens_per_second, sep="\n")

In [None]:
# Hybrid benchmark, GPU prefill + NPU decode. Both pipelines are rebuilt here,
# replacing the pipe_npu / pipe_gpu objects created earlier in the notebook.
pipe_npu = ov_genai.LLMPipeline("TinyLlama", "NPU")
pipe_gpu = ov_genai.LLMPipeline("TinyLlama", "GPU")
input_text = "The Sun is yellow because"

# Only this first run is bracketed by RSS readings (bytes -> MB); the pipeline
# construction above happens before the first reading.
start_memory_mb = process.memory_info().rss / 1024 ** 2
gpu_generated_text, npu_generated_text, prefill_time, transfer_time, decode_time = perform_prefill_and_decode(
    input_text, pipe_gpu, pipe_npu
)
end_memory_mb = process.memory_info().rss / 1024 ** 2
memory_used_mb = end_memory_mb - start_memory_mb

print("GPU + NPU Generated Text(GPU - Prefill, NPU - Decode): ", npu_generated_text, sep="\n")

# Per-run samples, seeded with the first run. The output size is approximated
# by the number of whitespace-separated words in the decode-stage text.
pfng_time = [prefill_time]
dcng_time = [decode_time]
tfng_time = [transfer_time]
out_tokens = [len(npu_generated_text.split())]

for _ in range(10):
    gpu_generated_text, npu_generated_text, prefill_time, transfer_time, decode_time = perform_prefill_and_decode(
        input_text, pipe_gpu, pipe_npu
    )
    pfng_time.append(prefill_time)
    dcng_time.append(decode_time)
    tfng_time.append(transfer_time)
    out_tokens.append(len(npu_generated_text.split()))

print(f"Memory used {memory_used_mb:.2f} MB")
print("Prefill", statistics.mean(pfng_time), sep="\n")
print("Transfer", statistics.mean(tfng_time), sep="\n")
print("Decode", statistics.mean(dcng_time), sep="\n")
print("Generated tokens", statistics.mean(out_tokens), sep="\n")
# Throughput = mean output size / (mean prefill + mean decode + mean transfer).
avg_tokens_per_second = statistics.mean(out_tokens) / (
    statistics.mean(pfng_time) + statistics.mean(dcng_time) + statistics.mean(tfng_time)
)
print("Average tokens/second", avg_tokens_per_second, sep="\n")