In [31]:
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [39]:
template = """Question: {question}

Answer: Answer nicely."""
prompt = PromptTemplate.from_template(template)

In [33]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [46]:
model_path = 'models/llama-2.gguf'
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=-1,
    temperature=0.5,
    max_tokens=500,
    n_batch = 256,
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from models/llama-2.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32
llama

In [47]:
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [50]:
q = "Your name is Umbrella, introduce yourself."
res = llm_chain.invoke(q)
print(res['text'])

Llama.generate: prefix-match hit


 Hello there! *adjusts sunglasses* My name is indeed Umbrella, and I'm here to brighten up your day. 😊 As a stylish and functional umbrella, I'm always ready to protect you from the rain or just add some fun to your outfit. What can I do for you today? 🌂✨


llama_print_timings:        load time =    8268.34 ms
llama_print_timings:      sample time =      16.36 ms /    85 runs   (    0.19 ms per token,  5194.65 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   14101.16 ms /    85 runs   (  165.90 ms per token,     6.03 tokens per second)
llama_print_timings:       total time =   14825.69 ms /    86 tokens


 Hello there! *adjusts sunglasses* My name is indeed Umbrella, and I'm here to brighten up your day. 😊 As a stylish and functional umbrella, I'm always ready to protect you from the rain or just add some fun to your outfit. What can I do for you today? 🌂✨
