This code was implemented following the advice of [llama-recipes](https://github.com/facebookresearch/llama-recipes/tree/main/demo_apps), [TheBloke Llama 7B](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF), the [FastAPI project](https://github.com/pinecone-io/examples/blob/master/learn/generation/langchain/handbook/09-langchain-streaming/main.py) and [Langchain](https://python.langchain.com/docs/integrations/llms/llamacpp).

In [1]:
# @title Set os environment keys
import os
from getpass import getpass

# Ask for the ngrok token interactively instead of hard-coding it in the
# notebook, so the secret is never saved with the file. A token that is
# already present in the environment is left untouched.
if not os.environ.get("NGROK_AUTH_TOKEN"):
    os.environ["NGROK_AUTH_TOKEN"] = getpass("ngrok authentication token: ")

In [2]:
# @title Install dependencies

# These exist just to make llama-cpp-python install without any errors
%pip install uvicorn tiktoken openai cohere unicorn python-multipart kaleido fastapi -q

# For downloading the model from huggingface
%pip install huggingface_hub -q

# For querying framework
%pip install langchain langchainhub -q


# More models in the "Provided Files" section of https://huggingface.co/TheBloke/CodeLlama-7B-GGUF.


# @markdown Choose either the GPU or the CPU installation. In most cases you should choose GPU because it is much faster.
computing_type = "cpu" # @param ["gpu", "cpu"] {type:"string"}
if computing_type == "cpu":
    # CPU INSTALLATION
    %pip install --upgrade --quiet  llama-cpp-python
    !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

    # Reinstall cpu instead
    # !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --upgrade --force-reinstall llama-cpp-python --no-cache-dir

elif computing_type == "gpu":
    # GPU INSTALLATION
    !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python


#ngrok
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null && echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list && sudo apt update && sudo apt install ngrok

# For the framework
%pip install FastAPI pyngrok -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# @title Select the LLM model
# @markdown Select a suitable model:
model_name = "llama-2-7b-chat.Q4_0.gguf" # @param ["llama-2-13b-chat.Q5_K_M.gguf", "llama-2-7b-chat.Q4_0.gguf"] {type:"string"}


# Hugging Face repository that hosts each supported GGUF file.
MODEL_REPOSITORIES = {
    "llama-2-13b-chat.Q5_K_M.gguf": "TheBloke/Llama-2-13B-chat-GGUF",  # 13B model
    "llama-2-7b-chat.Q4_0.gguf": "TheBloke/Llama-2-7b-Chat-GGUF",  # 7B model
}

# Fail here, with a clear message, rather than later in the download cell
# where an undefined repository name is much harder to diagnose.
if model_name not in MODEL_REPOSITORIES:
    raise ValueError(
        f"Unsupported model_name {model_name!r}; "
        f"expected one of {sorted(MODEL_REPOSITORIES)}"
    )
huggingface_repository = MODEL_REPOSITORIES[model_name]

In [4]:
# @title Download the LLM model
!huggingface-cli download {huggingface_repository} {model_name} --local-dir . --local-dir-use-symlinks False

Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
downloading https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_0.gguf to /root/.cache/huggingface/hub/tmpmb17yo5r
llama-2-7b-chat.Q4_0.gguf: 100% 3.83G/3.83G [01:05<00:00, 58.8MB/s]
./llama-2-7b-chat.Q4_0.gguf


Next, we will set up the LLM and the FastAPI service to be hosted on ngrok.

In [5]:
# Langchain LLM libraries
from langchain.callbacks.manager import CallbackManager
from langchain.chains import LLMChain
from langchain.schema.output import LLMResult
from langchain.callbacks.streaming_aiter import AsyncIteratorCallbackHandler
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

In [6]:
# Streams generated tokens to stdout while the model is running.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# The download cell writes the model into the current working directory
# (`--local-dir .`), so resolve the path against it instead of hard-coding
# Colab's "/content/". On Colab's default working directory the result is the same.
model_path = os.path.abspath(model_name)

In [7]:
# Code from https://python.langchain.com/docs/integrations/llms/llamacpp
# Settings shared by the CPU and GPU configurations. `model_path` comes from
# the cell above, so the model location is defined in exactly one place.
llm_kwargs = dict(
    model_path=model_path,
    temperature=0.75,
    max_tokens=2000,
    callback_manager=callback_manager,  # Callbacks support token-wise streaming
    verbose=True,  # Verbose is required to pass to the callback manager
)

if computing_type == "cpu":
    # CPU INSTALLATION
    llm_kwargs.update(
        n_ctx=2048,
        top_p=1,
        streaming=True,
    )
elif computing_type == "gpu":
    n_gpu_layers = -1   # Change this value based on your model and your GPU VRAM pool.
    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

    llm_kwargs.update(
        n_ctx=4096,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
    )
else:
    # Without this, `llm` would silently stay undefined and later cells
    # would fail with an unrelated NameError.
    raise ValueError(f"computing_type must be 'cpu' or 'gpu', got {computing_type!r}")

# Make sure the model path is correct for your system!
llm = LlamaCpp(**llm_kwargs)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /content/llama-2-7b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32         

In [None]:
# @title Create template and test model
test_response = False # @param ["True", "False"] {type:"raw"}
# Prompt that nudges the model towards step-by-step reasoning.
template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate(template=template, input_variables=["question"])

if test_response:
  llm_chain = LLMChain(prompt=prompt, llm=llm)
  question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
  # Run the chain, not the bare LLM, so the question is actually wrapped in
  # the template defined above.
  response = llm_chain.invoke({"question": question})

Note: The `agent` model type is still somewhat broken, most likely because the LLM is too small. It is therefore recommended to select `llm` instead of `agent`.

In [None]:
# @title Select model type for streaming.
# Selects which branch of the API cell below is defined:
#   "llm"   - queries are sent straight to the LlamaCpp model via llm.ainvoke
#   "agent" - queries go through a LangChain agent built with initialize_agent
model_type = "llm" # @param ["agent", "llm"] {type:"string"}


In [None]:
# @title Load the API model into memory.

# Agent code from https://python.langchain.com/docs/modules/agents/how_to/handle_parsing_errors
from enum import Enum
import asyncio
from typing import Annotated, List, Union, Dict, Any
from fastapi import FastAPI, Query, Body, HTTPException
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from pydantic import BaseModel

import datetime
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from contextlib import asynccontextmanager
from langchain.agents import AgentType, initialize_agent, create_structured_chat_agent, AgentExecutor, create_react_agent
from langchain.memory import ConversationBufferWindowMemory
from langchain import hub

app = FastAPI()

# Add CORS
# The API is reachable from any origin. FastAPI documents that wildcard
# origins/methods/headers must not be combined with allow_credentials=True,
# and this API uses neither cookies nor authentication, so credentials are
# disabled. If credentialed requests are ever needed, list the allowed
# origins explicitly instead of using '*'.
origin = ['*']
app.add_middleware(
    CORSMiddleware,
    allow_origins=origin,
    allow_credentials=False,
    allow_methods=['*'],
    allow_headers=['*']
)




# Request body model for POST /query/.
class Message(BaseModel):
    content: str  # the user's query text; get_response forwards it to createGen

if model_type == "llm":
  class AsyncCallbackHandler(AsyncIteratorCallbackHandler):
    """Push every generated token onto the handler's queue for streaming.

    ``content`` accumulates the tokens of the generation in progress and is
    cleared again once the LLM reports that it has finished.
    """

    content: str = ""

    async def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
      """Clear the accumulated text and mark the stream as finished."""
      self.content = ""
      self.done.set()

    async def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
      """Record ``token`` and enqueue it for the consumer."""
      self.content = self.content + token
      self.queue.put_nowait(token)

  async def runCall(query: str, stream_it: AsyncCallbackHandler):
    """Run the LLM on ``query``, routing generated tokens to ``stream_it``.

    Args:
      query: Prompt text sent to the model.
      stream_it: Handler that receives the tokens and signals completion.

    Raises:
      Whatever ``llm.ainvoke`` raises; the stream is still closed first.
    """
    llm.callbacks = [stream_it]
    try:
      await llm.ainvoke(query)
    finally:
      # on_llm_end normally sets this flag, but it is never called when
      # generation fails. Without it the consumer iterating stream_it.aiter()
      # would wait forever and the HTTP response would never finish.
      stream_it.done.set()

  async def createGen(query: str, stream_it: AsyncCallbackHandler):
    """Yield the tokens generated for ``query`` as they become available."""
    # Generation runs as a background task so tokens can be consumed while
    # the model is still producing them.
    producer = asyncio.create_task(runCall(query, stream_it))
    token_stream = stream_it.aiter()
    async for piece in token_stream:
        yield piece
    # Join the producer so an exception raised during generation surfaces here.
    await producer

elif model_type == "agent":
  # Conversation memory handed to the agent below.
  # NOTE(review): k=5 presumably keeps the last five exchanges - confirm
  # against the LangChain ConversationBufferWindowMemory documentation.
  memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    k=5,
    return_messages=True,
    output_key="output"
  )
  # NOTE: this rebinds the module-level `prompt` (the PromptTemplate from the
  # test cell). The pulled prompt is only used by the commented-out
  # create_react_agent / create_structured_chat_agent code below; the active
  # initialize_agent call does not receive it.
  prompt = hub.pull("hwchase17/structured-chat-agent")
  # api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=100)
  # tool = WikipediaQueryRun(api_wrapper=api_wrapper)
  # tools = [get_items, where_cat_is_hiding]
  # agent = create_react_agent(llm, tools, prompt)
  # agent_executor = AgentExecutor(
  #     agent=agent, tools=tools, verbose=True, handle_parsing_errors=True
  # ).with_config({"run_name": "Agent"})

  # tools = ... # ADD TOOLS HERE
  # create_structured_chat_agent()
  # agent = create_structured_chat_agent(llm=llm, tools=tools, prompt=prompt)
  # agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, early_stopping_method="generate",
  #                                memory=memory, return_intermediate_steps=False)

  # TODO: Replace with a non-deprecated API. The code above would be a start as long as you replace tools.
  # The agent is created without any tools (tools=[]) and is limited to
  # three iterations per query.
  agent = initialize_agent(
      agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
      tools=[],
      llm=llm,
      verbose=True,
      max_iterations=3,
      early_stopping_method="generate",
      memory=memory,
      return_intermediate_steps=False,
  )

  class AsyncCallbackHandler(AsyncIteratorCallbackHandler):
    """Streaming handler for the agent that tracks when "Final Answer" appears.

    Every token is forwarded to the queue. ``final_answer`` only controls
    the debug print below and whether ``on_llm_end`` closes the stream.
    """
    content: str = ""  # text accumulated since the last reset
    final_answer: bool = False  # True once "Final Answer" was seen in `content`
    def __init__(self) -> None:
      super().__init__()

    async def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
      """Accumulate ``token``, enqueue it and update the final-answer state."""
      self.content += token
      # NOTE: every token is queued unconditionally, including those produced
      # before the final answer. The filtered variant (queue only the
      # final-answer tokens) is the commented-out put_nowait further down.
      self.queue.put_nowait(token)
      if self.final_answer:
          # After the final-answer marker: once the action_input field has
          # started, print each token except the closing quote/brace.
          if '"action_input": "' in self.content:
              if token not in ['"', "}"]:
                  print(f"Token: {token}")
                  # self.queue.put_nowait(token)
      elif "Final Answer" in self.content:
          # print("----Final answer!!-----")
          # Marker detected: switch state and start accumulating afresh.
          self.final_answer = True
          self.content = ""

    async def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
      """Reset the accumulated text; close the stream only after a final answer."""
      # (Debug prints that dumped `self.content` at the end of each LLM call
      # were previously kept here as commented-out code.)
      if self.final_answer:
            # The final answer has been produced: reset state and signal the
            # consumer that no more tokens will arrive.
            self.content = ""
            self.final_answer = False
            self.done.set()
      else:
          # LLM call without a final answer: the stream is left open.
          self.content = ""

  async def runCall(query: str, stream_it: AsyncCallbackHandler):
    """Run the agent on ``query``, routing generated tokens to ``stream_it``.

    Args:
      query: User input passed to the agent as ``input``.
      stream_it: Handler that receives the tokens and signals completion.

    Raises:
      Whatever ``agent.acall`` raises; the stream is still closed first.
    """
    agent.agent.llm_chain.llm.callbacks = [stream_it]
    try:
      # await agent_executor.acall(inputs={"input": query})
      await agent.acall(inputs={"input": query})
    finally:
      # The handler only sets `done` after it has detected "Final Answer".
      # If the agent stops without one (for example at max_iterations) or
      # raises, the consumer iterating stream_it.aiter() would otherwise
      # wait forever and the HTTP response would never finish.
      stream_it.done.set()

  async def createGen(query: str, stream_it: AsyncCallbackHandler):
    """Yield the tokens the agent generates for ``query`` as they arrive."""
    # The agent runs as a background task while this generator drains
    # the handler's token iterator.
    agent_run = asyncio.create_task(runCall(query, stream_it))
    incoming = stream_it.aiter()
    async for chunk in incoming:
        yield chunk
    # Join the background task so an exception from the agent run surfaces here.
    await agent_run




@app.post("/query/")
async def get_response(time: datetime.datetime,
                      query: Message = ...):
  # Streams the generated tokens for the posted message back to the client.
  # `time` is a required query parameter that is accepted but not used here.
  # A fresh callback handler is created for every request.
  return StreamingResponse(
      createGen(query.content, AsyncCallbackHandler()),
      media_type="text/event-stream",
  )


@app.get("/health")
async def get_health():
  # Liveness probe.
  # NOTE(review): the payload is a set literal, not a dict - presumably it is
  # serialised as a one-element JSON array; confirm this is what clients expect.
  alive_payload = {"Still here :)"}
  return alive_payload

@asynccontextmanager
async def lifespan(app: "FastAPI"):
    """Lifespan context manager that reports the app's port at startup.

    Args:
        app: The application instance. FastAPI itself has no ``port``
            attribute, so ``None`` is reported unless one has been set on it.

    Yields:
        Nothing; control is handed back to the caller while the app runs.
    """
    # getattr with a default avoids an AttributeError on a plain FastAPI app.
    port = getattr(app, "port", None)
    print("The port used for this app is", port)
    # asynccontextmanager requires a generator function; without this yield
    # the decorated function cannot be entered with `async with`.
    yield

In [None]:
# Put your ngrok authentication token here https://dashboard.ngrok.com/get-started/your-authtoken
!ngrok config add-authtoken {os.environ["NGROK_AUTH_TOKEN"]}

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import nest_asyncio
from pyngrok import ngrok
# import ngrok
import uvicorn

# Port shared by the ngrok tunnel and the uvicorn server.
PORT = 8000

ngrok_tunnel = ngrok.connect(PORT)
print('Public URL:', ngrok_tunnel.public_url)
# Allow uvicorn to start an event loop inside the notebook's running loop.
nest_asyncio.apply()
try:
    uvicorn.run(app, port=PORT)
finally:
    # Stop the ngrok process when the server exits (including on interrupt),
    # so no stale tunnel is left behind for the next run.
    ngrok.kill()


INFO:     Started server process [5954]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://fbbd-34-106-109-56.ngrok-free.app
INFO:     185.76.9.58:0 - "POST /query/?time=2024-02-24T16%3A33%3A17.093021 HTTP/1.1" 200 OK



llama_print_timings:        load time =    5462.46 ms
llama_print_timings:      sample time =     139.11 ms /   206 runs   (    0.68 ms per token,  1480.84 tokens per second)
llama_print_timings: prompt eval time =    5462.36 ms /     7 tokens (  780.34 ms per token,     1.28 tokens per second)
llama_print_timings:        eval time =  170568.18 ms /   205 runs   (  832.04 ms per token,     1.20 tokens per second)
llama_print_timings:       total time =  177129.07 ms /   212 tokens


In [None]:
# Use this to kill rogue ngrok instances
# !killall ngrok