This Colab notebook demonstrates how to serve the model through API endpoints: user input is passed to an endpoint as a request, and the response it returns can be rendered on a frontend.

The methods herein make the following assumptions:


*   You have successfully mounted Google Drive for use in Google Colab.
*   You have successfully cloned the repository into a folder named 'ChatDoctorProject', which is located in Google Drive.
After successfully cloning the repository, the file structure should appear as: '/content/drive/MyDrive/ChatDoctorProject/ChatDoctor/'
*   You have successfully downloaded the model files into a folder named 'pretrained', whose parent folder is 'ChatDoctor'. The model files will therefore be contained in '/content/drive/MyDrive/ChatDoctorProject/ChatDoctor/pretrained/'

In [None]:
# Mount Google Drive at /content/drive; the notebook expects the cloned repository and model files to live there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
""" Install required dependencies """
!pip install fastapi uvicorn nest_asyncio safetensors pyngrok transformers

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.1-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.4-py3-none-any.whl.metadata (8.7 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.1-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.4-py3-none-any.whl (23 kB)
Downloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, pyngrok, s

In [None]:
""" cd into the ChatDoctor directory """
# Later cells use relative paths ("./pretrained/", "cache"), so the working directory must be the cloned ChatDoctor folder.
%cd /content/drive/MyDrive/ChatDoctorProject/ChatDoctor

/content/drive/MyDrive/ChatDoctorProject/ChatDoctor


In [None]:
""" Create a function that loads the model for use in consequent cells """

import transformers
import torch
# Module-level handles filled in by load_model(); they stay None until it has run.
model = tokenizer = generator = None

# Create a function to load the model
def load_model(model_name="./pretrained/", eight_bit=0, device_map="auto"):
    """Load the LLaMA tokenizer and model and publish them as module globals.

    Args:
        model_name: Path or identifier handed to ``from_pretrained`` for both
            the tokenizer and the model.
        eight_bit: Accepted but not used by this function; the model is
            always loaded with ``load_in_8bit=False``.
        device_map: Only remapped locally ("zero" becomes "balanced_low_0");
            the value is not passed on to ``from_pretrained``.

    Side effects:
        Rebinds the globals ``tokenizer``, ``model`` (loaded with
        ``torch.float16`` and moved to the GPU via ``.cuda()``) and
        ``generator`` (the bound ``model.generate`` method).
    """
    global model, tokenizer, generator

    # Alias translation; the resulting value is not consumed further down.
    device_map = "balanced_low_0" if device_map == "zero" else device_map

    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)

    load_options = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": True,
        "load_in_8bit": False,
        "cache_dir": "cache",
    }
    loaded_model = transformers.LlamaForCausalLM.from_pretrained(model_name, **load_options)
    model = loaded_model.cuda()

    generator = model.generate

# Load the model once when this cell runs; this sets the module-level model, tokenizer and generator.
load_model()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
""" Defines a function that takes in a users prompt, generates and returns a response """

# Instruction preamble that chat() places in front of the conversation on every request.
system_prompt = "".join([
    "You are a highly knowledgeable medical assistant who exclusively provides natural remedy advice ",
    "using botanical and herbal treatments. You NEVER suggest conventional pharmaceutical treatments. ",
    "Answer the user's questions with natural remedies only.\n\n",
])

# Opening assistant turn; it restates the natural-remedy focus before any patient message.
First_chat = "ChatDoctor: I am ChatDoctor. I only provide natural remedy medical suggestions. What natural remedy medical questions do you have?"

# Running conversation transcript, seeded with the greeting above.
history = [First_chat]
async def chat(prompt:str)->str:
  """Generate ChatDoctor's reply to one patient message.

  The patient turn is appended to the module-level ``history``, the full
  prompt (system prompt + all turns so far + "ChatDoctor: ") is sent to the
  model, and the cleaned reply is appended to ``history`` and returned.

  Args:
    prompt: The patient's message.

  Returns:
    The model's reply with surrounding whitespace removed, cut off at the
    first "Patient: " marker in case the model started writing the next turn.
  """
  invitation = "ChatDoctor: "
  human_invitation = "Patient: "

  # NOTE(review): history is shared by every caller and never trimmed, so the
  # prompt grows with each request.
  history.append(human_invitation + prompt)

  fulltext = system_prompt + "\n\n".join(history) + "\n\n" + invitation

  gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.cuda()
  # The ids carry a leading batch dimension, so the prompt's token count is
  # shape[1] (len(gen_in) would give the batch size instead).
  prompt_token_count = gen_in.shape[1]

  with torch.no_grad():
    # early_stopping is not passed: it only affects beam search, and this
    # call samples a single sequence.
    generated_ids = generator(
        gen_in,
        max_new_tokens=200,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
        do_sample=True,
        repetition_penalty=1.1, # 1.0 means 'off'
        temperature=0.5, # default: 1.0
        top_k=50, # default: 50
        top_p=1.0, # default: 1.0
    )

  # Decode only the tokens produced after the prompt. Slicing the decoded text
  # by len(fulltext) is fragile because decoding is not guaranteed to
  # reproduce the prompt character-for-character.
  new_token_ids = generated_ids[0][prompt_token_count:]
  response = tokenizer.decode(new_token_ids, skip_special_tokens=True)

  # Drop anything the model wrote on the patient's behalf, then trim. The
  # strip() result must be assigned, since str.strip() returns a new string.
  response = response.split(human_invitation)[0].strip()

  history.append(invitation + response)
  return response

In [None]:
""" Create a FastAPI application """
# Create the FastAPI application instance that the endpoint decorators in the following cell register against
from fastapi import FastAPI

app=FastAPI()

In [None]:
""" Create the required endpoints """

@app.get('/')  # root route: static landing payload
def home():
  """Print a marker to stdout and return the fixed homepage payload."""
  print("Homepage")
  homepage_payload = {"detail":"This is the ChatDoctor homepage.."}
  return homepage_payload

@app.post('/ask') # endpoint used to pass a prompt and return the LLM's response
async def ask(data:dict):
  """
  Answer one patient prompt.

  Expects a JSON object body of the form {"prompt": "<text>"}, awaits
  chat() with that text and returns the reply as {"detail": "<reply>"}.

  Raises:
    HTTPException: status 422 when "prompt" is missing or is not a string.
  """
  # Imported here so the endpoint does not depend on an import made in another cell.
  from fastapi import HTTPException

  prompt = data.get("prompt")
  if not isinstance(prompt, str):
    # Without this check a malformed body surfaced as an unhandled KeyError (HTTP 500).
    raise HTTPException(
      status_code=422,
      detail='Request body must be a JSON object with a string "prompt" field.',
    )

  response=await chat(prompt=prompt)
  return {"detail":response}

""" Create more endpoints as required """

' Create more endpoints as required '

In [8]:
""" Run the app to access the endpoints on any device using ngrok """

import os
from getpass import getpass

from pyngrok import ngrok
import uvicorn
import nest_asyncio

# Port shared by the local uvicorn server and the ngrok tunnel.
PORT = 8000

# Never store the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# Any token that was previously saved in this notebook should be revoked
# in the ngrok dashboard.
authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not authtoken:
    raise ValueError("An ngrok authtoken is required to open the tunnel.")
ngrok.set_auth_token(authtoken)

# Allow running uvicorn in Jupyter
nest_asyncio.apply()

# Setup a tunnel to the app
public_url = ngrok.connect(PORT)
print(public_url) #use this url on any program(frontend-application, postman etc..) to access the apis

try:
    # run the app using uvicorn (blocks until the server is stopped)
    uvicorn.run(app=app, port=PORT)
finally:
    # Close the tunnel so re-running this cell does not leave a stale tunnel open.
    ngrok.disconnect(public_url.public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


INFO:     Started server process [363]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


NgrokTunnel: "https://c718-34-87-79-119.ngrok-free.app" -> "http://localhost:8000"




INFO:     2001:16a2:f471:e800:b861:500e:1dfa:e7fc:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     2001:16a2:f471:e800:b861:500e:1dfa:e7fc:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     2001:16a2:f471:e800:b861:500e:1dfa:e7fc:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     2001:16a2:f471:e800:b861:500e:1dfa:e7fc:0 - "POST /ask HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [363]
