<a href="https://colab.research.google.com/github/SARA3SAEED/abu-LLM/blob/main/0_abu_part_01_llm_proxy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install + Run Ollama Server

In [None]:
!curl https://ollama.ai/install.sh | sh

In [None]:
# Ollama model tag that the cells below pull, run and query.
ollama_model_id = "qwen2:7b-instruct-q4_K_M"

In [None]:
# Start the Ollama server in the background; nohup appends its output to nohup.out.
!nohup ollama serve &
# Wait 8 seconds, then show the last lines of nohup.out.
!sleep 8 && tail nohup.out

In [None]:
# Pull the model; IPython substitutes {ollama_model_id} with the Python variable's value.
!ollama pull {ollama_model_id}

In [None]:
# Launch `ollama run` for the model in the background (output is appended to nohup.out).
# NOTE(review): presumably this is only meant to preload the model into the server — confirm.
!nohup ollama run {ollama_model_id} &
# Wait 8 seconds, then show the last lines of nohup.out.
!sleep 8 && tail nohup.out

### Test Ollama

In [None]:
!pip install ollama==0.3.2

In [None]:
import ollama
import requests
import json
from pprint import pprint

#### Using Python Requests

In [None]:
# Call Ollama's REST API directly: one non-streaming generation request.
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": ollama_model_id,
        "prompt": "Say Hi in French",
        "stream": False,
    },
    # requests has no default timeout and would block forever on a hung server.
    # The limit is generous because the first generation may include model loading.
    timeout=600,
)

if resp.status_code != 200:
    # resp.text is the decoded body; str(resp.content) would embed a bytes repr (b'...').
    raise Exception(f"Non-200 response ({resp.status_code}): {resp.text}")

pprint(resp.json())

#### Using Ollama-Python

In [None]:
# Single-turn chat through the ollama Python client (the prompt is in Arabic).
user_message = {
    "role": "user",
    "content": "كيف يمكنني السؤال عن أقرب متجر باللغة الفرنسية؟",
}
response = ollama.chat(model=ollama_model_id, messages=[user_message])

# Show only the reply text of the returned message.
reply_text = response["message"]["content"]
pprint(reply_text)

In [None]:
import ollama

# Same chat call with stream=True: the result is iterated chunk by chunk.
stream = ollama.chat(
    model=ollama_model_id,
    stream=True,
    messages=[
        {
            "role": "user",
            "content": "اذكر خمس أنواع فواكه تحتوي على نسب عالية من فيتامين ج؟",
        },
    ],
)

# Print each partial piece as it arrives, with no newline between chunks.
for chunk in stream:
    piece = chunk["message"]["content"]
    print(piece, end="", flush=True)

## LLM Proxy

In [None]:
# ============ list any running litellm processes
# !pgrep -fl litellm

# ============ kill any running litellm processes (e.g. before restarting the proxy)
# !pkill -f litellm

In [None]:
!pip install 'litellm[proxy]'==1.44.9 openai==1.42.0

In [None]:
import os
from google.colab import userdata
# Copy the 'GROQ_API_KEY' value from Colab's userdata into the process environment;
# the LiteLLM configs in this notebook refer to it as os.environ/GROQ_API_KEY.
os.environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY')

In [None]:
%%writefile llm.config
# LiteLLM proxy configuration.
# Each entry maps a model_name alias (what clients pass as `model`) to the
# litellm_params of one backend deployment.
model_list:
  # Local model served by Ollama on this machine.
  - model_name: "qwen2:7b-ollama"
    litellm_params:
      model: "ollama/qwen2:7b-instruct-q4_K_M"
      api_base: http://localhost:11434

  # Groq-hosted models. "os.environ/GROQ_API_KEY" is LiteLLM's syntax for
  # reading the key from that environment variable (per the LiteLLM config docs).
  - model_name: "groq-gemma9b"
    litellm_params:
      model: "groq/gemma2-9b-it"
      api_key: "os.environ/GROQ_API_KEY"

  - model_name: "groq-mixtral"
    litellm_params:
      model: "groq/mixtral-8x7b-32768"
      api_key: "os.environ/GROQ_API_KEY"

In [None]:
!nohup litellm --config llm.config &
!sleep 8 && tail nohup.out

In [None]:
import openai
from pprint import pprint

# OpenAI-compatible client pointed at the local LiteLLM proxy.
# 0.0.0.0 is a bind address, not a portable destination, so connect via loopback.
client = openai.OpenAI(
    api_key="anything",  # placeholder value
    base_url="http://127.0.0.1:4000",
)

# `model` is a model_name alias defined in the proxy's config file (llm.config),
# not a provider model id; the proxy maps it to the configured backend.
response = client.chat.completions.create(
    model="groq-gemma9b",
    messages=[
        {
            "role": "user",
            "content": "لماذا تبدو السماء زرقاء بالنهار؟",
        }
    ],
    temperature=0.5,
    max_tokens=512,
)

# Print the model reported in the response, then the first choice's text.
if response and response.choices:
    print("model:", response.model)
    pprint(response.choices[0].message.content)


In [None]:
import openai
from pprint import pprint

# OpenAI-compatible client pointed at the local LiteLLM proxy.
# 0.0.0.0 is a bind address, not a portable destination, so connect via loopback.
client = openai.OpenAI(
    api_key="anything",  # placeholder value
    base_url="http://127.0.0.1:4000",
)

# `model` is a model_name alias defined in the proxy's config file (llm.config),
# not a provider model id. No sampling parameters are passed in this variant.
response = client.chat.completions.create(
    model="groq-gemma9b",
    messages=[
        {
            "role": "user",
            "content": "لماذا تبدو السماء زرقاء بالنهار؟",
        }
    ],
)

# Print the model reported in the response, then the first choice's text.
if response and response.choices:
    print("model:", response.model)
    pprint(response.choices[0].message.content)


## LiteLLM Logs

In [None]:
import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm import completion, acompletion
import os
import json

# Directory that receives the JSON-lines call log; created if it does not exist.
logs_dir = "./llm-logs"
os.makedirs(logs_dir, exist_ok=True)


def log_post_api_call(kwargs, completion_response, start_time, end_time):
    """Append one JSON line describing a finished LLM call to the call log.

    Args:
        kwargs: Keyword arguments that were passed to the completion call.
        completion_response: Response object returned by the completion call.
        start_time: Start time of the call.
        end_time: End time of the call.

    The record is appended to ``<logs_dir>/post-llm-call.jsonl``. Values that
    are not JSON-serializable are stored as their ``str()`` representation.
    """
    record = {
        "kwargs": kwargs,
        "completion_response": completion_response,
        "start_time": start_time,
        "end_time": end_time,
    }
    line = json.dumps(record, default=str, ensure_ascii=False) + "\n"

    # ensure_ascii=False writes non-ASCII text (e.g. Arabic prompts) verbatim, so
    # the file must be opened as UTF-8 instead of the platform default encoding.
    log_path = os.path.join(logs_dir, "post-llm-call.jsonl")
    with open(log_path, "a", encoding="utf-8") as dest:
        dest.write(line)

# Make log_post_api_call the only LiteLLM success callback.
litellm.success_callback = [log_post_api_call]

# Call the Groq model directly through the litellm SDK, without streaming.
response = completion(
    model="groq/gemma2-9b-it",
    messages=[
        {"role": "user", "content": "لماذا تبدو السماء زرقاء بالنهار؟"},
    ],
    stream=False,
)

# Print the model reported in the response, then the first choice's text.
if response and response.choices:
    print("model:", response.model)
    pprint(response.choices[0].message.content)

# =========== when completion() is called with stream=True, iterate over the chunks instead:
# for chunk in response:
#     print(chunk['choices'][0]['delta'].content, end='', flush=True)
#     continue

### LiteLLM Load Balancer

In [None]:
%%writefile llm-lb.config
# LiteLLM proxy configuration for load balancing.
# All three deployments share the model_name "myapp-llm", so requests for that
# alias are distributed across them; rpm is each deployment's requests-per-minute value.
model_list:
  - model_name: "myapp-llm"
    litellm_params:
      model: "ollama/qwen2:7b-instruct-q4_K_M"
      api_base: http://localhost:11434
      rpm: 2

  - model_name: "myapp-llm"
    litellm_params:
      model: "groq/gemma2-9b-it"
      api_key: "os.environ/GROQ_API_KEY"
      rpm: 5

  - model_name: "myapp-llm"
    litellm_params:
      model: "groq/mixtral-8x7b-32768"
      api_key: "os.environ/GROQ_API_KEY"
      rpm: 5

# Router options belong under `router_settings`; a top-level `routing_strategy`
# key is not one of the config sections the proxy reads.
router_settings:
  routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]

Writing llm-lb.config


In [None]:
!nohup litellm --config llm-lb.config &
!sleep 8 && tail nohup.out

nohup: appending output to 'nohup.out'
[GIN] 2024/08/29 - 04:28:35 | 200 |   6.54177999s |       127.0.0.1 | POST     "/api/generate"
INFO:     127.0.0.1:37062 - "POST /chat/completions HTTP/1.1" 200 OK
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [9826]
INFO:     Started server process [11726]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:4000 (Press CTRL+C to quit)


In [None]:
import openai
from pprint import pprint

# OpenAI-compatible client pointed at the local LiteLLM proxy.
# 0.0.0.0 is a bind address, not a portable destination, so connect via loopback.
client = openai.OpenAI(
    api_key="anything",  # placeholder value
    base_url="http://127.0.0.1:4000",
)

# "myapp-llm" is the model_name alias shared by all deployments in llm-lb.config;
# the proxy chooses which deployment serves each request.
response = client.chat.completions.create(
    model="myapp-llm",
    messages=[
        {
            "role": "user",
            "content": "لماذا تبدو السماء زرقاء بالنهار؟",
        }
    ],
)

# response.model shows which underlying deployment answered this request.
if response and response.choices:
    print("model:", response.model)
    pprint(response.choices[0].message.content)


model: ollama/qwen2:7b-instruct-q4_K_M
('السماء تبدو زرقاء بسبب عملية تناثر الضوء في الغلاف الجوي لل зم. عندما يدخل '
 'الأشعة الشمسية الكرة ، تنكسر جزء من الضوء البنفسجي واليابس بشكل أكبر من '
 'الألوان الأخرى بسبب بقائها الطويل في الغلاف الجوي. لكن الفعل الأكثر أهمية هو '
 'قدرة الغازات مثل الأوكسجين والأوزون والنيتروجين على تنثر الضوء بكفاءة في '
 'جميع الاتجاهات. هذا التشتت يضيف اللون الزرقاء إلى كل من الأشعة تحت الحمراء '
 'وأجزاء أخرى من الطيف، مما يجعل السماء تبدو باللون الزرقاء لمعظم الوقت.')


### Expose LiteLLM Port

In [None]:
!pip install pyngrok==7.2.0

In [None]:
from google.colab import userdata
from pyngrok import ngrok, conf

# Authenticate pyngrok with the token stored under the Colab userdata name 'ngrok'
# (the authtoken can be copied from https://dashboard.ngrok.com/auth).
conf.get_default().auth_token = userdata.get('ngrok')

# Local port to publish; the clients in this notebook reach the LiteLLM proxy on it.
port = "4000"

# NOTE(review): clients above use api_key="anything", so the proxy behind this
# public URL looks unauthenticated — confirm before sharing the URL.
tunnel = ngrok.connect(port)
public_url = tunnel.public_url
print(public_url)

https://aed5-34-124-211-231.ngrok-free.app


### Run Colab Terminal

In [None]:
!pip install google-colab-shell==0.2

In [None]:
from google_colab_shell import getshell

In [None]:
# Open a shell in the cell output (height presumably in pixels — confirm).
getshell(height=600)

In [None]:
# Open another shell in the cell output, with a smaller height (presumably pixels — confirm).
getshell(height=400)