## Ollama on Colab

Notebook construído com base na referência:

https://medium.com/@neohob/run-ollama-locally-using-google-colabs-free-gpu-49543e0def31

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh
!pip -q install aiohttp python-dotenv ngrok

import asyncio
import getpass
import os
import urllib.error
import urllib.request

import ipywidgets as widgets
import ngrok
from dotenv import load_dotenv
from google.colab import userdata
from IPython.display import display, clear_output

In [None]:
# Optional: uncomment the line below to run nvidia-smi and inspect the GPU (if any) of this runtime
# !nvidia-smi

In [None]:
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc \
	| sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null \
	&& echo "deb https://ngrok-agent.s3.amazonaws.com buster main" \
	| sudo tee /etc/apt/sources.list.d/ngrok.list \
	&& sudo apt update \
	&& sudo apt install ngrok

# print("Proceed to cuda drivers installation, this may take a while...")
# !echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
# !sudo apt-get update && sudo apt-get install -y cuda-drivers

clear_output()
print("Setup done!")


In [None]:
# Verify ngrok installation: `which` prints the path of the ngrok executable when it is on PATH
!which ngrok


Se houver um arquivo .env com as configurações, elas serão lidas dele; caso contrário, os valores deverão ser informados manualmente.

In [None]:
# Load environment variables from a .env file, if one is present
load_dotenv()

# Prefer the token provided through the NGROK_AUTH_TOKEN environment variable
ngrok_auth_token = os.getenv('NGROK_AUTH_TOKEN')

# Otherwise ask for it with a hidden prompt: unlike input(), getpass does not echo
# the secret, so the token does not end up stored in the notebook's cell output.
if not ngrok_auth_token:
    # strip() drops whitespace/newlines that often come along when pasting a token
    ngrok_auth_token = getpass.getpass("Enter your ngrok auth token: ").strip()

In [None]:
# Configure ngrok with the auth token:
# the first line hands it to the ngrok CLI, the second to the ngrok Python package
!ngrok config add-authtoken {ngrok_auth_token}
ngrok.set_auth_token(ngrok_auth_token)

In [None]:
import os
import subprocess
import time

# Point LD_LIBRARY_PATH at /usr/lib64-nvidia before launching the server
# (presumably so Ollama can locate the system NVIDIA libraries - confirm on the target runtime)
os.environ['LD_LIBRARY_PATH'] = '/usr/lib64-nvidia'

# Launch `ollama serve` as a background child process and keep its handle
serve_command = ['ollama', 'serve']
ollama_process = subprocess.Popen(serve_command)

# Report that the launch was issued (this does not verify the server is ready)
print(" > ollama process started")


O modelo a ser utilizado:

In [None]:
# Name of the Ollama model that the following cells pull and run
llm_model = "llama3"

In [None]:
print("starting the download of the model, this may take a while...")
time.sleep(10) # wait 10s to give the process a time to start
!ollama pull {llm_model}

clear_output()
print(f"{llm_model} model downloaded")

In [None]:
# Models currently available through the local Ollama server
!ollama list

Creating an endpoint to use the Ollama API from outside Colab

In [None]:
# Take the ngrok domain from the NGROK_DOMAIN environment variable; when it is
# unset or empty, prompt for it instead (an empty answer means "no domain").
ngrok_domain = os.getenv('NGROK_DOMAIN') or input("Enter your ngrok domain (empty for none): ")

In [None]:
if ngrok_domain and ngrok_domain != "":
  listener = ngrok.forward(11434,
    logs="stdout",
    domain=ngrok_domain,
    authtoken_from_env=True,
    request_header_add="host:localhost",
    )
else:
  listener = ngrok.connect(11434,
                           logs="stdout",
                           authtoken_from_env=True,
                           request_header_add="host:localhost",
                           )

ngrok_connection = await listener
url = ngrok_connection.url()
print(f"Connection URL: {url}")

Starting the model

In [None]:
# Launch `ollama run <model>` as a background child process; the handle is kept in llm_process
llm_process = subprocess.Popen(['ollama', 'run', llm_model])

In [None]:
# Request the local Ollama endpoint (port 11434) to check that it responds
!curl http://localhost:11434

In [None]:
# Block until the user presses Enter, then close the ngrok connection
input("Finalizar conexão ngrok")
await ngrok_connection.close()