In [None]:
# FULL SETUP: L3-8B-Stheno-v3.2 (Q5_K_M) on Colab T4 + ngrok
# → Make sure you selected **T4 GPU** under Runtime → Change runtime type
# → Make sure you added the required secrets in Google Colab's Secrets menu.
# → RESERVED_DOMAIN = your ngrok reserved domain, hostname only (should be something like certain-example-name.ngrok-free.app)
# → NGROK_TOKEN = your ngrok token that you get from ngrok dashboard
# If NGROK_TOKEN doesn't work, try NGROK_AUTHTOKEN
# CELL 1 — Verify GPU (should show Tesla T4 + CUDA 12.4)
# Runs `nvidia-smi` in a shell through IPython's `!` escape and shows its report.
# The CUDA version is relevant because CELL 2 installs llama-cpp-python from the
# cu124 wheel index.
# NOTE(review): if the command is missing or lists no device, the runtime
# presumably has no GPU attached — re-check Runtime → Change runtime type.
!nvidia-smi

In [None]:
# CELL 2 — Install everything (llama-cpp-python CUDA + server + huggingface + ngrok)
!pip install --no-cache-dir \
    llama-cpp-python[server] \
    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

!pip install huggingface_hub[hf_transfer] pyngrok

In [None]:
# CELL 3 — Download the GGUF file (L3-8B-Stheno-v3.2-Q5_K_M-imat.gguf)
import os

# This flag must be set BEFORE huggingface_hub is imported: the library reads it
# once at import time, so setting it afterwards silently leaves hf_transfer off.
# If huggingface_hub was already imported in this kernel, restart the runtime
# for the flag to take effect.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download  # noqa: E402 — deliberately after the env var

repo_id = "Lewdiculous/L3-8B-Stheno-v3.2-GGUF-IQ-Imatrix"

# Choose one of these (recommended order):
#   Q5_K_M-imat  → best quality/size balance  (~5.3 GB, ~28–32 t/s on T4)
#   Q5_K_S-imat  → slightly smaller/faster, tiny quality drop
#   Q6_K-imat    → noticeably better but ~6.1 GB
#   Q8_0-imat    → reference quality but ~8.5 GB (still fits T4)

chosen_file = "L3-8B-Stheno-v3.2-Q5_K_M-imat.gguf"   # ← change here if you want another

# Fetches the file into /content/models and returns the local path;
# `model_path` is consumed by CELL 4 when launching the server.
model_path = hf_hub_download(
    repo_id=repo_id,
    filename=chosen_file,
    local_dir="/content/models",
)

print(f"Model successfully downloaded: {chosen_file}")
print(f"Path: {model_path}")
print(f"Size: {os.path.getsize(model_path)/1e9:.2f} GB")

In [None]:
# CELL 4 — Start the OpenAI-compatible server in the background
import subprocess
import time
import os

# Full GPU offload (-1 = all layers)
server_cmd = [
    "python", "-m", "llama_cpp.server",
    "--model", model_path,
    "--n_gpu_layers", "-1",
    "--n_ctx", "8192", # Change context size based on your LLM Choice.
    "--host", "0.0.0.0",
    "--port", "8000",
    "--n_batch", "512",
    "--verbose", "true" # Changed to true for debugging output
]

# Run server in background
# We'll capture its stdout/stderr to files for better debugging if it crashes.
server_log_file = "/tmp/llama_cpp_server.log"
with open(server_log_file, "w") as log_file:
    process = subprocess.Popen(server_cmd, stdout=log_file, stderr=log_file)

# Wait for the server to start, with retries
max_retries = 15 # Increased retries for larger models
retry_delay = 5 # seconds between retries
server_ready = False

print("Waiting for llama-cpp-python server to start...")
for i in range(max_retries):
    print(f"Attempt {i+1}/{max_retries} to connect to server...")
    try:
        # Use subprocess.run to capture output of curl to check server status
        # `check=False` allows curl to fail without raising an exception immediately
        curl_check = subprocess.run(
            ["curl", "-s", "http://0.0.0.0:8000/v1/models"],
            capture_output=True, text=True, check=False
        )
        # Check if the curl command was successful and returned expected content
        if curl_check.returncode == 0 and "object" in curl_check.stdout:
            print("Server is up and running!")
            server_ready = True
            break
        else:
            print(f"Server not yet ready. Curl exit code: {curl_check.returncode}. Output (truncated): {curl_check.stderr.strip()[:200] or curl_check.stdout.strip()[:200]}")
            time.sleep(retry_delay)
    except Exception as e:
        print(f"Error during curl check: {e}")
        time.sleep(retry_delay)

if not server_ready:
    print("\nERROR: Server failed to start within the expected time.")
    print(f"Please check the server logs in {server_log_file} for more details.")
    # Attempt to terminate the process if it's still running
    if process.poll() is None: # If process is still running
        process.terminate()
        print("Background server process terminated.")
else:
    print("\nServer running on http://localhost:8000")
    # Final sanity check with curl for user confirmation
    !curl http://0.0.0.0:8000/v1/models


In [None]:
# CELL 4.5 - If CELL 4 wasn't able to run, check the logs by running this cell
import os

# Same path CELL 4 redirects the server's stdout/stderr to.
server_log_file = "/tmp/llama_cpp_server.log"

if not os.path.exists(server_log_file):
    # Nothing to show: the log file has not been created.
    print(f"Server log file not found at {server_log_file}")
else:
    # Dump the whole log between two marker lines so it is easy to spot in the output.
    print(f"--- Contents of {server_log_file} ---")
    with open(server_log_file, "r") as log_handle:
        log_text = log_handle.read()
        print(log_text)
    print(f"--- End of {server_log_file} ---")

In [None]:
# CELL 5 — Expose with your RESERVED ngrok domain (always the same URL)
from google.colab import userdata
from pyngrok import ngrok
import time

# Your ngrok auth token (read from the Colab secret named NGROK_TOKEN)
ngrok.set_auth_token(userdata.get('NGROK_TOKEN'))

# Your ngrok reserved domain (read from the Colab secret named RESERVED_DOMAIN)
RESERVED_DOMAIN = userdata.get('RESERVED_DOMAIN')

# Kill any old tunnels on port 8000 just in case.
# ngrok typically reports the forwarding target with a scheme
# (e.g. "http://localhost:8000"), so match on the port suffix rather than
# comparing against a bare "localhost:8000", which would never be equal.
for old_tunnel in ngrok.get_tunnels():
    if str(old_tunnel.config.get("addr", "")).endswith(":8000"):
        ngrok.disconnect(old_tunnel.public_url)

# Open the tunnel with your exact reserved domain
tunnel = ngrok.connect(
    addr="8000",
    proto="http",
    bind_tls=True,
    domain=RESERVED_DOMAIN          # ← this locks it to your domain
)

public_url = tunnel.public_url
print("\nYOUR PUBLIC ENDPOINT (OpenAI compatible):")
print(public_url)
print("\nExample curl:")
# Only the first literal is an f-string; the adjacent literals are plain strings,
# so the JSON braces must be written singly (doubling them would print "{{").
print(f'curl {public_url}/v1/chat/completions \\\n'
      '  -H "Content-Type: application/json" \\\n'
      '  -d \'{"model":"llama","messages":[{"role":"user","content":"Hello Stheno!"}],"temperature":0.8}\'')

print("\nServer is live at your permanent URL above!")
print("You can close the notebook — the tunnel stays alive as long as this cell keeps running.\n")

# Keep the cell alive forever
while True:
    time.sleep(60)