# Local OpenAI API Demo (CPU)

This notebook mirrors the original Kaggle prototype, but everything runs on a local CPU-only machine. You will:

1. Install dependencies.
2. Download a tiny Hugging Face model.
3. Launch the OpenAI-compatible Flask server from `scripts/server.py`.
4. (Optional) Expose the API through ngrok (set `NGROK_AUTHTOKEN`; without it the local endpoint is used).
5. Send chat completion requests just like the original workflow.


In [None]:
%%bash
pip install -q -r requirements.txt

In [None]:
import os
from pathlib import Path

from huggingface_hub import login, snapshot_download

# Model selection: both values can be overridden through environment variables.
MODEL_REPO = os.environ.get("MODEL_REPO", "sshleifer/tiny-gpt2")
# Default cache location is models/<repo id with "/" replaced by "_">.
MODEL_CACHE_DIR = Path(
    os.environ.get(
        "MODEL_CACHE_DIR",
        Path("models", MODEL_REPO.replace("/", "_")),
    )
)
MODEL_CACHE_DIR.mkdir(exist_ok=True, parents=True)

# Authenticate only when a token is supplied via HF_TOKEN.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    print("No HF_TOKEN found. Proceeding without authentication.")
else:
    print("Authenticating with Hugging Face token...")
    login(token=hf_token)

print(f"Using model repo: {MODEL_REPO}")
print(f"Model cache dir: {MODEL_CACHE_DIR}")

# Fetch the repository snapshot into MODEL_CACHE_DIR (symlinks disabled).
snapshot_download(
    token=hf_token,
    repo_id=MODEL_REPO,
    local_dir_use_symlinks=False,
    local_dir=str(MODEL_CACHE_DIR),
)


In [None]:
import subprocess
import sys
import time
import requests

# Launch scripts/server.py as a child process and wait until /health answers 200.
#
# Defines: SERVER_LOG, PORT, BASE_URL, SERVER_PROC (the running subprocess.Popen).
# Raises:  RuntimeError if the server exits early or is not ready after 60 polls.
SERVER_LOG = Path("server.log")
PORT = int(os.getenv("PORT", "8000"))
BASE_URL = f"http://127.0.0.1:{PORT}"

# Reuse the existing process only if it is still alive; a SERVER_PROC left over
# from a crashed server must not be reported as "already running".
if 'SERVER_PROC' in globals() and SERVER_PROC.poll() is None:
    print("Server already running.")
else:
    env = os.environ.copy()
    env["MODEL_REPO"] = MODEL_REPO
    env["MODEL_CACHE_DIR"] = str(MODEL_CACHE_DIR)
    env["PORT"] = str(PORT)
    # The child gets its own copy of the log descriptor, so the parent's handle
    # can be closed as soon as Popen returns instead of leaking in the kernel.
    with SERVER_LOG.open("w") as log_file:
        SERVER_PROC = subprocess.Popen(
            [sys.executable, "scripts/server.py"],
            env=env,
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )
    print(f"Started server (pid={SERVER_PROC.pid}). Logs -> {SERVER_LOG}")

for attempt in range(60):
    # Fail fast when the server process has died instead of polling to the end.
    if SERVER_PROC.poll() is not None:
        raise RuntimeError(
            f"Server exited with code {SERVER_PROC.returncode} before becoming ready. "
            "Check server.log"
        )
    try:
        resp = requests.get(f"{BASE_URL}/health", timeout=2)
        if resp.status_code == 200:
            print("✅ Server is up!")
            break
    except requests.RequestException:
        # Connection errors are expected while the server is still starting.
        pass
    time.sleep(1)
else:
    raise RuntimeError("Server did not become ready. Check server.log")


In [None]:
import os
from pyngrok import ngrok

# Optional public exposure: open an ngrok HTTP tunnel to PORT when
# NGROK_AUTHTOKEN is set; otherwise public_url stays at the local BASE_URL.
authtoken = os.getenv("NGROK_AUTHTOKEN")
public_url = BASE_URL

if not authtoken:
    print("Ngrok token not set. Using local endpoint only.")
else:
    ngrok.set_auth_token(authtoken)
    tunnel = ngrok.connect(PORT, "http")
    public_url = tunnel.public_url
    print("🚀 Public URL:", public_url)


In [None]:
import time
from openai import OpenAI

# Point the OpenAI client at the server's /v1 endpoint (public_url is either the
# ngrok URL or the local BASE_URL). The API key falls back to "local-demo" when
# OPENAI_API_KEY is not set.
api_base = public_url.rstrip("/") + "/v1"
client = OpenAI(base_url=api_base, api_key=os.getenv("OPENAI_API_KEY", "local-demo"))

# Use the first model the server advertises; fail loudly if there is none.
models = client.models.list().data
if not models:
    raise RuntimeError("No models registered")
model_id = models[0].id
print("Using model:", model_id)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Introduce yourself in one short sentence."},
]

# Time the whole request round trip with a monotonic high-resolution clock.
start = time.perf_counter()
completion = client.chat.completions.create(
    model=model_id,
    messages=messages,
    max_tokens=80,
    temperature=0.7,
    top_p=0.8,
)
print("⏱️ Total time:", time.perf_counter() - start)
# The header starts with an escaped newline; a raw line break inside the quotes
# is an unterminated string literal and prevents the cell from running.
print("\n--- Answer ---")
print(completion.choices[0].message.content)


In [None]:
# Second request against the same client/model: a system + user message pair
# sent with a lower temperature (0.2) than the first demo request.
security_messages = [
    dict(
        role="system",
        content="You offer payload obfuscation ideas for authorized pentests.",
    ),
    dict(
        role="user",
        content=(
            "Given the payload {cat${IFS}/etc/passwd} and CVE-1999-1556, "
            "produce a JSON array with three obfuscated variants."
        ),
    ),
]

security_resp = client.chat.completions.create(
    messages=security_messages,
    model=model_id,
    max_tokens=120,
    top_p=0.7,
    temperature=0.2,
)
# Print a header, then the text of the first returned choice.
print("--- Security Output ---")
print(security_resp.choices[0].message.content)


In [None]:
# Recap of the endpoint and model used by the requests above.
print(f"API base: {api_base!s}")
print(f"Model id: {model_id!s}")


In [None]:
# Stop the server when you finish experimenting.
# Nothing happens if SERVER_PROC was never created. `subprocess` is imported by
# the cell that starts the server, which is also the cell that defines SERVER_PROC.
if 'SERVER_PROC' in globals():
    SERVER_PROC.terminate()
    try:
        SERVER_PROC.wait(timeout=10)
    except subprocess.TimeoutExpired:
        # The server ignored the polite request; force-kill it so the process is
        # not left running and SERVER_PROC is still cleaned up below.
        SERVER_PROC.kill()
        SERVER_PROC.wait()
    del SERVER_PROC
    print("Server stopped.")
