# üé≠ AI Companion: Universal Roleplay Bridge (Threaded)

**FIXED:** This version runs the server in a background thread to avoid Colab event loop conflicts.

### Instructions:
1. **Runtime:** `Runtime` > `Change runtime type` > **T4 GPU**.
2. **Ngrok:** Paste your token in Cell 3.
3. **Run All:** Press `Ctrl + F9`.

In [1]:
from google.colab import userdata

In [2]:
# @title 1. Install Dependencies
!pip install -q -U fastapi uvicorn pyngrok nest_asyncio requests==2.32.4
!pip install -q -U transformers accelerate bitsandbytes torch==2.9.0

In [3]:
# @title 2. Load Roleplay Specialist (Stheno-v3.2)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig
from threading import Thread

# --- AUTH ---
HF_TOKEN = userdata.get('HF_TOKEN')
# ------------

model_id = "Sao10K/L3-8B-Stheno-v3.2"

print(f"Loading {model_id}...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print(f"\n‚úÖ Roleplay Specialist LOADED on {torch.cuda.get_device_name(0)}!")

Loading Sao10K/L3-8B-Stheno-v3.2...


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]


‚úÖ Roleplay Specialist LOADED on Tesla T4!


In [None]:
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import uvicorn, nest_asyncio, re, os, time, random
from pyngrok import ngrok
from pydantic import BaseModel
from typing import List
from threading import Thread
from transformers import TextIteratorStreamer

NGROK_TOKEN = userdata.get('NGROK_TOKEN')

app = FastAPI()

class Message(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: List[Message]
    max_tokens: int = 1024
    temperature: float = 0.8

@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    chat = [{"role": m.role, "content": m.content} for m in request.messages]

    # We use return_dict=True to get a dictionary of tensors (input_ids, attention_mask)
    # Then we unpack it into generation_kwargs to avoid BatchEncoding attribute errors.
    model_inputs = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True
    ).to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = {
        **model_inputs,
        "streamer": streamer,
        "max_new_tokens": request.max_tokens,
        "temperature": request.temperature,
        "do_sample": True,
        "top_p": 0.9,
    }

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    def stream_generator():
        for new_text in streamer:
            yield new_text

    return StreamingResponse(stream_generator(), media_type="text/plain")

ngrok.set_auth_token(NGROK_TOKEN)

# Ensure all existing ngrok tunnels are killed before starting a new one
ngrok.kill()

try:
    for tunnel in ngrok.get_tunnels():
        ngrok.disconnect(tunnel.public_url)
except:
    pass

def run_server():
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="error")

server_thread = Thread(target=run_server)
server_thread.daemon = True # Allows the main program to exit even if this thread is still running
server_thread.start()

# Give the server a moment to start and bind or fail.
time.sleep(1)

if not server_thread.is_alive():
    # If the thread is not alive after a short delay, it likely failed to start
    print("\n\n--------------------------------------------------")
    print("‚ùå ERROR: FastAPI server failed to start!")
    print("This is likely due to port 8000 already in use.")
    print("Please try one of the following:")
    print("1. Restart the Colab runtime (Runtime > Restart runtime).")
    print("2. If you ran this cell multiple times, wait a few seconds and try again.")
    print("--------------------------------------------------\n\n")
else:
    # If the server thread is alive, assume it started successfully, then connect ngrok
    try:
        public_url = ngrok.connect(8000).public_url
        print("="*50)
        print(f"\nüöÄ BRIDGE ONLINE!\nURL: {public_url}\n")
        print("="*50)
    except Exception as e:
        print(f"‚ùå ERROR: Failed to establish ngrok tunnel: {e}")
        print("Please check your NGROK_TOKEN and network connection.")

try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Stopping server...")



üöÄ BRIDGE ONLINE!
URL: https://aerobically-meddlesome-ria.ngrok-free.dev



Exception in thread Thread-6 (generate):
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py", line 275, in __getattr__
    return self.data[item]
           ~~~~~~~~~^^^^^^
KeyError: 'shape'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/generation/utils.py", line 2504, in generate
    batch_size = inputs_tensor.shape[0]
                 ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_

Stopping server...
