# T-VisStar-7B LLM Server

This notebook runs the T-VisStar-7B model as an HTTP API server on Google Colab and exposes it publicly through a Cloudflare Tunnel.

**Requirements:**
- Google Colab with GPU runtime (T4 recommended)
- Runtime > Change runtime type > GPU

**Usage:**
1. Run all cells
2. Copy the Cloudflare URL from the last cell output
3. Set `LLM_API_URL` to that URL in your backend `.env` file

In [None]:
# Cell 1: Install dependencies
# Lines starting with "!" are run as shell commands by the notebook.
# transformers provides the tokenizer/model classes used in Cell 2 and flask
# the API server in Cell 3; accelerate and bitsandbytes are presumably what
# device_map="auto" and 4-bit loading rely on (confirm against library docs).
# -q keeps pip's output quiet.
!pip install -q transformers accelerate bitsandbytes flask

# Download Cloudflare Tunnel binary
# Saves the latest linux-amd64 release as ./cloudflared. NOTE: -q silences
# wget completely, so a failed download produces no message here.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
# Make the binary executable so it can be launched as ./cloudflared in Cell 4.
!chmod +x cloudflared

# NOTE: printed unconditionally; it does not check that the commands above succeeded.
print("Dependencies installed!")

In [None]:
# Cell 2: Load model with 4-bit quantization
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Model configuration (set the LLM_MODEL_ID environment variable to override
# the default checkpoint)
MODEL_ID = os.getenv("LLM_MODEL_ID", "1TuanPham/T-VisStar-7B-v0.1")

print(f"Loading model: {MODEL_ID}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Choose the compute dtype from the GPU generation. GPUs with compute
# capability below 8.0 (pre-Ampere, e.g. the Colab T4) have no native bfloat16
# support, so float16 is used there; newer GPUs keep bfloat16. Without a GPU
# the original bfloat16 setting is left unchanged.
if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8:
    COMPUTE_DTYPE = torch.float16
else:
    COMPUTE_DTYPE = torch.bfloat16
print(f"Compute dtype: {COMPUTE_DTYPE}")

# 4-bit quantization config (NF4 weights with double quantization)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=COMPUTE_DTYPE,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.padding_side = 'left'
# Fall back to the EOS token when the tokenizer defines no pad token, so that
# pad_token_id passed to generate() later is not None.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=COMPUTE_DTYPE,
)
# Inference only: switch off training-mode behaviour.
model.eval()

print("Model loaded successfully!")

In [None]:
# Cell 3: Create Flask API server
from flask import Flask, request, jsonify
import threading
import logging

# Configure logging
# Request INFO-level output on the root logger.
# NOTE(review): basicConfig() does nothing when the root logger already has
# handlers, which may be the case in a hosted notebook -- confirm that the
# INFO messages from the endpoints actually show up.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module (__name__).
logger = logging.getLogger(__name__)

# Flask application object; the @app.route handlers below register on it.
app = Flask(__name__)

@app.route("/health", methods=["GET"])
def health():
    """Liveness probe: report status, the served model id and the device name."""
    # Name of the first CUDA device when one is present, otherwise "CPU".
    if torch.cuda.is_available():
        device_name = torch.cuda.get_device_name(0)
    else:
        device_name = "CPU"

    payload = {
        "status": "ok",
        "model": MODEL_ID,
        "gpu": device_name,
    }
    return jsonify(payload)

@app.route("/chat/complete", methods=["POST"])
def chat_complete():
    """Chat completion endpoint - compatible with LocalLLM interface.

    Expects a JSON object in the request body with:
        messages: non-empty list of chat messages, passed to the tokenizer's
            chat template.
        max_new_tokens: positive integer, default 512.
        temperature: non-negative number, default 0.7. A value of 0 selects
            greedy decoding instead of sampling.

    Returns:
        200 with {"response": <generated text>} on success.
        400 with {"error": <message>} when the request body is malformed.
        500 with {"error": <message>} when templating or generation fails.
    """
    # silent=True yields None for a missing, non-JSON or unparsable body
    # instead of raising, so client mistakes are answered with 400 rather
    # than falling through to the generic 500 handler.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    messages = data.get("messages", [])
    max_new_tokens = data.get("max_new_tokens", 512)
    temperature = data.get("temperature", 0.7)

    if not isinstance(messages, list) or not messages:
        return jsonify({"error": "'messages' must be a non-empty list"}), 400
    # bool is a subclass of int, so it has to be excluded explicitly.
    if (
        isinstance(max_new_tokens, bool)
        or not isinstance(max_new_tokens, int)
        or max_new_tokens <= 0
    ):
        return jsonify({"error": "'max_new_tokens' must be a positive integer"}), 400
    if (
        isinstance(temperature, bool)
        or not isinstance(temperature, (int, float))
        or temperature < 0
    ):
        return jsonify({"error": "'temperature' must be a non-negative number"}), 400

    try:
        logger.info(
            "Generating response: max_tokens=%s, temp=%s", max_new_tokens, temperature
        )

        # Apply chat template
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # temperature is a sampling-only setting and must be strictly
        # positive, so it is forwarded only when sampling is enabled.
        do_sample = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        if do_sample:
            generation_kwargs["temperature"] = float(temperature)

        # Generate
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only new tokens: the output sequence starts with the prompt
        # tokens, so skip the first input_ids.shape[1] positions.
        prompt_length = inputs['input_ids'].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )

        logger.info("Generated %d characters", len(response))
        return jsonify({"response": response.strip()})

    except Exception as e:
        # Top-level boundary of the endpoint: log with traceback and report
        # the failure to the client instead of crashing the request.
        logger.exception("Error: %s", e)
        return jsonify({"error": str(e)}), 500

# Run Flask in background thread
def run_server():
    """Serve `app` with Flask's built-in server on 0.0.0.0:5000 (blocking call)."""
    server_options = {
        "host": '0.0.0.0',
        "port": 5000,
        "debug": False,
        "use_reloader": False,
    }
    app.run(**server_options)


# The server runs in a daemon thread, so this cell returns immediately and the
# thread does not keep the interpreter alive on shutdown.
server_thread = threading.Thread(target=run_server)
server_thread.daemon = True
server_thread.start()

print("Flask server started on port 5000")

In [None]:
# Cell 4: Start Cloudflare Tunnel
import subprocess
import re
import time

import threading  # imported here so this cell does not depend on Cell 3's imports

print("Starting Cloudflare Tunnel...")

# Public quick-tunnel URL as it appears in cloudflared's log output.
TUNNEL_URL_PATTERN = re.compile(r'https://[a-z0-9-]+\.trycloudflare\.com')
# Maximum time (seconds) to wait for the URL before giving up.
TUNNEL_STARTUP_TIMEOUT = 60

# Start cloudflared tunnel
# stderr is merged into stdout so that one reader consumes everything the
# process writes. The pipe must keep being read for as long as cloudflared
# runs: once an unread pipe fills up, the child blocks on its next write.
process = subprocess.Popen(
    ['./cloudflared', 'tunnel', '--url', 'http://localhost:5000'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True
)

tunnel_state = {"url": None}
tunnel_ready = threading.Event()


def _drain_tunnel_output(stream):
    """Read cloudflared's output until EOF, capturing the first tunnel URL.

    Lines are echoed only until the URL has been found; after that they are
    read and discarded so the pipe never fills up.
    """
    for line in stream:
        if tunnel_state["url"] is None:
            print(line.strip())  # Debug output
            match = TUNNEL_URL_PATTERN.search(line)
            if match:
                tunnel_state["url"] = match.group()
                tunnel_ready.set()
    # EOF: the process closed its output (normally because it exited).
    # Wake the waiter so it does not sit out the full timeout.
    tunnel_ready.set()


threading.Thread(
    target=_drain_tunnel_output, args=(process.stdout,), daemon=True
).start()

# Wait for tunnel to establish: returns as soon as the URL is seen, the
# process output ends, or the timeout expires.
tunnel_ready.wait(timeout=TUNNEL_STARTUP_TIMEOUT)
public_url = tunnel_state["url"]

if public_url:
    print("\n" + "="*60)
    print("LLM SERVER READY!")
    print("="*60)
    print(f"\nPublic URL: {public_url}")
    print("\nAdd to your backend .env file:")
    print(f"LLM_API_URL={public_url}")
    print("\n" + "="*60)
else:
    print("\nFailed to get Cloudflare URL. Check output above for errors.")
    if process.poll() is not None:
        print(f"cloudflared exited with code {process.returncode}.")
    print("You may need to restart this cell.")

In [None]:
# Cell 5: Test the API (optional)
import requests

# Explicit timeouts (seconds): requests waits indefinitely by default, which
# would hang this cell if the server thread is not responding. Generation can
# be slow, so the chat call gets a much larger budget than the health check.
HEALTH_TIMEOUT = 10
CHAT_TIMEOUT = 300

# Test health endpoint
response = requests.get("http://localhost:5000/health", timeout=HEALTH_TIMEOUT)
print("Health check:", response.json())

# Test chat completion
test_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello, how are you?"}
]

response = requests.post(
    "http://localhost:5000/chat/complete",
    json={"messages": test_messages, "max_new_tokens": 50, "temperature": 0.7},
    timeout=CHAT_TIMEOUT,
)
# Error responses from the server are JSON too ({"error": ...}), so the body
# is printed regardless of status code.
print("\nChat response:", response.json())

In [None]:
# Cell 6: Keep notebook alive
# Run this cell to prevent Colab from timing out
import time
from IPython.display import clear_output

# Look the URL up defensively: public_url only exists if the tunnel cell ran
# far enough to define it, and a NameError here would stop the keep-alive loop.
tunnel_url = globals().get("public_url")
url_line = f"LLM API URL: {tunnel_url}" if tunnel_url else "URL not available"

# Whole minutes elapsed; incremented after each sleep so the first display
# reads 0 minutes rather than 1.
counter = 0
try:
    while True:
        clear_output(wait=True)
        # Printed on every pass because clear_output() wipes earlier output,
        # which would otherwise hide the exit hint immediately.
        print("Keeping notebook alive... (Ctrl+C or stop cell to exit)")
        print(f"Server running... Uptime: {counter} minutes")
        print(url_line)
        time.sleep(60)
        counter += 1
except KeyboardInterrupt:
    # Stopping the cell interrupts the sleep; exit quietly without a traceback.
    print("Keep-alive loop stopped.")