In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.environ["USE_TF"] = "0"
os.environ["USE_TORCH"] = "1"

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes datasets

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-2cs36s05/unsloth_a5cc7e4bf8174d27b3b447cb71954800
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-2cs36s05/unsloth_a5cc7e4bf8174d27b3b447cb71954800
  Resolved https://github.com/unslothai/unsloth.git to commit 56c8f9662b1bc1fb50bcbe6bcc45ddffb0cdeb60


INFO:werkzeug:127.0.0.1 - - [15/Feb/2026 19:20:32] "POST /generate HTTP/1.1" 200 -


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
!pip install flask flask-cors -q
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared

from flask import Flask, request, jsonify
from flask_cors import CORS
import torch, subprocess, threading, time, re

# --- Load the checkpoint from the mounted Drive and switch it to inference ---
from unsloth import FastVisionModel

# Loader options gathered in one place: checkpoint path, 4-bit loading flag and
# the maximum sequence length handed to the loader.
_load_options = dict(
    model_name="/content/drive/MyDrive/qwen_verilog_stage4_final",
    load_in_4bit=True,
    max_seq_length=2048,
)
model, tokenizer = FastVisionModel.from_pretrained(**_load_options)
FastVisionModel.for_inference(model)
print("‚úÖ Model loaded")

# --- Flask application, wrapped by flask-cors with its default settings ---
app = Flask(__name__)
CORS(app)

# Largest max_new_tokens a single request may ask for. The value is supplied by
# the client and this endpoint is published through a public tunnel, so it is
# clamped; 2048 mirrors the max_seq_length the model is loaded with.
MAX_NEW_TOKENS_LIMIT = 2048


def _parse_generate_payload(payload):
    """Validate a decoded /generate body and return its generation settings.

    Args:
        payload: The decoded JSON body, or None when the body was missing or
            could not be parsed as JSON.

    Returns:
        Tuple ``(instruction, temperature, max_tokens)``; ``max_tokens`` is
        clamped to the range ``1..MAX_NEW_TOKENS_LIMIT``.

    Raises:
        ValueError: With a client-facing message when the payload is malformed.
    """
    if not isinstance(payload, dict):
        raise ValueError("request body must be a JSON object")

    instruction = payload.get("instruction", "")
    if not isinstance(instruction, str):
        raise ValueError("'instruction' must be a string")

    try:
        temperature = float(payload.get("temperature", 0.1))
        max_tokens = int(payload.get("max_tokens", 512))
    except (TypeError, ValueError, OverflowError):
        raise ValueError("'temperature' and 'max_tokens' must be numbers") from None

    # NaN is the only float that compares unequal to itself.
    if temperature != temperature or temperature in (float("inf"), float("-inf")):
        raise ValueError("'temperature' must be a finite number")

    return instruction, temperature, max(1, min(max_tokens, MAX_NEW_TOKENS_LIMIT))


@app.route("/generate", methods=["POST"])
def generate():
    """Generate a completion for the posted instruction.

    Expects a JSON object with optional keys ``instruction`` (str, default
    ""), ``temperature`` (number, default 0.1) and ``max_tokens`` (number,
    default 512, clamped to ``MAX_NEW_TOKENS_LIMIT``).

    Returns:
        ``{"output": <text>, "status": "ok"}`` on success;
        ``{"error": <message>, "status": "error"}`` with HTTP 400 for a
        malformed request, or HTTP 500 when generation itself fails.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON
    # body, so client mistakes are reported as 400 rather than 500.
    try:
        instruction, temperature, max_tokens = _parse_generate_payload(
            request.get_json(silent=True)
        )
    except ValueError as exc:
        return jsonify({"error": str(exc), "status": "error"}), 400

    try:
        messages = [
            {"role": "system", "content": "You are an expert Verilog and RTL design engineer."},
            {"role": "user",   "content": instruction}
        ]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(text=[text], return_tensors="pt").to("cuda")

        # A non-positive temperature selects greedy decoding. The temperature
        # is forwarded only when sampling, since it has no meaning otherwise
        # and a value <= 0 is not a valid sampling temperature.
        sampling = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_tokens,
            "do_sample": sampling,
            "use_cache": True,
        }
        if sampling:
            generation_kwargs["temperature"] = temperature

        with torch.no_grad():
            output_ids = model.generate(**inputs, **generation_kwargs)

        # Drop the prompt tokens so that only newly generated text is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        generated = output_ids[0][prompt_length:]
        result = tokenizer.decode(generated, skip_special_tokens=True)
        return jsonify({"output": result, "status": "ok"})

    except Exception as e:
        # Keep the JSON error contract for clients, but leave a traceback in
        # the server log so that failures can be diagnosed.
        app.logger.exception("generation failed")
        return jsonify({"error": str(e), "status": "error"}), 500

@app.route("/health", methods=["GET"])
def health():
    """Liveness probe: reply with a JSON body whose ``status`` is ``"ok"``."""
    payload = {"status": "ok"}
    return jsonify(payload)

# --- Start Flask in background ---
import http.client
import urllib.request

FLASK_PORT = 5000


def _api_is_up(port, timeout=1.0):
    """Return True when GET /health on 127.0.0.1:port answers with HTTP 200."""
    try:
        with urllib.request.urlopen(
            f"http://127.0.0.1:{port}/health", timeout=timeout
        ) as response:
            return response.status == 200
    except (OSError, http.client.HTTPException):
        return False


def _discard_stream(stream):
    """Read a binary stream to EOF, throwing the data away."""
    for _ in stream:
        pass


if _api_is_up(FLASK_PORT):
    # Typical when this cell is re-run: the server thread from the earlier run
    # still owns the port, so a second app.run() would fail with
    # "Address already in use". Say so instead of pretending a new server started.
    print(
        f"Port {FLASK_PORT} already answers /health; reusing that server. "
        "Restart the runtime to serve updated route code."
    )
else:
    flask_thread = threading.Thread(
        target=lambda: app.run(host="0.0.0.0", port=FLASK_PORT, use_reloader=False),
        daemon=True,
    )
    flask_thread.start()

    # Wait until the server really answers instead of sleeping a fixed time,
    # and fail loudly if it never comes up (e.g. the port belongs to another program).
    _startup_deadline = time.monotonic() + 15
    while not _api_is_up(FLASK_PORT):
        if not flask_thread.is_alive():
            raise RuntimeError(
                f"Flask failed to start on port {FLASK_PORT} (is the port in use?)"
            )
        if time.monotonic() > _startup_deadline:
            raise RuntimeError(
                f"Flask did not answer /health on port {FLASK_PORT} within 15 seconds"
            )
        time.sleep(0.2)
print("‚úÖ Flask running on port 5000")

# --- Start Cloudflare tunnel ---
# Stop a tunnel left over from an earlier run of this cell so that re-runs do
# not accumulate cloudflared processes.
_previous_tunnel = globals().get("cf")
if isinstance(_previous_tunnel, subprocess.Popen) and _previous_tunnel.poll() is None:
    _previous_tunnel.terminate()
    try:
        _previous_tunnel.wait(timeout=10)
    except subprocess.TimeoutExpired:
        _previous_tunnel.kill()

cf = subprocess.Popen(
    ["./cloudflared", "tunnel", "--url", f"http://localhost:{FLASK_PORT}"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT
)

# --- Print the public URL ---
print("‚è≥ Waiting for tunnel URL...")
for line in cf.stdout:
    # errors="replace": a stray non-UTF-8 byte in the log must not abort the scan.
    match = re.search(
        r'https://[a-z0-9\-]+\.trycloudflare\.com', line.decode(errors="replace")
    )
    if match:
        url = match.group(0)
        print(f"\nüöÄ YOUR API URL: {url}")
        print(f"   Health check:  {url}/health")
        print(f"   Generate:      {url}/generate  (POST)")
        print(f"\nüìã Paste this in your frontend: {url}")
        break
else:
    # stdout reached EOF: cloudflared exited without ever announcing a URL.
    raise RuntimeError(
        f"cloudflared exited (code {cf.wait()}) before reporting a tunnel URL"
    )

# cloudflared keeps logging to the pipe for as long as the tunnel lives. Keep
# reading it in the background; otherwise the pipe buffer eventually fills and
# the tunnel process blocks on write.
threading.Thread(target=_discard_stream, args=(cf.stdout,), daemon=True).start()

cloudflared: Text file busy
==((====))==  Unsloth 2026.2.1: Fast Qwen2_5_Vl patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
‚úÖ Model loaded
 * Serving Flask app '__main__'
 * Debug mode: off


Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


‚úÖ Flask running on port 5000
‚è≥ Waiting for tunnel URL...

üöÄ YOUR API URL: https://parent-admissions-katie-gabriel.trycloudflare.com
   Health check:  https://parent-admissions-katie-gabriel.trycloudflare.com/health
   Generate:      https://parent-admissions-katie-gabriel.trycloudflare.com/generate  (POST)

üìã Paste this in your frontend: https://parent-admissions-katie-gabriel.trycloudflare.com
