In [1]:
# Install Unsloth (still required for loading FastLanguageModel)
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

# Install bitsandbytes for 4-bit quantization
!pip install bitsandbytes

# Install peft (needed for LoRA)
!pip install peft==0.14.0

# Install transformers & accelerate if not installed already
!pip install transformers accelerate

# Optional: If your model needs xformers (for memory-efficient attention)
!pip install xformers==0.0.28.post3

# (Re)install compatible torch and torchvision versions for CUDA 12.4 (already known from your setup)
!pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124



Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-i4pa9ali
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-i4pa9ali
  Resolved https://github.com/unslothai/unsloth.git to commit dc26a7a0eb20c31549318396f53639ba8c01025e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.8.6-py3-none-any.whl size=308345 sha256=35ac5cd9007cca21a6ac009c9a8b653f5858f7bcb32219691ff19b03f74d9871
  Stored in directory: /tmp/pip-ephem-wheel-cache-7klzjdwa/wheels/d1/17/05/850ab10c33284a4763b0595cd8ea9d01fce6e221cac24b3c01
Successfully built unsloth
Installing collected packages: unsloth
S

In [2]:
!pip install unsloth_zoo


Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.8.5-py3-none-any.whl.metadata (9.4 kB)
Collecting tyro (from unsloth_zoo)
  Downloading tyro-0.9.28-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth_zoo)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth_zoo)
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy (from unsloth_zoo)
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting msgspec (from unsloth_zoo)
  Downloading msgspec-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting shtab>=1.5.6 (from tyro->unsloth_zoo)
  Downloading shtab-1.7.2-py3-none-any.whl.metadata (7.4 kB)
Downloading unsloth_zoo-2025.8.5-py3-none-any.whl (182 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.7/182.7 kB[0m [31m6.5 MB/s[0m eta [36m0:0

In [3]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Import necessary libraries
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from huggingface_hub import login
from transformers import TrainingArguments
from flask import Flask, request, jsonify
from transformers import AutoTokenizer
from peft import PeftModel

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# Log in to the Hugging Face Hub with the token kept in Colab Secrets
# under the name 'HF_API' (the token never appears in the notebook).
from google.colab import userdata

hf_token = userdata.get('HF_API')
login(token=hf_token)

In [7]:
import os
import subprocess
from google.colab import userdata

# Read the ngrok authtoken from Colab Secrets.
ngrok_token = userdata.get('NGROK_TOKEN')

if not ngrok_token:
    raise ValueError("No NGROK_TOKEN found in Colab secrets.")

# Register the token with the ngrok CLI.
# - The argv-list form passes the secret as a single argument, so it is never
#   spliced into a shell command string.
# - The exit status is checked: a missing binary or a rejected token is
#   reported here instead of surfacing later as an obscure tunnel error.
# Failures are reported as warnings (not exceptions) so the rest of the
# notebook can still run, matching the previous non-fatal behaviour.
try:
    subprocess.run(
        ["ngrok", "config", "add-authtoken", ngrok_token],
        check=True,
        capture_output=True,
    )
except FileNotFoundError:
    print(
        "WARNING: `ngrok` command not found; the authtoken was NOT configured. "
        "Install pyngrok/ngrok first, then re-run this cell."
    )
except subprocess.CalledProcessError as err:
    # stderr is intentionally not echoed to avoid any chance of printing the secret.
    print(
        f"WARNING: `ngrok config add-authtoken` exited with code {err.returncode}; "
        "the authtoken was NOT configured."
    )


In [8]:
!mkdir -p /content/templates
!mkdir -p /content/static

In [9]:
!pip install pyngrok
!pip install flask

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [None]:

from google.colab import userdata
from pyngrok import ngrok

# Register the ngrok authtoken through pyngrok, reading it from Colab Secrets.
# A literal token typed on a `!ngrok config add-authtoken ...` line would be
# saved with the notebook and shared with anyone who receives it.
ngrok_token = userdata.get('NGROK_TOKEN')
if not ngrok_token:
    raise ValueError("No NGROK_TOKEN found in Colab secrets.")

ngrok.set_auth_token(ngrok_token)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [14]:
from flask import Flask, request, jsonify, render_template
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
from pyngrok import ngrok

# ==== Model sources ====
# BASE_MODEL is passed to from_pretrained as a model identifier; the tokenizer
# and LoRA adapter are loaded from folders on the mounted Google Drive.
BASE_MODEL = "unsloth/DeepSeek-R1-Distill-Llama-8B-bnb-4bit"
TOKENIZER_PATH = "/content/drive/MyDrive/ML/AI_MED_ASSIST/tokenizer"
LORA_PATH = "/content/drive/MyDrive/ML/AI_MED_ASSIST/lora_adapter"
# =======================

# Device that tokenized inputs are moved to before generation.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

print("Loading base model in 4-bit...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to execute locally. Confirm it is actually needed for this model.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

print("Applying LoRA adapter...")
try:
    model = PeftModel.from_pretrained(base_model, LORA_PATH)
except Exception as err:
    # Best-effort fallback to the plain base model is kept, but it is no
    # longer silent: answers produced without the adapter come from a model
    # that was not fine-tuned, and the operator must be able to see that.
    # (`except Exception` also stops swallowing KeyboardInterrupt/SystemExit.)
    print(
        f"WARNING: could not load LoRA adapter from {LORA_PATH!r} "
        f"({type(err).__name__}: {err}). Serving the BASE model without fine-tuning."
    )
    model = base_model  # fallback

# Inference only: switch off training-time behaviour such as dropout.
model.eval()

# The Flask application serves templates and static assets from fixed
# directories under /content.
app = Flask(
    __name__,
    static_folder="/content/static",
    template_folder="/content/templates",
)


@app.route("/")
def home():
    """Serve the front-end page by rendering the ``index.html`` template."""
    page = render_template("index.html")
    return page

@app.route("/generate", methods=["POST"])
def generate():
    """Answer the question posted as JSON ``{"prompt": "<question>"}``.

    Returns:
        * ``200`` and ``{"response": <text>}`` on success.
        * ``400`` and ``{"error": ...}`` when the body is not a JSON object,
          or when ``prompt`` is missing, blank, or not a string.
        * ``500`` and ``{"error": <exception text>}`` when tokenization,
          generation, or post-processing raises.
    """
    import re  # function-scope import: this cell does not import `re` at the top

    # ---- Input validation (client errors -> 400, never 500) ----
    # silent=True makes get_json return None for a missing/invalid JSON body
    # instead of raising, so malformed requests are reported as client errors.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object with a 'prompt' field"}), 400

    user_question = data.get("prompt", "")
    if not isinstance(user_question, str) or not user_question.strip():
        return jsonify({"error": "Prompt is required"}), 400

    try:
        # prompt template
        prompt_template = """### Question:
{}

### Answer:"""
        prompt = prompt_template.format(user_question)

        inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)

        # Gradients are not needed for inference.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )

        response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep only the text after the last "### Answer:" marker, which drops
        # the echoed prompt from the decoded sequence.
        if "### Answer:" in response_text:
            response_text = response_text.split("### Answer:")[-1].strip()

        # Remove complete <think>...</think> blocks (DOTALL: they span lines).
        response_text = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()

        # Remove a single stray leading letter such as "A ".
        # NOTE(review): this also strips the first word of legitimate answers
        # that begin with a one-letter word ("A fever ..." -> "fever ...",
        # "I recommend ..." -> "recommend ..."). Confirm this is still wanted.
        response_text = re.sub(r"^[A-Za-z]\s+", "", response_text).strip()

        return jsonify({"response": response_text})

    except Exception as e:
        # Anything that fails past validation is a server-side error.
        return jsonify({"error": str(e)}), 500



# 🔗 Expose public URL
# One constant keeps the tunnel target and the Flask port in sync.
PORT = 5000

public_url = ngrok.connect(PORT)  # NgrokTunnel object; its .public_url holds the URL string
print("🚀 Public URL:", public_url)
try:
    # Blocks until the server is stopped (e.g. the cell is interrupted).
    app.run(port=PORT)
finally:
    # Close the tunnel when the server stops so that re-running this cell
    # does not leave an orphaned tunnel open on the ngrok account.
    ngrok.disconnect(public_url.public_url)

Loading tokenizer...
Loading base model in 4-bit...
Applying LoRA adapter...
🚀 Public URL: NgrokTunnel: "https://63120b85d5b8.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [16/Aug/2025 15:14:51] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Aug/2025 15:14:52] "GET /static/script.js HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Aug/2025 15:14:52] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [16/Aug/2025 15:16:38] "POST /generate HTTP/1.1" 200 -
