<a href="https://colab.research.google.com/github/Sanjli214/sih-aiml-mental-health-chatbot/blob/main/train_blenderbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [5]:
!pip install -U transformers datasets accelerate evaluate rouge_score sentencepiece torch fastapi uvicorn pyngrok nest-asyncio



In [6]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import numpy as np
import torch

In [7]:
from fastapi import FastAPI
from pydantic import BaseModel
from pyngrok import ngrok
import nest_asyncio
import threading

nest_asyncio.apply()

In [8]:
from google.colab import files
uploaded = files.upload()

dataset_path = list(uploaded.keys())[0]
ext = dataset_path.split(".")[-1]

if ext == "json":
    raw = load_dataset("json", data_files=dataset_path)
elif ext == "csv":
    raw = load_dataset("csv", data_files=dataset_path)


Saving student_mental_health_dataset.json to student_mental_health_dataset (3).json


Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
if "train" in raw and len(raw) == 1:
    split = raw["train"].train_test_split(test_size=0.1, seed=42)
    datasets = DatasetDict({"train": split["train"], "validation": split["test"]})
else:
    datasets = raw
    if "validation" not in datasets:
        split = datasets["train"].train_test_split(test_size=0.1, seed=42)
        datasets = DatasetDict({"train": split["train"], "validation": split["test"]})

In [10]:
possible_user_cols = ["user", "input", "question", "text"]
possible_bot_cols = ["bot", "response", "answer", "reply"]

user_col = next((c for c in datasets["train"].column_names if c in possible_user_cols), None)
bot_col = next((c for c in datasets["train"].column_names if c in possible_bot_cols), None)
if user_col is None or bot_col is None:
    raise ValueError("Could not detect user or bot columns automatically. Check dataset column names.")

print(f"Using '{user_col}' as user column and '{bot_col}' as bot column.")


Using 'user' as user column and 'bot' as bot column.


In [11]:
model_name = "facebook/blenderbot-400M-distill"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, use_cache=False)

def preprocess(examples):
    inputs = ["You are a supportive college mental health assistant. " + u for u in examples[user_col]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples[bot_col], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = datasets.map(preprocess, batched=True, remove_columns=datasets["train"].column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/189 [00:00<?, ? examples/s]



Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

training_args = Seq2SeqTrainingArguments(
    output_dir="./blenderbot-student-finetuned",
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    predict_with_generate=True,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()
trainer.save_model("./blenderbot-student-finetuned")
tokenizer.save_pretrained("./blenderbot-student-finetuned")
print("Model fine-tuning complete.")



Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [1]:
!kill -9 $(lsof -t -i:8000)
from pyngrok import ngrok
ngrok.kill()


kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]


In [2]:
# =============================
# 🔹 FastAPI + ngrok Setup (Colab-friendly)
# =============================
import threading
import time
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from pyngrok import ngrok

# -----------------------------
# 1️⃣ FastAPI app
# -----------------------------
app = FastAPI()

class UserMessage(BaseModel):
    message: str

@app.post("/chat")
def chat(user_msg: UserMessage):
    inputs = tokenizer([user_msg.message], return_tensors="pt")
    with torch.no_grad():
        reply_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=200
        )
    reply = tokenizer.decode(reply_ids[0], skip_special_tokens=True)
    return {"reply": reply}

# -----------------------------
# 2️⃣ Ngrok authentication
# -----------------------------
NGROK_AUTH_TOKEN = "32GMwiMlVNGXoQv0NVFbVNCa8FO_3XVAwFJ2S1k3t267f63RE"  # replace with your token
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# -----------------------------
# 3️⃣ Function to run FastAPI in a thread
# -----------------------------
def run_api():
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Start FastAPI in background
threading.Thread(target=run_api, daemon=True).start()

# Wait a few seconds to ensure the server starts
time.sleep(2)

# -----------------------------
# 4️⃣ Start ngrok tunnel
# -----------------------------
public_url = ngrok.connect(8000)
print(f"Your teammates can now access the chatbot at: {public_url}/chat")


INFO:     Started server process [28485]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Your teammates can now access the chatbot at: NgrokTunnel: "https://58a8a12d23cf.ngrok-free.app" -> "http://localhost:8000"/chat
