In [1]:
import wandb
wandb.login(key='8a238f5384e7ca02e19fefbb2e272fa99118d5eb')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjainvaibhav0501[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# ==========================
# 1️⃣ Define Labels
# ==========================
sentiment_labels = ["Anxious", "Neutral", "Reassured"]
intent_labels = ["Seeking Reassurance", "Reporting Symptoms", "Expressing Concern"]
num_sentiments = len(sentiment_labels)
num_intents = len(intent_labels)
num_labels = num_sentiments + num_intents  # Multi-label classification

# ==========================
# 2️⃣ Generate Synthetic Data
# ==========================
def generate_synthetic_data(n=150):
    data = []
    for _ in range(n):
        sentiment = np.random.choice(sentiment_labels)
        intent = np.random.choice(intent_labels)
        text = f"Patient: I am feeling {sentiment.lower()} about my condition. I want to {intent.lower()}."
        data.append({"text": text, "sentiment": sentiment, "intent": intent})
    return data

synthetic_data = generate_synthetic_data()

# ==========================
# 3️⃣ Convert Data to HuggingFace Dataset
# ==========================
dataset = Dataset.from_list(synthetic_data)

# ==========================
# 4️⃣ Tokenization & Encoding Labels
# ==========================
model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_encode(example):
    """Tokenizes text and encodes labels in multi-label format"""
    encoding = tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)
    
    # One-hot encode labels
    sentiment_label = [0] * num_sentiments
    intent_label = [0] * num_intents
    sentiment_label[sentiment_labels.index(example["sentiment"])] = 1
    intent_label[intent_labels.index(example["intent"])] = 1

    encoding["labels"] = sentiment_label + intent_label
    return encoding

dataset = dataset.map(tokenize_and_encode, batched=False)
dataset = dataset.train_test_split(test_size=0.2)  # 80% train, 20% test

# ==========================
# 5️⃣ Define Multi-Label Classification Model
# ==========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MultiLabelModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super(MultiLabelModel, self).__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.to(device)

    def forward(self, input_ids, attention_mask, labels=None):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        if labels is not None:
            labels = labels.to(device)

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        if labels is not None:
            loss = self.loss_fn(logits, labels.float())  # Ensure labels are float tensors
            return {"loss": loss, "logits": logits}
        return {"logits": logits}

# Initialize Model
model = MultiLabelModel(model_name, num_labels=num_labels)

# ==========================
# 6️⃣ Define Training Arguments
# ==========================
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True  # Enable mixed-precision training for faster performance on GPU
)

# ==========================
# 7️⃣ Define Evaluation Metrics
# ==========================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = (torch.sigmoid(torch.tensor(logits)).cpu() > 0.5).int().numpy()  # Convert logits to binary predictions
    labels = np.array(labels)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")

    return {"accuracy": acc, "f1_score": f1}





tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



In [3]:
# ==========================
# 8️⃣ Train the Model
# ==========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250317_114814-7m86lbo3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./results[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jainvaibhav0501/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jainvaibhav0501/huggingface/runs/7m86lbo3[0m


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,0.383006,0.533333,0.690476
2,No log,0.174965,0.766667,0.814815
3,No log,0.102319,1.0,1.0
4,No log,0.07443,1.0,1.0
5,No log,0.069048,1.0,1.0


TrainOutput(global_step=75, training_loss=0.24603805541992188, metrics={'train_runtime': 18.1834, 'train_samples_per_second': 32.997, 'train_steps_per_second': 4.125, 'total_flos': 0.0, 'train_loss': 0.24603805541992188, 'epoch': 5.0})

In [4]:
# ==========================
# 9️⃣ Inference on a New Patient Transcript
# ==========================
def analyze_patient_sentiment_and_intent(text):
    """Predict sentiment & intent for a given text"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**inputs)["logits"]
    probs = torch.sigmoid(logits).cpu().numpy().flatten()  # Move back to CPU for processing

    sentiment_pred = sentiment_labels[np.argmax(probs[:num_sentiments])]
    intent_pred = intent_labels[np.argmax(probs[num_sentiments:])]

    return {"text": text, "sentiment": sentiment_pred, "intent": intent_pred}

# Example Test Sentence
test_sentence = "Patient: I am feeling anxious about my condition. I want to seek reassurance."
result = analyze_patient_sentiment_and_intent(test_sentence)

print(json.dumps(result, indent=2))

{
  "text": "Patient: I am feeling anxious about my condition. I want to seek reassurance.",
  "sentiment": "Anxious",
  "intent": "Seeking Reassurance"
}


In [5]:
# ==========================
# 9️⃣ Save Model Weights
# ==========================
model.model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

print("Model training complete and weights saved!")

Model training complete and weights saved!
