In [1]:
pip install pandas transformers datasets torch seqeval scikit-learn

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [2]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print("✅ GPU is enabled:", torch.cuda.get_device_name(0))
else:
    print("❌ No GPU detected. Check runtime settings.")


✅ GPU is enabled: Tesla T4


In [3]:
import pandas as pd
dataset = pd.read_csv("Dataprep/cleaned_data.csv")

dataset = dataset.drop(dataset.columns[0], axis=1)
print(dataset.head())


                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  all wrong, back off dear, forward doubt. stay ...  Anxiety
3  i've shifted my focus to something else but i'...  Anxiety
4  i'm restless and restless, it's been a month n...  Anxiety


In [4]:
import spacy
from datasets import Dataset
ner = spacy.blank("en")
#created this dictionary using the EDA on most common words for each category.
#also added my own findings from experimenting with some samples about which words appeared often
symptoms = ["hopeless", "confused", "sleep", "anxious", "no motivation", "depression", "feeling", "die", "kill", "hate", "stress", "anxiety", "work", "too much", "much", "restless", "nervous", "heart", "worried", "pain", "health", "help", "fucking", "cannot", "anymore", "life", "strange", "don't know", "wrong", "never", "job", "insomnia", "sleep", "paranoid", "tired", "fatigue"]
def label_symptoms(text):
  text = str(text)
  tokens = text.split()
  labels = ["Other"] * len(tokens)

  for i, token in enumerate(tokens):
    if token.lower() in symptoms:
      labels[i] = "Symptom"

  return tokens, labels

dataset["tokens"], dataset["labels"] = zip(*dataset["statement"].apply(label_symptoms))
print(list(zip(dataset["tokens"][10], dataset["labels"][10])))

hf_dataset = Dataset.from_pandas(dataset)

[('always', 'Other'), ('restless', 'Symptom'), ('every', 'Other'), ('night,', 'Other'), ('even', 'Other'), ('though', 'Other'), ('i', 'Other'), ("don't", 'Other'), ('know', 'Other'), ('why,', 'Other'), ("what's", 'Other'), ('wrong.', 'Other'), ('strange.', 'Other')]


In [5]:
from transformers import AutoTokenizer

# Load ClinicalBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Label mapping
label_map = {"Other": 0, "Symptom": 1}

def encode(example):
    tokens = example["tokens"]
    if not isinstance(tokens, list):
        tokens = tokens.tolist()

    # Tokenize the text while preserving word mapping
    tokenized_input = tokenizer(tokens, truncation=True, is_split_into_words=True,
                                padding="max_length", max_length=128)

    label_map = {"Other": 0, "Symptom": 1}
    labels = example["labels"]
    if not isinstance(labels, list):
        labels = labels.tolist()

    # Convert labels to numerical format
    word_ids = tokenized_input.word_ids()  # Maps tokens to their original word indices
    new_labels = []

    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            new_labels.append(-100)  # Ignore special tokens
        elif word_idx != previous_word_idx:
            new_labels.append(label_map.get(labels[word_idx], 0))  # Assign label
        else:
            new_labels.append(-100)  # Ignore subwords

        previous_word_idx = word_idx

    tokenized_input["labels"] = new_labels
    return tokenized_input

# Apply encoding function
hf_dataset = hf_dataset.map(encode, batched=False)

# Split dataset into train and validation sets
dataset_split = hf_dataset.train_test_split(test_size=0.2)

train_dataset = dataset_split["train"]
val_dataset = dataset_split["test"]

print(train_dataset)
print(val_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Map:   0%|          | 0/47617 [00:00<?, ? examples/s]

Dataset({
    features: ['statement', 'status', 'tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 38093
})
Dataset({
    features: ['statement', 'status', 'tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9524
})


In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2, ignore_mismatched_sizes=True)

training_args = TrainingArguments(
    output_dir="./clinicalbert-mentalhealth",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
)

trainer.train()

trainer.save_model("./clinicalbert-mentalhealth")

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


In [None]:
def extract_symptoms(text):
    tokens = tokenizer.tokenize(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

    # Predict
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()

    # Convert tokens and predictions into readable format
    symptoms = [token for token, pred in zip(tokens, predictions) if pred == 1]  # SYMPTOM class

    return " ".join(symptoms).replace(" ##", "")

user_input = "I can't sleep and feel extremely sad."
extracted_symptoms = extract_symptoms(user_input)

print("Extracted Symptoms:", extracted_symptoms)