In [None]:
# ============================================================
# ✅ "CancerEmoBERT" Runner (dynamic across all emotions) - single run
# Same structure as your BioBERT runner.
#
# What makes it "CancerBERT-like" correctly:
# 1) Build unlabeled corpus from ALL emotion datasets (TRAIN only, leakage-safe)
# 2) Continued pretraining (MLM / DAPT) starting from BioBERT
# 3) Fine-tune the NEW domain-adapted model on each emotion (8 CSVs)
#
# Output:
# - ./CancerEmoBERT_mlm  (your cancer-specific language model)
# - canceremobert_all_emotions_single_run.csv  (summary results)
# ============================================================

# -------------------------
# 0) Install (Colab safe)
# NOTE: Do NOT upgrade numpy/pandas here. Keep Colab defaults to avoid conflicts.
# If your env is already broken, restart runtime first.
# -------------------------
!pip -q install -U transformers datasets accelerate scikit-learn

import os, random, re, gc
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# -------------------------
# 1) Reproducibility
# -------------------------
SEED = 42


def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds, in order: Python's ``random``, NumPy's global RNG, PyTorch's
    default generator and the generators of all CUDA devices.

    Args:
        seed: Integer seed handed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)


# Seed once at import time so everything below starts from a known RNG state.
set_seed(SEED)

# Report whether a CUDA device is visible to PyTorch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# -------------------------
# 2) Config
# -------------------------
# Column names expected in every emotion CSV.
TEXT_COL  = "Sentence"
SPLIT_COL = "Split"     # 0=train, 1=val, 2=test

# Base model for DAPT (start point)
BASE_MODEL = "dmis-lab/biobert-base-cased-v1.1"

# DAPT output folder (this becomes your "CancerBERT-like" model).
# The fine-tuning stage loads both tokenizer and model weights from here.
DAPT_DIR = "./CancerEmoBERT_mlm"

# DAPT/MLM params
MLM_MAX_LEN = 128          # max tokens per corpus sentence
MLM_EPOCHS  = 2
MLM_LR      = 2e-5
MLM_WD      = 0.01         # weight decay
MLM_WARMUP_RATIO = 0.06
MLM_BS      = 8            # per-device batch size
MLM_GRAD_ACC= 2            # gradient accumulation steps
MLM_PROB    = 0.15         # masking probability passed to the MLM collator

# Fine-tune params (same style as your runner)
EPOCHS = 2
TRAIN_BS = 8
EVAL_BS  = 8
WARMUP_STEPS = 100         # fixed step count, applied to every emotion regardless of dataset size
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 50
MAX_LEN = 128              # max tokens per sentence during fine-tuning

# Per-emotion fine-tuning learning rates.
# run_one_emotion looks the rate up by the dataset "name" (falling back to 2e-5),
# and the Anticipation dataset is registered below under the name "Anticip",
# which is why both spellings are listed here.
LR_BY_EMOTION = {
    "Sadness": 4e-5,
    "Joy": 5e-5,
    "Fear": 5e-5,
    "Anger": 3e-5,
    "Surprise": 5e-5,
    "Disgust": 3e-5,
    "Trust": 5e-5,
    "Anticipation": 7e-5,
    "Anticip": 7e-5,
}

# One entry per emotion:
#   "name"  - short identifier used for the LR lookup, output/log folder names and the summary table
#   "path"  - CSV file containing TEXT_COL, SPLIT_COL and the label column
#   "label" - name of the label column inside that CSV
DATASETS = [
    {"name": "Sadness",  "path": "/content/sample_data/Sadness_anon.csv",       "label": "Sadness"},
    {"name": "Joy",      "path": "/content/sample_data/Joy_anon.csv",           "label": "Joy"},
    {"name": "Fear",     "path": "/content/sample_data/Fear_anon.csv",          "label": "Fear"},
    {"name": "Anger",    "path": "/content/sample_data/Anger_anon.csv",         "label": "Anger"},
    {"name": "Surprise", "path": "/content/sample_data/Surprise_anon.csv",      "label": "Surprise"},
    {"name": "Disgust",  "path": "/content/sample_data/Disgust_anon.csv",       "label": "Disgust"},
    {"name": "Trust",    "path": "/content/sample_data/Trust_anon.csv",         "label": "Trust"},
    {"name": "Anticip",  "path": "/content/sample_data/Anticipation_anon.csv",  "label": "Anticipation"},
]

# -------------------------
# 3) Helpers
# -------------------------
def comp_metrics(eval_pred):
    """Compute binary classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits along axis 1.

    Returns:
        Dict with keys ``accuracy``, ``recall``, ``precision`` and ``f1``.
        Recall, precision and F1 use ``zero_division=0``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    metrics = {"accuracy": accuracy_score(labels, preds)}
    scorers = (
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in scorers:
        metrics[metric_name] = scorer(labels, preds, zero_division=0)
    return metrics

class EmotionDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert each encoding field of example `idx` to a tensor.
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

def read_split(csv_path, label_col):
    """Load one emotion CSV and separate it into train/val/test lists.

    Only the TEXT_COL, ``label_col`` and SPLIT_COL columns are kept, and rows
    with a missing value in any of them are dropped. Rows are assigned by
    the SPLIT_COL value: 0 = train, 1 = val, 2 = test.

    Args:
        csv_path: Path of the CSV file to read.
        label_col: Name of the label column; values are cast to ``int``.

    Returns:
        ``(df, (X_train, y_train, X_val, y_val, X_test, y_test))`` where
        ``df`` is the filtered frame and each ``X_*`` / ``y_*`` is a list of
        strings / ints.
    """
    raw = pd.read_csv(csv_path)
    df = raw[[TEXT_COL, label_col, SPLIT_COL]].dropna().reset_index(drop=True)

    parts = []
    for split_code in (0, 1, 2):
        subset = df[df[SPLIT_COL] == split_code]
        parts.append(subset[TEXT_COL].astype(str).tolist())
        parts.append(subset[label_col].astype(int).tolist())

    return df, tuple(parts)

def clean_text(s: str) -> str:
    """Return ``s`` as a string with normalized whitespace.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines, ...) is replaced by one space.
    Non-string inputs are converted with ``str()`` first.
    """
    trimmed = str(s).strip()
    return re.sub(r"\s+", " ", trimmed)

# -------------------------
# 4) Step A: Build unlabeled corpus from ALL datasets (TRAIN only)
# -------------------------
CORPUS_TXT = "cancer_corpus_all_emotions.txt"

# Cleaned TRAIN sentences from every dataset, in file order (duplicates kept).
all_sents = []
# Cleaned sentences that occur in ANY dataset's non-train (val/test) split.
held_out_sents = set()

for ds in DATASETS:
    df = pd.read_csv(ds["path"])[[TEXT_COL, SPLIT_COL]].dropna()
    cleaned = df[TEXT_COL].astype(str).map(clean_text)
    is_train = df[SPLIT_COL] == 0
    all_sents.extend(cleaned[is_train].tolist())
    held_out_sents.update(cleaned[~is_train].tolist())

# De-duplicate, drop too-short sentences, and drop every sentence that is held
# out in some dataset. Filtering each CSV on its own split column is not enough
# to be leakage-safe: the same sentence can be TRAIN in one emotion file and
# val/test in another, which would let evaluation text into the MLM corpus.
uniq = []
seen = set()
n_held_out_removed = 0
for s in all_sents:
    if len(s) < 5 or s in seen:
        continue
    seen.add(s)
    if s in held_out_sents:
        n_held_out_removed += 1
        continue
    uniq.append(s)

# clean_text collapses all whitespace to single spaces, so each sentence
# occupies exactly one line of the corpus file.
with open(CORPUS_TXT, "w", encoding="utf-8") as f:
    f.writelines(f"{s}\n" for s in uniq)

print("✅ Corpus saved:", CORPUS_TXT)
print("Raw sentences:", len(all_sents))
print("Unique sentences:", len(uniq))
print("Removed (also in a val/test split):", n_held_out_removed)

# -------------------------
# 5) Step B: DAPT / MLM continued pretraining (BioBERT -> CancerEmoBERT)
# -------------------------
print("\n" + "="*80)
print("DAPT: Continued pretraining (MLM) from:", BASE_MODEL)
print("="*80)

# The base checkpoint's tokenizer is reused as-is and later saved next to the
# adapted weights so fine-tuning loads a matching tokenizer/model pair.
tokenizer_mlm = AutoTokenizer.from_pretrained(BASE_MODEL)
ds_text = load_dataset("text", data_files={"train": CORPUS_TXT})

def tok_mlm(examples):
    """Tokenize a batch of corpus lines for masked-language-model training.

    Sequences are truncated to MLM_MAX_LEN but deliberately not padded here.
    The DataCollatorForLanguageModeling used for training pads each batch to
    the length of its longest member, so short sentences no longer cost
    MLM_MAX_LEN positions of compute each.

    Args:
        examples: Batch dict produced by ``Dataset.map(batched=True)``; the
            raw lines are under the ``"text"`` key.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer_mlm(
        examples["text"],
        truncation=True,
        max_length=MLM_MAX_LEN,
    )

# Drop the raw text column so only tokenizer fields reach the collator.
tok_ds = ds_text.map(tok_mlm, batched=True, remove_columns=["text"])

# Collator that builds MLM batches: tokens are selected for masking with
# probability MLM_PROB each time a batch is assembled.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_mlm,
    mlm=True,
    mlm_probability=MLM_PROB
)

# Continued pretraining starts from the base checkpoint's weights.
# NOTE(review): the training log captured in this notebook shows a loss of
# about 8.46 at step 100, which looks high for a checkpoint whose MLM head is
# already trained - presumably the MLM head of BASE_MODEL is newly initialized
# on load. Confirm by checking the from_pretrained warnings.
mlm_model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL)

# Training configuration for the MLM stage. Note that output_dir is DAPT_DIR,
# the same folder the final model and tokenizer are saved to below.
mlm_args = TrainingArguments(
    output_dir=DAPT_DIR,
    per_device_train_batch_size=MLM_BS,
    gradient_accumulation_steps=MLM_GRAD_ACC,
    num_train_epochs=MLM_EPOCHS,
    learning_rate=MLM_LR,
    weight_decay=MLM_WD,
    warmup_ratio=MLM_WARMUP_RATIO,
    logging_steps=100,
    save_strategy="epoch",
    report_to="none",
    fp16=torch.cuda.is_available(),
    seed=SEED,
)

# No eval dataset is passed: this stage only trains on the corpus.
mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_args,
    train_dataset=tok_ds["train"],
    data_collator=collator
)

mlm_trainer.train()

# Persist model weights and tokenizer together so DAPT_DIR can be loaded
# directly by the fine-tuning stage.
mlm_trainer.save_model(DAPT_DIR)
tokenizer_mlm.save_pretrained(DAPT_DIR)
print("✅ Saved CancerEmoBERT (DAPT) at:", DAPT_DIR)

# free memory before fine-tuning loop
del mlm_model, mlm_trainer, tok_ds, ds_text
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# -------------------------
# 6) Fine-tuning loop (dynamic across emotions) using CancerEmoBERT
# -------------------------
print("\n" + "=" * 80)
print("Fine-tuning using cancer-specific model:", DAPT_DIR)
print("=" * 80)

# Tokenizer saved alongside the domain-adapted model; shared by all emotions.
tokenizer = AutoTokenizer.from_pretrained(DAPT_DIR)


def tokenize(texts):
    """Tokenize a list of sentences for classification fine-tuning.

    Sequences are truncated to MAX_LEN tokens and padded (``padding=True``)
    so the returned encodings have a uniform length within this call.

    Args:
        texts: List of sentence strings.

    Returns:
        The tokenizer output for ``texts``.
    """
    return tokenizer(
        texts,
        max_length=MAX_LEN,
        truncation=True,
        padding=True,
    )

def run_one_emotion(ds_name, csv_path, label_col):
    """Fine-tune the domain-adapted model on one emotion and score the test split.

    Reads the CSV splits, tokenizes them, trains a 2-label sequence
    classifier initialised from DAPT_DIR with validation after every epoch,
    then evaluates once on the test split.

    Args:
        ds_name: Short dataset name; used for the learning-rate lookup in
            LR_BY_EMOTION (default 2e-5) and for output/log folder names.
        csv_path: Path to the emotion CSV.
        label_col: Name of the label column in that CSV.

    Returns:
        Dict with ``dataset``, ``n_train``, ``n_val``, ``n_test``, ``lr`` and
        the test metrics ``test_acc``, ``test_f1``, ``test_precision``,
        ``test_recall`` (each 0.0 if the metric is missing from the
        evaluation output).
    """
    rule = "=" * 80
    print("\n" + rule)
    print(f"Emotion: {ds_name} | CSV: {csv_path}")
    print(rule)

    df, splits = read_split(csv_path, label_col)
    train_texts, train_labels, val_texts, val_labels, test_texts, test_labels = splits
    n_train, n_val, n_test = len(train_texts), len(val_texts), len(test_texts)

    print("Total:", len(df))
    print("Train/Val/Test:", n_train, n_val, n_test)
    print("Train label counts:", pd.Series(train_labels).value_counts().to_dict())

    lr = LR_BY_EMOTION.get(ds_name, 2e-5)
    print("Learning rate:", lr)
    print("Model:", DAPT_DIR)

    # Tokenize each split separately, in train -> val -> test order.
    split_pairs = (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
    train_ds, val_ds, test_ds = (
        EmotionDataset(tokenize(texts), labels) for texts, labels in split_pairs
    )

    def build_classifier():
        # Fresh binary classifier on top of the domain-adapted encoder.
        return AutoModelForSequenceClassification.from_pretrained(DAPT_DIR, num_labels=2)

    # NOTE(review): warmup is a fixed WARMUP_STEPS for every emotion. With the
    # current constants (batch size 8, 2 epochs, 100 warmup steps) a training
    # split under ~400 rows yields fewer optimizer steps than the warmup
    # length, so the learning rate presumably never reaches `lr` - confirm
    # this is intended.
    training_args = TrainingArguments(
        output_dir=f"./results_CancerEmoBERT_{ds_name}",
        logging_dir=f"./logs_CancerEmoBERT_{ds_name}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=EVAL_BS,
        gradient_accumulation_steps=1,
        learning_rate=lr,
        warmup_steps=WARMUP_STEPS,
        weight_decay=WEIGHT_DECAY,
        logging_steps=LOGGING_STEPS,
        eval_strategy="epoch",
        save_strategy="no",
        report_to="none",
        fp16=torch.cuda.is_available(),
        seed=SEED,
    )

    trainer = Trainer(
        model_init=build_classifier,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=comp_metrics,
    )
    trainer.train()

    # Map the Trainer's "eval_*" keys onto the summary column names.
    test_metrics = trainer.evaluate(test_ds)
    metric_sources = {
        "test_acc": "eval_accuracy",
        "test_f1": "eval_f1",
        "test_precision": "eval_precision",
        "test_recall": "eval_recall",
    }
    clean = {
        column: float(test_metrics.get(source, 0.0))
        for column, source in metric_sources.items()
    }
    print("TEST:", clean)

    # free memory per emotion
    del trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    result = {
        "dataset": ds_name,
        "n_train": n_train,
        "n_val": n_val,
        "n_test": n_test,
        "lr": lr,
    }
    result.update(clean)
    return result

# Run every configured emotion once and collect one result row per dataset.
all_rows = []
for ds in DATASETS:
    row = run_one_emotion(ds_name=ds["name"], csv_path=ds["path"], label_col=ds["label"])
    all_rows.append(row)

# Best test F1 first; the index is rebuilt so it reflects the ranking.
summary = pd.DataFrame(all_rows)
summary = summary.sort_values("test_f1", ascending=False)
summary = summary.reset_index(drop=True)

print("\n================ SUMMARY (sorted by test_f1) ================")
display(summary)

# Persist the ranked table without the index column.
summary.to_csv("canceremobert_all_emotions_single_run.csv", index=False)
print("Saved: canceremobert_all_emotions_single_run.csv")


Device: cuda
✅ Corpus saved: cancer_corpus_all_emotions.txt
Raw sentences: 15915
Unique sentences: 10288

DAPT: Continued pretraining (MLM) from: dmis-lab/biobert-base-cased-v1.1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10288 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Step,Training Loss
100,8.4598
200,5.7172
300,5.1593
400,4.7662
500,4.6026
600,4.3783
700,4.2513
800,4.1907
900,4.191
1000,4.1347


✅ Saved CancerEmoBERT (DAPT) at: ./CancerEmoBERT_mlm

Fine-tuning using cancer-specific model: ./CancerEmoBERT_mlm

Emotion: Sadness | CSV: /content/sample_data/Sadness_anon.csv
Total: 3606
Train/Val/Test: 2884 361 361
Train label counts: {0: 1457, 1: 1427}
Learning rate: 4e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.5322,0.52828,0.739612,0.836735,0.725664,0.777251
2,0.4288,0.577605,0.750693,0.806122,0.752381,0.778325


TEST: {'test_acc': 0.7146814404432132, 'test_f1': 0.7310704960835509, 'test_precision': 0.6896551724137931, 'test_recall': 0.7777777777777778}

Emotion: Joy | CSV: /content/sample_data/Joy_anon.csv
Total: 6043
Train/Val/Test: 4834 604 605
Train label counts: {0: 2424, 1: 2410}
Learning rate: 5e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.483,0.470032,0.786424,0.733119,0.832117,0.779487
2,0.3235,0.588705,0.807947,0.839228,0.798165,0.818182


TEST: {'test_acc': 0.8165289256198347, 'test_f1': 0.8153078202995009, 'test_precision': 0.8166666666666667, 'test_recall': 0.813953488372093}

Emotion: Fear | CSV: /content/sample_data/Fear_anon.csv
Total: 5388
Train/Val/Test: 4310 539 539
Train label counts: {0: 2162, 1: 2148}
Learning rate: 5e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.5555,0.613592,0.664193,0.973077,0.592506,0.736536
2,0.4806,0.601777,0.721707,0.761538,0.692308,0.725275


TEST: {'test_acc': 0.7402597402597403, 'test_f1': 0.7594501718213058, 'test_precision': 0.7466216216216216, 'test_recall': 0.7727272727272727}

Emotion: Anger | CSV: /content/sample_data/Anger_anon.csv
Total: 837
Train/Val/Test: 669 84 84
Train label counts: {1: 344, 0: 325}
Learning rate: 3e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.7047,0.691935,0.583333,0.682927,0.56,0.615385
2,0.6291,0.714644,0.583333,0.560976,0.575,0.567901


TEST: {'test_acc': 0.6785714285714286, 'test_f1': 0.5970149253731343, 'test_precision': 0.6060606060606061, 'test_recall': 0.5882352941176471}

Emotion: Surprise | CSV: /content/sample_data/Surprise_anon.csv
Total: 826
Train/Val/Test: 614 102 110
Train label counts: {0: 307, 1: 307}
Learning rate: 5e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.6904,0.637675,0.627451,0.372549,0.76,0.5
2,0.5243,0.587505,0.705882,0.705882,0.705882,0.705882


TEST: {'test_acc': 0.7272727272727273, 'test_f1': 0.7321428571428571, 'test_precision': 0.7192982456140351, 'test_recall': 0.7454545454545455}

Emotion: Disgust | CSV: /content/sample_data/Disgust_anon.csv
Total: 891
Train/Val/Test: 735 90 66
Train label counts: {1: 369, 0: 366}
Learning rate: 3e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.6975,0.684393,0.555556,0.888889,0.533333,0.666667
2,0.6344,0.6261,0.611111,0.711111,0.592593,0.646465


TEST: {'test_acc': 0.6212121212121212, 'test_f1': 0.5901639344262295, 'test_precision': 0.6428571428571429, 'test_recall': 0.5454545454545454}

Emotion: Trust | CSV: /content/sample_data/Trust_anon.csv
Total: 1887
Train/Val/Test: 1509 189 189
Train label counts: {1: 756, 0: 753}
Learning rate: 5e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,0.709,0.668753,0.62963,0.642105,0.628866,0.635417
2,0.6005,0.692507,0.62963,0.621053,0.634409,0.62766


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TEST: {'test_acc': 0.6455026455026455, 'test_f1': 0.6598984771573604, 'test_precision': 0.625, 'test_recall': 0.6989247311827957}

Emotion: Anticip | CSV: /content/sample_data/Anticipation_anon.csv
Total: 436
Train/Val/Test: 360 34 42
Train label counts: {0: 180, 1: 180}
Learning rate: 7e-05
Model: ./CancerEmoBERT_mlm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./CancerEmoBERT_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1,No log,0.651181,0.647059,0.764706,0.619048,0.684211
2,0.672100,0.822455,0.764706,0.705882,0.8,0.75


TEST: {'test_acc': 0.7380952380952381, 'test_f1': 0.7027027027027027, 'test_precision': 0.8125, 'test_recall': 0.6190476190476191}



Unnamed: 0,dataset,n_train,n_val,n_test,lr,test_acc,test_f1,test_precision,test_recall
0,Joy,4834,604,605,5e-05,0.816529,0.815308,0.816667,0.813953
1,Fear,4310,539,539,5e-05,0.74026,0.75945,0.746622,0.772727
2,Surprise,614,102,110,5e-05,0.727273,0.732143,0.719298,0.745455
3,Sadness,2884,361,361,4e-05,0.714681,0.73107,0.689655,0.777778
4,Anticip,360,34,42,7e-05,0.738095,0.702703,0.8125,0.619048
5,Trust,1509,189,189,5e-05,0.645503,0.659898,0.625,0.698925
6,Anger,669,84,84,3e-05,0.678571,0.597015,0.606061,0.588235
7,Disgust,735,90,66,3e-05,0.621212,0.590164,0.642857,0.545455


Saved: canceremobert_all_emotions_single_run.csv
