# installations

In [None]:
# !pip install pandas
# # ------------------------------------------------------------
# # 0. Install (once) if needed
# !pip install -q transformers datasets evaluate scikit-learn torch
# # ------------------------------------------------------------
# !pip install transformers[torch]
# !pip install accelerate>=0.26.0
# !pip install openpyxl

# uploading data

In [None]:
import pandas as pd

# Use raw string (r'...') to avoid issues with backslashes in Windows paths
# file_path = r'FilteredData100.xlsx'
file_path = r'/home/eldord/Big-Data-Abstracts-Classification/data/FilteredData100.xlsx' # Adjust the path as needed


# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Preview the first few rows
print(df.head())


   Y1  Y2    Y    Domain                     area  \
0   0  12   12       CS    Symbolic computation     
1   5   2   74  Medical     Alzheimer's Disease     
2   4   7   68    Civil          Green Building     
3   1  10   26      ECE          Electric motor     
4   5  43  115  Medical     Parkinson's Disease     

                                            keywords  \
0   (2+1)-dimensional non-linear optical waves; e...   
1   Aging; Tau; Amyloid; PET; Alzheimer's disease...   
2   LED lighting system; PV system; Distributed l...   
3   NdFeB magnets; Electric motor; Electric vehic...   
4   Parkinson's disease; dyskinesia; adenosine A(...   

                                            Abstract  
0  (2 + 1)-dimensional non-linear optical waves t...  
1  (beta-amyloid (A beta) and tau pathology becom...  
2  (D)ecreasing of energy consumption and environ...  
3  (Hybrid) electric vehicles are assumed to play...  
4  (L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema...  


# BERT

## domain

In [9]:

import os, random, numpy as np, pandas as pd, torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
import evaluate

# --------------------------- CONFIG --------------------------
SEED         = 42
MODEL_NAME   = "bert-base-uncased"          # vanilla BERT
LABEL_COL    = "Domain"                     # <- CHANGE if your label lives elsewhere
TEXT_COL     = "Abstract"
BATCH_SIZE   = 8
EPOCHS       = 3
LR           = 2e-5
# -------------------------------------------------------------

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# 1.-- Prepare the data -------------------------------------------------------
assert LABEL_COL in df.columns, f"{LABEL_COL} not found in dataframe"

# map label strings → integer ids
label_list        = sorted(df[LABEL_COL].unique())
label2id          = {l:i for i,l in enumerate(label_list)}
id2label          = {i:l for l,i in label2id.items()}
df["label"]       = df[LABEL_COL].map(label2id)

# train / val / test split (70/15/15)
train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df["label"],
                                     random_state=SEED)
val_df, test_df   = train_test_split(temp_df, test_size=0.50, stratify=temp_df["label"],
                                     random_state=SEED)

hf_dset = DatasetDict({
    "train": Dataset.from_pandas(train_df[[TEXT_COL, "label"]]),
    "validation": Dataset.from_pandas(val_df[[TEXT_COL, "label"]]),
    "test": Dataset.from_pandas(test_df[[TEXT_COL, "label"]])
})

# 2.-- Tokenisation -----------------------------------------------------------
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tok(batch[TEXT_COL], truncation=True)
hf_dset_tok = hf_dset.map(tokenize, batched=True, remove_columns=[TEXT_COL])
data_collator = DataCollatorWithPadding(tok)

# 3.-- Model -----------------------------------------------------------------
model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id)

metric_acc  = evaluate.load("accuracy")
metric_f1   = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "macro_f1": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

# 4.-- Helper: quick evaluation ---------------------------------------------
def quick_eval(split: str, tag: str):
    trainer = Trainer(
        model=model,
        tokenizer=tok,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    print(f"\n[{tag}] Evaluating on {split} set …")
    metrics = trainer.evaluate(hf_dset_tok[split], metric_key_prefix=tag)
    for k,v in metrics.items():
        if k.startswith(tag):
            print(f"{k:<20}: {v:.4f}")
    return metrics

# Baseline (zero-shot) on val & test
baseline_val  = quick_eval("validation", tag="zero")
baseline_test = quick_eval("test",        tag="zero")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 32634/32634 [00:08<00:00, 3728.65 examples/s]
Map: 100%|██████████| 6993/6993 [00:01<00:00, 3537.72 examples/s]
Map: 100%|██████████| 6993/6993 [00:01<00:00, 3760.13 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 27.7MB/s]
Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<00:00, 39.4MB/s]
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true 


[zero] Evaluating on validation set …


zero_loss           : 2.1356
zero_model_preparation_time: 0.0024
zero_accuracy       : 0.0781
zero_macro_f1       : 0.0329
zero_runtime        : 28.0963
zero_samples_per_second: 248.8940
zero_steps_per_second: 31.1430

[zero] Evaluating on test set …


  trainer = Trainer(


zero_loss           : 2.1388
zero_model_preparation_time: 0.0022
zero_accuracy       : 0.0766
zero_macro_f1       : 0.0324
zero_runtime        : 23.7920
zero_samples_per_second: 293.9220
zero_steps_per_second: 36.7770


In [11]:

# 5.-- Fine-tune --------------------------------------------------------------
args = TrainingArguments(
    output_dir            = "bert_abstract_cls",
    eval_strategy   = "epoch",
    save_strategy         = "no",
    learning_rate         = LR,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size  = BATCH_SIZE,
    num_train_epochs      = EPOCHS,
    seed                  = SEED,
    weight_decay          = 0.01,
    logging_steps         = 50,
)

trainer = Trainer(
    model               = model,
    args                = args,
    train_dataset       = hf_dset_tok["train"],
    eval_dataset        = hf_dset_tok["validation"],
    tokenizer           = tok,
    data_collator       = data_collator,
    compute_metrics     = compute_metrics,
)

trainer.train()

# 6.-- Post-training evaluation ----------------------------------------------
fin_val  = quick_eval("validation", tag="fine")
fin_test = quick_eval("test",        tag="fine")

# 7.-- Simple side-by-side comparison ----------------------------------------
def compare(before, after, split):
    print(f"\n=== {split.upper()} SET ===")
    for m in ("accuracy","macro_f1"):
        b = before[f"zero_{m}"]; a = after[f"fine_{m}"]
        diff = a - b
        print(f"{m:>10}: {b:.4f} → {a:.4f}   (Δ {diff:+.4f})")

compare(baseline_val,  fin_val,  "validation")
compare(baseline_test, fin_test, "test")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.5568,0.351497,0.896039,0.89664
2,0.2091,0.365154,0.910053,0.91285
3,0.1595,0.411463,0.913199,0.916731



[fine] Evaluating on validation set …


  trainer = Trainer(


fine_loss           : 0.4115
fine_model_preparation_time: 0.0022
fine_accuracy       : 0.9132
fine_macro_f1       : 0.9167
fine_runtime        : 23.7944
fine_samples_per_second: 293.8930
fine_steps_per_second: 36.7730

[fine] Evaluating on test set …


  trainer = Trainer(


fine_loss           : 0.3970
fine_model_preparation_time: 0.0022
fine_accuracy       : 0.9143
fine_macro_f1       : 0.9169
fine_runtime        : 23.8635
fine_samples_per_second: 293.0420
fine_steps_per_second: 36.6670

=== VALIDATION SET ===
  accuracy: 0.0781 → 0.9132   (Δ +0.8351)
  macro_f1: 0.0329 → 0.9167   (Δ +0.8839)

=== TEST SET ===
  accuracy: 0.0766 → 0.9143   (Δ +0.8377)
  macro_f1: 0.0324 → 0.9169   (Δ +0.8845)


## area

In [None]:

import os, random, numpy as np, pandas as pd, torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
import evaluate

# --------------------------- CONFIG --------------------------
SEED         = 42
MODEL_NAME   = "bert-base-uncased"          # vanilla BERT
LABEL_COL    = "area"                     # <- CHANGE if your label lives elsewhere
TEXT_COL     = "Abstract"
BATCH_SIZE   = 8
EPOCHS       = 3
LR           = 2e-5
# -------------------------------------------------------------

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# 1.-- Prepare the data -------------------------------------------------------
assert LABEL_COL in df.columns, f"{LABEL_COL} not found in dataframe"

# map label strings → integer ids
label_list        = sorted(df[LABEL_COL].unique())
label2id          = {l:i for i,l in enumerate(label_list)}
id2label          = {i:l for l,i in label2id.items()}
df["label"]       = df[LABEL_COL].map(label2id)

# train / val / test split (70/15/15)
train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df["label"],
                                     random_state=SEED)
val_df, test_df   = train_test_split(temp_df, test_size=0.50, stratify=temp_df["label"],
                                     random_state=SEED)

hf_dset = DatasetDict({
    "train": Dataset.from_pandas(train_df[[TEXT_COL, "label"]]),
    "validation": Dataset.from_pandas(val_df[[TEXT_COL, "label"]]),
    "test": Dataset.from_pandas(test_df[[TEXT_COL, "label"]])
})

# 2.-- Tokenisation -----------------------------------------------------------
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tok(batch[TEXT_COL], truncation=True)
hf_dset_tok = hf_dset.map(tokenize, batched=True, remove_columns=[TEXT_COL])
data_collator = DataCollatorWithPadding(tok)

# 3.-- Model -----------------------------------------------------------------
model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id)

metric_acc  = evaluate.load("accuracy")
metric_f1   = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "macro_f1": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

# 4.-- Helper: quick evaluation ---------------------------------------------
def quick_eval(split: str, tag: str):
    trainer = Trainer(
        model=model,
        tokenizer=tok,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    print(f"\n[{tag}] Evaluating on {split} set …")
    metrics = trainer.evaluate(hf_dset_tok[split], metric_key_prefix=tag)
    for k,v in metrics.items():
        if k.startswith(tag):
            print(f"{k:<20}: {v:.4f}")
    return metrics

# Baseline (zero-shot) on val & test
baseline_val  = quick_eval("validation", tag="zero")
baseline_test = quick_eval("test",        tag="zero")


# BERT-base-uncased-MNLI

## domain

In [1]:
# ===========================================================================
# Web-of-Science   ·   Domain-level baseline → fine-tune
# ===========================================================================

# !pip install -qq datasets transformers evaluate tqdm

import random, os, numpy as np, torch, evaluate
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, pipeline)
from tqdm.auto import tqdm

SEED            = 42
VARIANT         = "WOS46985"               # "WOS11967" / "WOS5736" also work
ZS_MODEL_NAME   = "textattack/bert-base-uncased-MNLI"
BATCH_SIZE_ZS   = 64                       # >= 32 is safe with a 12-GB GPU
BATCH_SIZE_FT   = 8
EPOCHS          = 10
LR              = 2e-5

rng = random.Random(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---------------------------------------------------------------------------
# 1.  LOAD YOUR EXCEL SHEET  (instead of load_dataset)  ----------------------
# ---------------------------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

SEED = 42                                    # keep the same seed everywhere
TEXT_COL  = "Abstract"                       # column that holds the text
LABEL_COL = "Domain"                         # ← change to "area" later on
FILE_PATH = r"FilteredData100.xlsx"          # your file

# -- read the sheet ----------------------------------------------------------
df = pd.read_excel(FILE_PATH)[[TEXT_COL, LABEL_COL]].dropna()

# -- label ↔ id mappings -----------------------------------------------------
# label_list = sorted(df[LABEL_COL].unique())
# label2id   = {l:i for i,l in enumerate(label_list)}
# id2label   = {i:l for l,i in label2id.items()}
# df["label"] = df[LABEL_COL].map(label2id)

# ------------------------------------------------------------------
# clean up label strings and build the canonical list  -------------
# ------------------------------------------------------------------
df[LABEL_COL] = df[LABEL_COL].str.strip()            # remove leading/trailing spaces
label_list  = sorted(df[LABEL_COL].unique())         # canonical list (7 labels)
label2id    = {l: i for i, l in enumerate(label_list)}
id2label    = {i: l for l, i in label2id.items()}
df["label"] = df[LABEL_COL].map(label2id)

print("Domain labels:", label_list)   # sanity-check


# -- split 70 / 15 / 15 ------------------------------------------------------
train_df, temp_df = train_test_split(
        df, test_size=0.30, stratify=df["label"], random_state=SEED)
val_df,   test_df = train_test_split(
        temp_df, test_size=0.50, stratify=temp_df["label"], random_state=SEED)

hf_dset = DatasetDict({
    "train":      Dataset.from_pandas(train_df[[TEXT_COL, "label"]],
                                      preserve_index=False),
    "validation": Dataset.from_pandas(val_df[[TEXT_COL, "label"]],
                                      preserve_index=False),
    "test":       Dataset.from_pandas(test_df[[TEXT_COL, "label"]],
                                      preserve_index=False),
})

print(f"Loaded {len(df):,} rows   –   {len(label_list)} unique labels:")
print(label_list)


# ---------------------------------------------------------------------------
# 2.  ZERO-SHOT evaluation with MNLI-BERT   (fast: N × 7 only) --------------
# ---------------------------------------------------------------------------

mnli = AutoModelForSequenceClassification.from_pretrained(ZS_MODEL_NAME)
mnli.config.label2id = {"contradiction": 0, "neutral": 1, "entailment": 2}
mnli.config.id2label = {v:k for k,v in mnli.config.label2id.items()}

zs_pipe = pipeline("zero-shot-classification",
                   model     = mnli,
                   tokenizer = ZS_MODEL_NAME,
                   device    = 0 if torch.cuda.is_available() else -1,
                   batch_size= BATCH_SIZE_ZS)

def zs_predict(texts, labels=label_list):
    preds = []
    for i in tqdm(range(0, len(texts), BATCH_SIZE_ZS), desc="zero-shot"):
        chunk = texts[i:i+BATCH_SIZE_ZS]
        outs  = zs_pipe(chunk,
                        candidate_labels = labels,
                        hypothesis_template = "This abstract is about {}.",
                        multi_label=False)
        preds.extend(label2id[o["labels"][0]] for o in outs)
    return preds

metric_acc = evaluate.load("accuracy");  metric_f1 = evaluate.load("f1")

# def eval_zs(split):
#     refs  = hf_dset[split]["label"]
#     preds = zs_predict(hf_dset[split]["text"])
#     return {
#         "zero_accuracy": metric_acc.compute(predictions=preds, references=refs)["accuracy"],
#         "zero_macro_f1": metric_f1.compute(predictions=preds, references=refs,
#                                            average="macro")["f1"]
#     }

def eval_zs(split):
    refs  = hf_dset[split]["label"]
    preds = zs_predict(hf_dset[split][TEXT_COL])     # ← changed
    return {
        "zero_accuracy": metric_acc.compute(predictions=preds, references=refs)["accuracy"],
        "zero_macro_f1": metric_f1.compute(predictions=preds, references=refs,
                                           average="macro")["f1"]
    }

print("\n>>> ZERO-SHOT (domain-level)")
baseline_val  = eval_zs("validation");  print("validation:", baseline_val)
baseline_test = eval_zs("test");        print("test:      ", baseline_test)

# ---------------------------------------------------------------------------
# 3.  Tokeniser & fine-tune --------------------------------------------------
# ---------------------------------------------------------------------------
tok = AutoTokenizer.from_pretrained(ZS_MODEL_NAME)

# def tok_fn(batch): return tok(batch["text"], truncation=True)
# ds_tok = hf_dset.map(tok_fn, batched=True, remove_columns=["text"])

def tok_fn(batch):                       # tokenise *Abstract* column
    return tok(batch[TEXT_COL], truncation=True)

ds_tok = hf_dset.map(tok_fn, batched=True, remove_columns=[TEXT_COL])

from transformers import DataCollatorWithPadding
collator = DataCollatorWithPadding(tok)

model = AutoModelForSequenceClassification.from_pretrained(
            ZS_MODEL_NAME,
            # num_labels = len(dom_names),
            num_labels = len(label_list),  # 7-way classification
            id2label   = id2label,
            label2id   = label2id,
            ignore_mismatched_sizes = True)          # swaps 3-way head → 7-way

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = np.argmax(logits, axis=-1)
#     return {"accuracy": metric_acc.compute(preds, labels)["accuracy"],
#             "macro_f1": metric_f1.compute(preds, labels, average="macro")["f1"]}

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds,
                                       references=labels)["accuracy"],
        "macro_f1": metric_f1.compute(predictions=preds,
                                      references=labels,
                                      average="macro")["f1"],
    }


args = TrainingArguments(
    output_dir                 = "bert_wos_domains",
    learning_rate              = LR,
    per_device_train_batch_size= BATCH_SIZE_FT,
    per_device_eval_batch_size = BATCH_SIZE_FT,
    eval_strategy        = "epoch",
    save_strategy              = "epoch",
    num_train_epochs           = EPOCHS,
    seed                       = SEED,
    weight_decay               = 0.01,
    logging_steps              = 100,
)

trainer = Trainer(model=model, args=args,
                  train_dataset = ds_tok["train"],
                  eval_dataset  = ds_tok["validation"],
                  tokenizer     = tok,
                  data_collator = collator,
                  compute_metrics = compute_metrics)

trainer.train()

print("\n>>> POST-TUNING EVALUATION")
fin_val  = trainer.evaluate(ds_tok["validation"], metric_key_prefix="fine")
fin_test = trainer.evaluate(ds_tok["test"],        metric_key_prefix="fine")

def compare(before, after, split):
    print(f"\n=== {split.upper()} ===")
    for m in ("accuracy", "macro_f1"):
        b = before["zero_"+m];  a = after["fine_"+m]
        print(f"{m:>10}: {b:.4f} → {a:.4f}   (Δ {a-b:+.4f})")

compare(baseline_val,  fin_val,  "validation")
compare(baseline_test, fin_test, "test")


  from .autonotebook import tqdm as notebook_tqdm


Domain labels: ['CS', 'Civil', 'ECE', 'MAE', 'Medical', 'Psychology', 'biochemistry']
Loaded 46,620 rows   –   7 unique labels:
['CS', 'Civil', 'ECE', 'MAE', 'Medical', 'Psychology', 'biochemistry']


Device set to use cuda:0



>>> ZERO-SHOT (domain-level)


zero-shot:   9%|▉         | 10/110 [00:15<02:36,  1.57s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
zero-shot: 100%|██████████| 110/110 [02:51<00:00,  1.56s/it]


validation: {'zero_accuracy': 0.25368225368225367, 'zero_macro_f1': 0.1847543961810419}


zero-shot: 100%|██████████| 110/110 [02:53<00:00,  1.57s/it]


test:       {'zero_accuracy': 0.2621192621192621, 'zero_macro_f1': 0.19189073650322144}


Map: 100%|██████████| 32634/32634 [00:24<00:00, 1336.95 examples/s]
Map: 100%|██████████| 6993/6993 [00:05<00:00, 1302.52 examples/s]
Map: 100%|██████████| 6993/6993 [00:05<00:00, 1328.07 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at textattack/bert-base-uncased-MNLI and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.3718,0.38234,0.889175,0.890532
2,0.3363,0.370578,0.90705,0.909362
3,0.1967,0.426103,0.904619,0.906551
4,0.1781,0.494778,0.907479,0.910257
5,0.1161,0.573084,0.903332,0.907134
6,0.0544,0.607182,0.906621,0.910623
7,0.0485,0.681293,0.907622,0.910343
8,0.0176,0.711535,0.908766,0.912104
9,0.0059,0.79302,0.911912,0.915305
10,0.0012,0.808164,0.909481,0.912498



>>> POST-TUNING EVALUATION



=== VALIDATION ===
  accuracy: 0.2537 → 0.9095   (Δ +0.6558)
  macro_f1: 0.1848 → 0.9125   (Δ +0.7277)

=== TEST ===
  accuracy: 0.2621 → 0.9105   (Δ +0.6484)
  macro_f1: 0.1919 → 0.9122   (Δ +0.7203)


## area

In [1]:
# ===========================================================================
# Web-of-Science   ·   Domain-level baseline → fine-tune
# ===========================================================================

# !pip install -qq datasets transformers evaluate tqdm

import random, os, numpy as np, torch, evaluate
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, pipeline)
from tqdm.auto import tqdm

SEED            = 42
VARIANT         = "WOS46985"               # "WOS11967" / "WOS5736" also work
ZS_MODEL_NAME   = "textattack/bert-base-uncased-MNLI"
BATCH_SIZE_ZS   = 64                       # >= 32 is safe with a 12-GB GPU
BATCH_SIZE_FT   = 8
EPOCHS          = 10
LR              = 2e-5

rng = random.Random(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---------------------------------------------------------------------------
# 1.  LOAD YOUR EXCEL SHEET  (instead of load_dataset)  ----------------------
# ---------------------------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

SEED = 42                                    # keep the same seed everywhere
TEXT_COL  = "Abstract"                       # column that holds the text
LABEL_COL = "area"                         # ← change to "area" later on
FILE_PATH = r"FilteredData100.xlsx"          # your file

# -- read the sheet ----------------------------------------------------------
df = pd.read_excel(FILE_PATH)[[TEXT_COL, LABEL_COL]].dropna()

# ------------------------------------------------------------------
# clean up label strings and build the canonical list  -------------
# ------------------------------------------------------------------
# df[LABEL_COL] = df[LABEL_COL].str.strip()            # remove leading/trailing spaces
# label_list  = sorted(df[LABEL_COL].unique())         # canonical list (7 labels)
# label2id    = {l: i for i, l in enumerate(label_list)}
# id2label    = {i: l for l, i in label2id.items()}
# df["label"] = df[LABEL_COL].map(label2id)

# print("Domain labels:", label_list)   # sanity-check

# Clean area names and build readable names
df[LABEL_COL] = df[LABEL_COL].str.strip()

def prettify_area(code):
    rep = (("BIOCHEM", "biochemistry"), ("MOL", "molecular"), ("BIOL", "biology"),
           ("CS", "computer science"), ("NLP", "natural language processing"),
           ("IR", "information retrieval"), ("EE", "electrical engineering"),
           ("MECH", "mechanical engineering"), ("MED", "medical science"),
           ("CHEM", "chemistry"), ("PHY", "physics"))
    txt = code.lower().replace('-', ' ')
    for a, b in rep:
        txt = txt.replace(a.lower(), b)
    return txt.replace("  ", " ").strip()

label_list  = sorted(df[LABEL_COL].unique())              # 134 areas
nice_names  = {label: prettify_area(label) for label in label_list}
labels_for_zs = [nice_names[label] for label in label_list]

label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}
df["label"] = df[LABEL_COL].map(label2id)


# -- split 70 / 15 / 15 ------------------------------------------------------
train_df, temp_df = train_test_split(
        df, test_size=0.30, stratify=df["label"], random_state=SEED)
val_df,   test_df = train_test_split(
        temp_df, test_size=0.50, stratify=temp_df["label"], random_state=SEED)

hf_dset = DatasetDict({
    "train":      Dataset.from_pandas(train_df[[TEXT_COL, "label"]],
                                      preserve_index=False),
    "validation": Dataset.from_pandas(val_df[[TEXT_COL, "label"]],
                                      preserve_index=False),
    "test":       Dataset.from_pandas(test_df[[TEXT_COL, "label"]],
                                      preserve_index=False),
})

print(f"Loaded {len(df):,} rows   –   {len(label_list)} unique labels:")
print(label_list)


# ---------------------------------------------------------------------------
# 2.  ZERO-SHOT evaluation with MNLI-BERT   (fast: N × 7 only) --------------
# ---------------------------------------------------------------------------

mnli = AutoModelForSequenceClassification.from_pretrained(ZS_MODEL_NAME)
mnli.config.label2id = {"contradiction": 0, "neutral": 1, "entailment": 2}
mnli.config.id2label = {v:k for k,v in mnli.config.label2id.items()}

zs_pipe = pipeline("zero-shot-classification",
                   model     = mnli,
                   tokenizer = ZS_MODEL_NAME,
                   device    = 0 if torch.cuda.is_available() else -1,
                   batch_size= BATCH_SIZE_ZS)

# def zs_predict(texts, labels=label_list):
#     preds = []
#     for i in tqdm(range(0, len(texts), BATCH_SIZE_ZS), desc="zero-shot"):
#         chunk = texts[i:i+BATCH_SIZE_ZS]
#         outs  = zs_pipe(chunk,
#                         candidate_labels = labels,
#                         hypothesis_template = "This abstract is about {}.",
#                         multi_label=False)
#         preds.extend(label2id[o["labels"][0]] for o in outs)
#     return preds

def zs_predict(texts, labels=labels_for_zs):
    preds = []
    for i in tqdm(range(0, len(texts), BATCH_SIZE_ZS), desc="zero-shot"):
        chunk = texts[i:i+BATCH_SIZE_ZS]
        outs = zs_pipe(chunk,
                       candidate_labels = labels,
                       hypothesis_template = "This abstract is about {}.",
                       multi_label=False)
        for o in outs:
            predicted_nl = o["labels"][0]
            original = next(k for k, v in nice_names.items() if v == predicted_nl)
            preds.append(label2id[original])
    return preds


metric_acc = evaluate.load("accuracy");  metric_f1 = evaluate.load("f1")


def eval_zs(split):
    refs  = hf_dset[split]["label"]
    preds = zs_predict(hf_dset[split][TEXT_COL])     # ← changed
    return {
        "zero_accuracy": metric_acc.compute(predictions=preds, references=refs)["accuracy"],
        "zero_macro_f1": metric_f1.compute(predictions=preds, references=refs,
                                           average="macro")["f1"]
    }

print("\n>>> ZERO-SHOT (domain-level)")
baseline_val  = eval_zs("validation");  print("validation:", baseline_val)
baseline_test = eval_zs("test");        print("test:      ", baseline_test)

# ---------------------------------------------------------------------------
# 3.  Tokeniser & fine-tune --------------------------------------------------
# ---------------------------------------------------------------------------
tok = AutoTokenizer.from_pretrained(ZS_MODEL_NAME)


def tok_fn(batch):                       # tokenise *Abstract* column
    return tok(batch[TEXT_COL], truncation=True)

ds_tok = hf_dset.map(tok_fn, batched=True, remove_columns=[TEXT_COL])

from transformers import DataCollatorWithPadding
collator = DataCollatorWithPadding(tok)

model = AutoModelForSequenceClassification.from_pretrained(
            ZS_MODEL_NAME,
            # num_labels = len(dom_names),
            num_labels = len(label_list),  # 7-way classification
            id2label   = id2label,
            label2id   = label2id,
            ignore_mismatched_sizes = True)          # swaps 3-way head → 7-way


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds,
                                       references=labels)["accuracy"],
        "macro_f1": metric_f1.compute(predictions=preds,
                                      references=labels,
                                      average="macro")["f1"],
    }


args = TrainingArguments(
    output_dir                 = "bert_wos_domains",
    learning_rate              = LR,
    per_device_train_batch_size= BATCH_SIZE_FT,
    per_device_eval_batch_size = BATCH_SIZE_FT,
    eval_strategy        = "epoch",
    save_strategy              = "epoch",
    num_train_epochs           = EPOCHS,
    seed                       = SEED,
    weight_decay               = 0.01,
    logging_steps              = 100,
)

trainer = Trainer(model=model, args=args,
                  train_dataset = ds_tok["train"],
                  eval_dataset  = ds_tok["validation"],
                  tokenizer     = tok,
                  data_collator = collator,
                  compute_metrics = compute_metrics)

trainer.train()

print("\n>>> POST-TUNING EVALUATION")
fin_val  = trainer.evaluate(ds_tok["validation"], metric_key_prefix="fine")
fin_test = trainer.evaluate(ds_tok["test"],        metric_key_prefix="fine")

def compare(before, after, split):
    print(f"\n=== {split.upper()} ===")
    for m in ("accuracy", "macro_f1"):
        b = before["zero_"+m];  a = after["fine_"+m]
        print(f"{m:>10}: {b:.4f} → {a:.4f}   (Δ {a-b:+.4f})")

compare(baseline_val,  fin_val,  "validation")
compare(baseline_test, fin_test, "test")


  from .autonotebook import tqdm as notebook_tqdm


Loaded 46,620 rows   –   130 unique labels:
['Addiction', 'Algorithm design', 'Allergies', "Alzheimer's Disease", 'Ambient Intelligence', 'Analog signal processing', 'Ankylosing Spondylitis', 'Antisocial personality disorder', 'Anxiety', 'Asthma', 'Atopic Dermatitis', 'Atrial Fibrillation', 'Attention', 'Autism', 'Bioinformatics', 'Bipolar Disorder', 'Birth Control', 'Borderline personality disorder', 'Cancer', 'Cell biology', 'Child abuse', "Children's Health", 'Computer graphics', 'Computer programming', 'Computer vision', 'Construction Management', 'Control engineering', "Crohn's Disease", 'Cryptography', 'Data structures', 'Dementia', 'Depression', 'Diabetes', 'Digital control', 'Distributed computing', 'Eating disorders', 'Electric motor', 'Electrical circuits', 'Electrical generator', 'Electrical network', 'Electricity', 'Emergency Contraception', 'Enzymology', 'False memories', 'Fluid mechanics', 'Fungal Infection', 'Gender roles', 'Genetics', 'Geotextile', 'Green Building', 'HI

Device set to use cuda:0



>>> ZERO-SHOT (domain-level)


zero-shot:   9%|▉         | 10/110 [03:52<39:18, 23.59s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
zero-shot: 100%|██████████| 110/110 [41:26<00:00, 22.60s/it]


validation: {'zero_accuracy': 0.009867009867009866, 'zero_macro_f1': 0.007130797996498986}


zero-shot: 100%|██████████| 110/110 [41:05<00:00, 22.41s/it]


test:       {'zero_accuracy': 0.011011011011011011, 'zero_macro_f1': 0.00813028580260372}


Map: 100%|██████████| 32634/32634 [00:24<00:00, 1316.34 examples/s]
Map: 100%|██████████| 6993/6993 [00:05<00:00, 1326.87 examples/s]
Map: 100%|██████████| 6993/6993 [00:05<00:00, 1327.30 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at textattack/bert-base-uncased-MNLI and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([130, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([130]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.2152,1.12241,0.774346,0.750035
2,0.7788,0.869723,0.807522,0.799661
3,0.6125,0.857541,0.817532,0.808285
4,0.4789,0.89427,0.821393,0.81291
5,0.3693,0.984421,0.823109,0.81374
6,0.1842,1.029975,0.821393,0.812302
7,0.213,1.065495,0.827256,0.818446
8,0.1221,1.103335,0.828257,0.820075
9,0.097,1.145008,0.82554,0.817305
10,0.0552,1.156214,0.827399,0.819137



>>> POST-TUNING EVALUATION



=== VALIDATION ===
  accuracy: 0.0099 → 0.8274   (Δ +0.8175)
  macro_f1: 0.0071 → 0.8191   (Δ +0.8120)

=== TEST ===
  accuracy: 0.0110 → 0.8215   (Δ +0.8105)
  macro_f1: 0.0081 → 0.8114   (Δ +0.8033)
