<a href="https://colab.research.google.com/github/TakeMingOff/US-AI-Patents/blob/main/Notebook%20E.%20Text%20Classification%20with%20New%20Transformer%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install -U "transformers" "datasets" "accelerate" "evaluate" "scikit-learn"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m133.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m108.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# --- 环境 & 随机种子 ---
import os, gc, random, numpy as np, pandas as pd, torch, inspect, torch.nn as nn
import json, shutil

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.backends.cuda.matmul.allow_tf32 = True
if hasattr(torch.backends, "cudnn"):
    torch.backends.cudnn.benchmark = True
random.seed(1); np.random.seed(1); torch.manual_seed(1)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(1)

# --- 与 Notebook D 路径保持一致 ---
os.chdir("/content/drive/MyDrive/USPTO_data")

# --- 读数据（列名与 D 完全一致）---
TrainingData   = pd.read_csv("./Training_Data/4K Patents - AI 20p.csv")
IDs            = np.array(TrainingData['app number'].values.tolist())
Abstract_Text  = TrainingData['abstract'].astype(str).values.tolist()
Classes        = TrainingData['actual'].values.tolist()

# --- 输出路径（与 D 一致） ---
os.makedirs("./Output/Model Performance", exist_ok=True)
os.makedirs("./Output/Classification Output", exist_ok=True)
RESULTS_CSV            = "./Output/Model Performance/New Transformer Classification Model Performance.csv"
CLASSIFIED_VALUES_CSV  = "./Output/Classification Output/New Transformer Classification Results.csv"
MODEL_DIR_FMT          = "./Output/Models/{model}/fold_{fold}"
BEST_ROOT = "./Output/Best Models"
os.makedirs(BEST_ROOT, exist_ok=True)

print("WD:", os.getcwd(), "  rows:", len(TrainingData))

# --- 依赖 ---
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
)
from packaging.version import parse as vparse
import transformers
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# 兼容新老 Transformers：4.46+ 用 eval_strategy
EVAL_KEY = "eval_strategy" if vparse(transformers.__version__) >= vparse("4.46.0") else "evaluation_strategy"

# --- 标签统一到 1 维 0/1（兼容 one-hot/概率列；若原本就是 0/1 会直接返回）---
def normalize_binary_labels(arr):
    a = np.asarray(arr)
    if a.ndim == 1 and a.dtype == object:
        try: a = np.stack(a)
        except Exception: pass
    if a.ndim == 2 and a.shape[1] == 2:
        if np.array_equal(a, a.astype(int)):   # one-hot
            a = a.argmax(axis=1).astype(int)
        else:                                   # 概率
            a = (a[:, 1] >= 0.5).astype(int)
    if a.ndim != 1:
        raise ValueError(f"标签仍不是 1 维：{a.shape}")
    return a.astype(int)

LABELS_1D = normalize_binary_labels(Classes)

# --- logits -> 正类概率（算 AUC 用）---
def logits_to_prob_pos(logits: np.ndarray) -> np.ndarray:
    x = np.array(logits)
    if x.ndim == 2 and x.shape[1] == 2:
        x = x - x.max(axis=1, keepdims=True)
        exp = np.exp(x); probs = exp / exp.sum(axis=1, keepdims=True)
        return probs[:, 1]
    else:
        z = x.squeeze()
        return 1.0 / (1.0 + np.exp(-z))

# --- HF Dataset & 分词 ---
def make_ds(texts, labels): return Dataset.from_dict({"text": list(texts), "label": list(labels)})
def tokenize_fn_builder(tok, max_len: int):
    def _encode(batch):
        return tok(batch["text"], truncation=True, padding="max_length", max_length=max_len)
    return _encode

# --- 指标函数（Trainer 会读到 eval_* 指标，供早停/最优模型判定）---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple): logits = logits[0]
    preds    = logits.argmax(-1)
    prob_pos = logits_to_prob_pos(logits)
    return {
        "accuracy":  accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall":    recall_score(labels, preds, zero_division=0),
        "f1":        f1_score(labels, preds, zero_division=0),
        "auc":       roc_auc_score(labels, prob_pos),
    }

# --- 自定义 Trainer：类权重 + label smoothing（更稳，提高 Recall/F1 更常见）---
class WeightedTrainer(Trainer):
    def __init__(self, class_weights: torch.Tensor | None = None, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs["labels"]
        forward_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs = model(**forward_inputs)
        logits  = outputs.logits.float()  # 用 float32 计算 CE 更稳
        if self.class_weights is not None:
            cw = self.class_weights.to(logits.device, dtype=torch.float32)
            loss_fct = nn.CrossEntropyLoss(weight=cw, label_smoothing=0.05)
        else:
            loss_fct = nn.CrossEntropyLoss(label_smoothing=0.05)
        loss = loss_fct(logits, labels.long())
        return (loss, outputs) if return_outputs else loss

# ========================= 可配部分 =========================
NUM_OF_SPLITS = 10
Reweight      = True

# 通用训练参数（含早停）
BASE_ARGS = dict(
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    logging_steps=20,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    report_to=[],
    dataloader_num_workers=0,
    dataloader_pin_memory=True,
    dataloader_persistent_workers=False,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_total_limit=1,
)
BASE_ARGS[EVAL_KEY] = "epoch"  # 每个 epoch 验证一次，用于早停

# 选择要对比的模型（可自行增删）
CLASSIFIERS = [
    ["DeBERTaV3-base", "microsoft/deberta-v3-base", 512],
    ["MPNet", "microsoft/mpnet-base", 512],
]
# ===========================================================

RESULTS, Classified_Values = [], []
supports_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
dtype_for_model = torch.bfloat16 if supports_bf16 else torch.float16

# ===== 放在循环外（比如 RESULTS, Classified_Values 定义附近）=====
CONF_MODE = "perclass"   # "counts" / "perclass" / "overall"
ROUND_CM  = 3            # 小数保留位数


from transformers import Trainer  # 用来检查 __init__ 的签名

for name, model_name, max_len in tqdm(CLASSIFIERS, desc="Evaluating Classifiers (E)", leave=True):
    best_fold_metric = -1.0
    best_fold_idx = None
    best_fold_model_dir = None
    best_fold_val_metric = None

    y_true_all, y_pred_all, prob_pos_all, id_all = [], [], [], []

    # 根据不同模型调显存/吞吐
    args_this = BASE_ARGS.copy()
    if supports_bf16:
        args_this["bf16"] = True;  args_this["fp16"] = False
    else:
        args_this["bf16"] = False; args_this["fp16"] = True

    lower = name.lower()
    if "modernbert" in lower:
        # 长上下文最省显存配置（将有效 batch≈32）
        args_this.update({
            "per_device_train_batch_size": 1,
            "per_device_eval_batch_size": 2,
            "gradient_accumulation_steps": 32,
            "gradient_checkpointing": True,
            "optim": "adamw_torch_fused",
            "remove_unused_columns": False,   # 避免丢 inputs
        })
    else:
        # 常规模型可稍微放开
        args_this.update({
            "gradient_accumulation_steps": 1,
            "optim": "adamw_torch_fused",
            "remove_unused_columns": False,
        })

    kf = StratifiedKFold(n_splits=NUM_OF_SPLITS, shuffle=True, random_state=1)
    for fold_idx, (train_i, test_i) in enumerate(
        tqdm(kf.split(Abstract_Text, LABELS_1D), desc=f"{name} | Cross-Validating", leave=False, total=NUM_OF_SPLITS), 1
    ):
        gc.collect(); torch.cuda.empty_cache()

        # 分层切分：K 折外层 + 10% 验证集（用于早停/挑最优）
        X, Y = np.array(Abstract_Text), LABELS_1D
        X_tr_full, y_tr_full = X[train_i].tolist(), Y[train_i].tolist()
        test_X, test_y       = X[test_i].tolist(),  Y[test_i].tolist()

        from sklearn.model_selection import train_test_split
        train_X, val_X, train_y, val_y = train_test_split(
            X_tr_full, y_tr_full, test_size=0.10, stratify=y_tr_full, random_state=42
        )

        tok    = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        encode = tokenize_fn_builder(tok, max_len=max_len)
        ds_train = make_ds(train_X, train_y).map(encode, batched=True, remove_columns=["text"])
        ds_val   = make_ds(val_X,   val_y  ).map(encode, batched=True, remove_columns=["text"])
        ds_eval  = make_ds(test_X,  test_y ).map(encode, batched=True, remove_columns=["text"])

        cols = ["input_ids", "attention_mask", "label"]
        if "token_type_ids" in ds_train.column_names: cols.append("token_type_ids")
        ds_train = ds_train.with_format("torch", columns=cols)
        ds_val   = ds_val.with_format("torch",   columns=cols)
        ds_eval  = ds_eval.with_format("torch",  columns=cols)

        # 类权重（不平衡更稳）
        class_weights = None
        if Reweight:
            cw = compute_class_weight(class_weight="balanced", classes=np.array([0,1]), y=train_y)
            class_weights = torch.tensor(cw, dtype=torch.float)

        out_dir = MODEL_DIR_FMT.format(model=name, fold=fold_idx)
        os.makedirs(out_dir, exist_ok=True)

        # 模型（训练精度与 dtype 对齐）
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2, torch_dtype=dtype_for_model
        )
        model.config.use_cache = False  # 训练禁用 KV cache，省显存

        args = TrainingArguments(output_dir=out_dir, **args_this)

        # 兼容新老版本：优先传 processing_class，其次 tokenizer
        trainer_kwargs = dict(
            model=model, args=args,
            train_dataset=ds_train, eval_dataset=ds_val,
            compute_metrics=compute_metrics,
            class_weights=class_weights,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
        )
        if "processing_class" in inspect.signature(Trainer.__init__).parameters:
            trainer_kwargs["processing_class"] = tok
        else:
            trainer_kwargs["tokenizer"] = tok

        trainer = WeightedTrainer(**trainer_kwargs)
        trainer.train()

        # 折外测试集预测
        pred_out = trainer.predict(ds_eval)
        logits   = pred_out.predictions
        preds    = np.argmax(logits, axis=1).astype(int)
        prob_pos = logits_to_prob_pos(logits)

        # —— 计算该折测试集指标 ——（选择标准默认用 F1）
        fold_acc = accuracy_score(test_y, preds)
        fold_f1  = f1_score(test_y, preds, zero_division=0)
        fold_auc = roc_auc_score(test_y, prob_pos)
        val_best = float(trainer.state.best_metric) if trainer.state.best_metric is not None else None  # 验证集最优 F1

        # —— 把“当折的最优权重”另存为 fold_k/best_model/（更直观）——
        best_dir_this_fold = os.path.join(out_dir, "best_model")
        trainer.save_model(best_dir_this_fold)   # 权重+config
        tok.save_pretrained(best_dir_this_fold)  # 分词器

        # 记录当折说明（可选）
        with open(os.path.join(out_dir, "BEST_INFO.json"), "w", encoding="utf-8") as f:
            json.dump({
                "select_metric": "F1_test",
                "test_metrics": {"accuracy": float(fold_acc), "f1": float(fold_f1), "auc": float(fold_auc)},
                "val_best_metric_eval_f1": val_best,
                "best_model_checkpoint": trainer.state.best_model_checkpoint
            }, f, ensure_ascii=False, indent=2)

        # —— 用“测试集 F1”决定该模型当前最佳折 ——（若想改为 Accuracy，就把 fold_f1 换成 fold_acc）
        if fold_f1 > best_fold_metric:
            best_fold_metric = float(fold_f1)
            best_fold_idx = fold_idx
            best_fold_model_dir = best_dir_this_fold   # 指向该折的 best_model/
            best_fold_val_metric = val_best

        id_all.extend(IDs[test_i].tolist())
        y_true_all.extend(test_y)
        y_pred_all.extend(list(preds))
        prob_pos_all.extend(list(prob_pos))

        del trainer, model, tok
        gc.collect(); torch.cuda.empty_cache()

    # ===== 汇总（列顺序严格对齐 Notebook D）=====
    y_true_all = np.asarray(y_true_all)
    y_pred_all = np.asarray(y_pred_all)

    tn, fp, fn, tp = confusion_matrix(y_true_all, y_pred_all).ravel()

    if CONF_MODE == "counts":
        tp_v, fn_v, fp_v, tn_v = tp, fn, fp, tn
    elif CONF_MODE == "overall":
        total = tp + tn + fp + fn
        tp_v = tp / total if total else 0.0
        fn_v = fn / total if total else 0.0
        fp_v = fp / total if total else 0.0
        tn_v = tn / total if total else 0.0
    else:  # "perclass" —— 推荐：与常见指标一致
        P = tp + fn  # 实际正样本总数
        N = tn + fp  # 实际负样本总数
        tp_v = tp / P if P else 0.0   # TPR / Recall
        fn_v = fn / P if P else 0.0   # FNR
        fp_v = fp / N if N else 0.0   # FPR
        tn_v = tn / N if N else 0.0   # TNR

    Share     = float(np.round(y_pred_all.mean(), 3))
    Accuracy  = accuracy_score(y_true_all, y_pred_all)
    ROC       = roc_auc_score(y_true_all, prob_pos_all)
    Precision = precision_score(y_true_all, y_pred_all, zero_division=0)
    Recall    = recall_score(y_true_all, y_pred_all, zero_division=0)
    F1        = f1_score(y_true_all, y_pred_all, zero_division=0)

    RESULTS.append([
        name, Share,
        round(tp_v, ROUND_CM), round(fn_v, ROUND_CM),
        round(fp_v, ROUND_CM), round(tn_v, ROUND_CM),
        round(Accuracy, 3), round(ROC, 3),
        round(Precision, 3), round(Recall, 3), round(F1, 3)
    ])

    Classified_Values.append(list(zip(len(id_all)*[name], id_all, y_true_all.tolist(), y_pred_all.tolist())))

    # —— 汇总：把该模型“最好结果”的权重复制到 ./Output/Best Models/{name}/ ——
    if best_fold_model_dir is not None:
        dest = os.path.join(BEST_ROOT, name)  # 例如 ./Output/Best Models/DeBERTaV3-base
        if os.path.exists(dest):
            shutil.rmtree(dest)
        shutil.copytree(best_fold_model_dir, dest)

        # 写一份摘要
        with open(os.path.join(dest, "BEST_OF_MODEL.json"), "w", encoding="utf-8") as f:
            json.dump({
                "model_name": name,
                "chosen_fold": best_fold_idx,
                "select_metric": "F1_test",
                "best_fold_test_f1": best_fold_metric,
                "best_fold_val_f1": best_fold_val_metric,
                "source_dir": best_fold_model_dir
            }, f, ensure_ascii=False, indent=2)

        print(f"[{name}] Best fold = {best_fold_idx} (F1_test={best_fold_metric:.4f}). Copied to: {dest}")
    else:
        print(f"[{name}] WARNING: no best fold directory recorded.")


# ===== 写出两张表（与 Notebook D 完全一致）=====
RESULTS_TABLE = pd.DataFrame(
    RESULTS,
    columns=["Name","Share","True-Positives","False-Negatives","False-Positives","True-Negatives",
             "Accuracy","AUC","Precision","Recall","F1"]
)
RESULTS_TABLE["Type"] = "Transformer"
RESULTS_TABLE = RESULTS_TABLE[["Name","Type","Share","True-Positives","False-Negatives",
                               "False-Positives","True-Negatives","Accuracy","AUC","Precision","Recall","F1"]]

# 保存（和 D 一样不加 index；编码 utf-8-sig）
RESULTS_TABLE.sort_values("Accuracy", ascending=False).to_csv(RESULTS_CSV, index=False, encoding="utf-8-sig")

# 逐样本横表：第一列 id、第二列 Actual，后续每个模型一列 Predicted
Final = None
for bundle in Classified_Values:
    Temp = pd.DataFrame(bundle, columns=["Model","id","Actual","Predicted"])
    if Final is None:
        mname = Temp.head(1)["Model"].iloc[0]
        Final = Temp[["id","Actual","Predicted"]].rename(columns={"Predicted": mname})
    else:
        mname = Temp.head(1)["Model"].iloc[0]
        Final = Final.merge(Temp[["id","Predicted"]].rename(columns={"Predicted": mname}), on="id", how="outer")

Final.to_csv(CLASSIFIED_VALUES_CSV, index=False, encoding="utf-8-sig")

# 与 D 相同：展示一份按 Accuracy 排序的总表
RESULTS_TABLE.sort_values("Accuracy", ascending=False)
print("Saved:", os.path.abspath(RESULTS_CSV))
print("Saved:", os.path.abspath(CLASSIFIED_VALUES_CSV))

WD: /content/drive/MyDrive/USPTO_data   rows: 4000


Evaluating Classifiers (E):   0%|          | 0/2 [00:00<?, ?it/s]

DeBERTaV3-base | Cross-Validating:   0%|          | 0/10 [00:00<?, ?it/s]



Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.559,0.418473,0.922222,0.833333,0.763889,0.797101,0.968051
2,0.3351,0.295583,0.947222,0.819277,0.944444,0.877419,0.984761
3,0.2851,0.306161,0.933333,0.766667,0.958333,0.851852,0.98534
4,0.2605,0.299386,0.938889,0.790698,0.944444,0.860759,0.986859




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5033,0.70057,0.872222,1.0,0.361111,0.530612,0.915123
2,0.3131,0.310834,0.952778,0.887324,0.875,0.881119,0.988281
3,0.2978,0.2824,0.955556,0.841463,0.958333,0.896104,0.990283
4,0.2652,0.284976,0.958333,0.87013,0.930556,0.899329,0.990837
5,0.2219,0.290462,0.958333,0.87013,0.930556,0.899329,0.990693
6,0.2414,0.301585,0.958333,0.88,0.916667,0.897959,0.990693




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.4939,0.458746,0.836111,0.554622,0.916667,0.691099,0.952836
2,0.3241,0.330393,0.938889,0.828947,0.875,0.851351,0.972102
3,0.2997,0.344034,0.933333,0.785714,0.916667,0.846154,0.973356
4,0.2455,0.368309,0.944444,0.871429,0.847222,0.859155,0.971089
5,0.2252,0.373233,0.941667,0.859155,0.847222,0.853147,0.972319
6,0.2082,0.376803,0.941667,0.84,0.875,0.857143,0.97215




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5717,0.381257,0.858333,0.591304,0.944444,0.727273,0.962722
2,0.2946,0.30664,0.95,0.875,0.875,0.875,0.982398
3,0.2642,0.311044,0.95,0.846154,0.916667,0.88,0.984086
4,0.2596,0.32115,0.947222,0.853333,0.888889,0.870748,0.982133
5,0.2599,0.323143,0.955556,0.888889,0.888889,0.888889,0.974778
6,0.2304,0.325247,0.961111,0.926471,0.875,0.9,0.972415
7,0.2308,0.326288,0.955556,0.888889,0.888889,0.888889,0.972415
8,0.2638,0.323066,0.958333,0.901408,0.888889,0.895105,0.971113




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5544,0.353168,0.888889,0.650943,0.958333,0.775281,0.963228
2,0.3019,0.275294,0.969444,0.906667,0.944444,0.92517,0.989993
3,0.2947,0.270101,0.963889,0.883117,0.944444,0.912752,0.991922
4,0.2769,0.268545,0.975,0.931507,0.944444,0.937931,0.991102
5,0.2323,0.269892,0.975,0.931507,0.944444,0.937931,0.992597
6,0.2299,0.267566,0.977778,0.944444,0.944444,0.944444,0.993152
7,0.2277,0.266305,0.977778,0.944444,0.944444,0.944444,0.993514
8,0.2518,0.268862,0.972222,0.918919,0.944444,0.931507,0.993273




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.516,0.410639,0.922222,0.782051,0.847222,0.813333,0.957248
2,0.2831,0.314957,0.961111,0.902778,0.902778,0.902778,0.985147
3,0.3039,0.284393,0.958333,0.860759,0.944444,0.900662,0.987606
4,0.2757,0.293612,0.958333,0.88,0.916667,0.897959,0.98763




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5606,0.446701,0.819444,0.528,0.916667,0.670051,0.947049
2,0.3038,0.354443,0.927778,0.802632,0.847222,0.824324,0.973259
3,0.306,0.344167,0.922222,0.755814,0.902778,0.822785,0.974995
4,0.2759,0.360876,0.936111,0.826667,0.861111,0.843537,0.978998
5,0.2202,0.364986,0.933333,0.815789,0.861111,0.837838,0.981988
6,0.2253,0.359227,0.930556,0.790123,0.888889,0.836601,0.981891




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5307,0.485008,0.897222,0.753623,0.722222,0.737589,0.91657
2,0.3538,0.320398,0.930556,0.752688,0.972222,0.848485,0.981578
3,0.2799,0.344783,0.933333,0.792683,0.902778,0.844156,0.978926
4,0.2639,0.360296,0.930556,0.764045,0.944444,0.84472,0.979552




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5377,0.416761,0.852778,0.577236,0.986111,0.728205,0.962939
2,0.3193,0.298943,0.958333,0.88,0.916667,0.897959,0.991271
3,0.2889,0.390326,0.888889,0.645455,0.986111,0.78022,0.991247
4,0.2508,0.293819,0.941667,0.793103,0.958333,0.867925,0.99267




Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5047,0.422312,0.919444,0.786667,0.819444,0.802721,0.94307
2,0.3403,0.332999,0.947222,0.853333,0.888889,0.870748,0.972198
3,0.3211,0.34552,0.919444,0.736264,0.930556,0.822086,0.972246
4,0.2868,0.34451,0.938889,0.820513,0.888889,0.853333,0.974175


[DeBERTaV3-base] Best fold = 7 (F1_test=0.9157). Copied to: ./Output/Best Models/DeBERTaV3-base


MPNet | Cross-Validating:   0%|          | 0/10 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/532M [00:00<?, ?B/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5218,0.40682,0.875,0.628571,0.916667,0.745763,0.970727
2,0.3709,0.314301,0.947222,0.82716,0.930556,0.875817,0.987413
3,0.3363,0.397087,0.841667,0.56,0.972222,0.71066,0.969956
4,0.3133,0.306772,0.941667,0.8,0.944444,0.866242,0.987534


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.4912,0.431248,0.886111,0.663158,0.875,0.754491,0.954234
2,0.3698,0.313223,0.95,0.8375,0.930556,0.881579,0.988595
3,0.333,0.328747,0.916667,0.714286,0.972222,0.823529,0.987052
4,0.2936,0.310368,0.936111,0.788235,0.930556,0.853503,0.98845


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.4926,0.431857,0.852778,0.584071,0.916667,0.713514,0.954958
2,0.3774,0.38967,0.861111,0.596491,0.944444,0.731183,0.972753
3,0.3597,0.344937,0.891667,0.660194,0.944444,0.777143,0.981216
4,0.2865,0.334455,0.947222,0.853333,0.888889,0.870748,0.977648
5,0.2591,0.33165,0.944444,0.825,0.916667,0.868421,0.97825
6,0.2674,0.325767,0.944444,0.825,0.916667,0.868421,0.979818


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5139,0.430756,0.827778,0.539062,0.958333,0.69,0.952426
2,0.3575,0.363276,0.891667,0.663366,0.930556,0.774566,0.975839
3,0.3191,0.372572,0.875,0.628571,0.916667,0.745763,0.977165
4,0.3328,0.345018,0.938889,0.828947,0.875,0.851351,0.978154
5,0.2828,0.349835,0.922222,0.75,0.916667,0.825,0.975911
6,0.2508,0.362854,0.938889,0.837838,0.861111,0.849315,0.975863


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5107,0.445318,0.791667,0.48951,0.972222,0.651163,0.960455
2,0.3504,0.337995,0.911111,0.712766,0.930556,0.807229,0.982711
3,0.3604,0.316522,0.936111,0.788235,0.930556,0.853503,0.984447
4,0.3047,0.298421,0.958333,0.87013,0.930556,0.899329,0.987678
5,0.2714,0.292223,0.963889,0.893333,0.930556,0.911565,0.987172
6,0.2705,0.307021,0.930556,0.770115,0.930556,0.842767,0.986931
7,0.2516,0.291303,0.966667,0.916667,0.916667,0.916667,0.987534
8,0.2951,0.298393,0.955556,0.858974,0.930556,0.893333,0.986521
9,0.2391,0.296753,0.955556,0.858974,0.930556,0.893333,0.98669


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5009,0.413543,0.911111,0.722222,0.902778,0.802469,0.954692
2,0.3463,0.334101,0.95,0.864865,0.888889,0.876712,0.977937
3,0.3853,0.360357,0.891667,0.660194,0.944444,0.777143,0.97825
4,0.3142,0.338053,0.913889,0.715789,0.944444,0.814371,0.980903


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5122,0.444336,0.838889,0.559322,0.916667,0.694737,0.942732
2,0.3619,0.373865,0.922222,0.755814,0.902778,0.822785,0.968485
3,0.355,0.350475,0.930556,0.783133,0.902778,0.83871,0.97227
4,0.3124,0.345721,0.927778,0.767442,0.916667,0.835443,0.973259
5,0.2788,0.365403,0.936111,0.810127,0.888889,0.847682,0.973548
6,0.2366,0.358981,0.925,0.764706,0.902778,0.828025,0.972102
7,0.2583,0.35757,0.922222,0.744444,0.930556,0.82716,0.971668


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.4943,0.451723,0.797222,0.496296,0.930556,0.647343,0.941816
2,0.3459,0.347949,0.922222,0.744444,0.930556,0.82716,0.976321
3,0.3315,0.347618,0.913889,0.72043,0.930556,0.812121,0.978684
4,0.2886,0.358993,0.916667,0.728261,0.930556,0.817073,0.979432


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.5187,0.420092,0.902778,0.690722,0.930556,0.792899,0.954162
2,0.3387,0.340581,0.952778,0.866667,0.902778,0.884354,0.984616
3,0.3158,0.331618,0.916667,0.723404,0.944444,0.819277,0.984616
4,0.3042,0.312945,0.925,0.747253,0.944444,0.834356,0.987003


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.4901,0.474929,0.802778,0.503817,0.916667,0.650246,0.925974
2,0.3915,0.383551,0.936111,0.84507,0.833333,0.839161,0.964506
3,0.328,0.376783,0.905556,0.706522,0.902778,0.792683,0.967183
4,0.3344,0.366245,0.936111,0.818182,0.875,0.845638,0.970583
5,0.2682,0.376683,0.933333,0.833333,0.833333,0.833333,0.972102
6,0.2587,0.376797,0.911111,0.727273,0.888889,0.8,0.973428


[MPNet] Best fold = 7 (F1_test=0.9112). Copied to: ./Output/Best Models/MPNet
Saved: /content/drive/MyDrive/USPTO_data/Output/Model Performance/New Transformer Classification Model Performance.csv
Saved: /content/drive/MyDrive/USPTO_data/Output/Classification Output/New Transformer Classification Results.csv
