
# DeBERTa v3 — Phase 3 Fine-tune (CropCare)

This notebook fine-tunes a DeBERTa v3 text classifier on your farmer-style descriptions, using the datasets you copied into:

```
/home/myid/bp67339/plant_disease/data/
  - train_clean_step1.jsonl
  - train_phase3_augmented.jsonl
  - train_phase3_pairs.jsonl
  - val_clean_step1.jsonl   (optional; if missing we do a 90/10 split)
```

**Outputs** are saved to:

```
/home/myid/bp67339/plant_disease/models/deberta_v3_base_textclf_phase3/
```
and include `model.safetensors`, tokenizer files, and `labels.json` so your FastAPI app can load it.


In [1]:
import sys, inspect
import accelerate, transformers
from accelerate import Accelerator

# Environment report: library versions, where accelerate was imported from,
# the unwrap_model signature, and the head of the import search path.
env_report = [
    ("transformers:", transformers.__version__),
    ("accelerate  :", accelerate.__version__),
    ("accelerate file:", accelerate.__file__),
    ("unwrap_model signature:", inspect.signature(Accelerator.unwrap_model)),
    ("sys.path[0:5]:", sys.path[:5]),
]
for tag, value in env_report:
    print(tag, value)

  from .autonotebook import tqdm as notebook_tqdm


transformers: 4.57.1
accelerate  : 1.0.1
accelerate file: /home/myid/bp67339/trainenv/lib/python3.10/site-packages/accelerate/__init__.py
unwrap_model signature: (self, model, keep_fp32_wrapper: 'bool' = True)
sys.path[0:5]: ['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/myid/bp67339/trainenv/lib/python3.10/site-packages']


In [2]:
import inspect
from accelerate import Accelerator
# Look up the accelerate version and the unwrap_model signature first,
# then print them.
accelerate_version = __import__("accelerate").__version__
unwrap_signature = inspect.signature(Accelerator.unwrap_model)
print("accelerate:", accelerate_version)
print(unwrap_signature)

accelerate: 1.0.1
(self, model, keep_fp32_wrapper: 'bool' = True)


In [3]:
import torch, numpy as np, pandas as pd
import transformers, datasets, accelerate, evaluate, sklearn

# GPU availability (queried once) followed by the version of every library used below.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else None
print("CUDA:", cuda_ok, gpu_name)
print("torch:", torch.__version__)
print("numpy:", np.__version__, "pandas:", pd.__version__)
for tag, lib in (
    ("transformers:", transformers),
    ("datasets:", datasets),
    ("accelerate:", accelerate),
    ("evaluate:", evaluate),
    ("sklearn:", sklearn),
):
    print(tag, lib.__version__)

CUDA: True NVIDIA GeForce RTX 3090
torch: 2.2.2+cu118
numpy: 1.26.4 pandas: 2.2.2
transformers: 4.57.1
datasets: 2.19.1
accelerate: 1.0.1
evaluate: 0.4.2
sklearn: 1.5.2



## 1) Imports, paths, and config


In [4]:

from pathlib import Path
import os, json, random, numpy as np, torch

from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score

# Paths (anchored to your plant_disease folder).
# The project home defaults to the original location; set the CROPCARE_HOME
# environment variable before starting the kernel to run the notebook from a
# different machine or account without editing this cell.
HOME     = Path(os.environ.get("CROPCARE_HOME", "/home/myid/bp67339"))
ROOT     = HOME / "plant_disease"
DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "models" / "deberta_v3_base_textclf_phase3"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Prefer local checkpoints (checked in this order); fall back to the HF base
# model name if none of the directories is present.
LOCAL_BASES = [
    ROOT / "models" / "deberta_v3_base_textclf_phase2",
    ROOT / "models" / "deberta_v3_base_textclf",
]
HF_BASE = "microsoft/deberta-v3-base"

# Training data files (use whichever exist)
TRAIN_FILES = [
    DATA_DIR / "train_clean_step1.jsonl",
    DATA_DIR / "train_phase3_augmented.jsonl",
    DATA_DIR / "train_phase3_pairs.jsonl",
]
# Optional validation file; when it is missing a 10% split of train is used.
VAL_FILE = DATA_DIR / "val_clean_step1.jsonl"

# Hyperparams
MAX_LEN   = 256    # max tokens per example (longer inputs are truncated)
SEED      = 42     # RNG seed
LR        = 2e-5   # learning rate
EPOCHS    = 3      # number of training epochs
BATCH_T   = 16     # per-device train batch size
BATCH_E   = 32     # per-device eval batch size
LOG_STEPS = 50     # log every N optimizer steps

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cuda



## 2) Seed and helpers


In [5]:

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and torch random number generators.

    Args:
        seed: value handed to every generator (default 42).

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

def load_jsonl(paths):
    """Load the JSONL files in ``paths`` that exist into one dataset.

    Args:
        paths: iterable of file paths (str or Path); entries that are not
            existing files are skipped.

    Returns:
        The "data" split produced by ``load_dataset("json", ...)`` over the
        existing files, or ``None`` when none of the paths exists.
    """
    existing = []
    for candidate in paths:
        if Path(candidate).is_file():
            existing.append(str(candidate))
    if len(existing) == 0:
        return None
    loaded = load_dataset("json", data_files={"data": existing})
    return loaded["data"]

def pick_base_checkpoint():
    """Choose the checkpoint to start fine-tuning from.

    Returns:
        ``(checkpoint, local_only)``: the first entry of ``LOCAL_BASES`` that
        is an existing directory (as a string) with ``True``, otherwise
        ``HF_BASE`` with ``False``.
    """
    local_dir = next((cand for cand in LOCAL_BASES if cand.is_dir()), None)
    if local_dir is None:
        return HF_BASE, False
    return str(local_dir), True  # local_only

# Seed the Python, NumPy and torch RNGs with the configured SEED.
set_seed(SEED)



## 3) Load datasets
Each JSONL row should look like this:
```json
{"text": "brown circular spots...", "label": "Cercospora Leaf Spot"}
```


In [6]:

# Collect every training file that exists, reporting each path as used or missing.
train_parts = []
for train_path in TRAIN_FILES:
    part = load_jsonl([train_path])
    if part is None:
        print(f"[data] missing: {train_path}")
        continue
    train_parts.append(part)
    print(f"[data] using: {train_path}")

if len(train_parts) == 0:
    raise SystemExit("No training files found under plant_disease/data/")

# A single part is used as-is; several parts are concatenated.
if len(train_parts) == 1:
    ds_train = train_parts[0]
else:
    ds_train = concatenate_datasets(train_parts)
ds_eval = load_jsonl([VAL_FILE])

# Without an explicit validation file, hold out 10% of train (seeded split).
if ds_eval is None:
    print("[data] no explicit val file; creating 10% split from train")
    holdout = ds_train.train_test_split(test_size=0.10, seed=SEED)
    ds_train = holdout["train"]
    ds_eval = holdout["test"]

# Both splits must expose the 'text' and 'label' columns.
for name, ds in (("train", ds_train), ("eval", ds_eval)):
    assert {"text", "label"} <= set(ds.column_names), f"{name} needs 'text' and 'label'"
print("Sizes:", len(ds_train), len(ds_eval))


[data] using: /home/myid/bp67339/plant_disease/data/train_clean_step1.jsonl
[data] using: /home/myid/bp67339/plant_disease/data/train_phase3_augmented.jsonl
[data] using: /home/myid/bp67339/plant_disease/data/train_phase3_pairs.jsonl
Sizes: 140843 7675



## 4) Labels and tokenization


In [7]:
# --- Normalize text into a single column ---------------------------------------
import math
from transformers import AutoTokenizer

def _as_str(x):
    if x is None: return ""
    if isinstance(x, float) and math.isnan(x): return ""
    if isinstance(x, (int, float, bool)): return str(x)
    if isinstance(x, list): return " ".join(str(t) for t in x if t is not None)
    if isinstance(x, dict):
        for k in ("text", "variant", "orig", "clean", "message", "description"):
            if k in x and x[k] not in (None, float("nan")):
                return _as_str(x[k])
        return ""
    return str(x)

def normalize_text_row(ex):
    """Build the ``text_norm`` column for one row.

    Tries the "text", "variant" and "orig" fields in that order and uses the
    first one that contains text after conversion and whitespace collapsing.

    Args:
        ex: a dataset row (mapping of column name to value).

    Returns:
        ``{"text_norm": <single-spaced string>}``; the string is "" when no
        field yields text.
    """
    for key in ("text", "variant", "orig"):
        raw = ex.get(key)
        if not raw:
            continue  # missing / None / empty values are skipped
        # Collapse all whitespace runs. A whitespace-only or NaN value is
        # truthy but normalizes to "", so keep trying the remaining fields
        # instead of giving up on the row.
        collapsed = " ".join(_as_str(raw).split())
        if collapsed:
            return {"text_norm": collapsed}
    return {"text_norm": ""}

# Add the normalized text column to both splits.
ds_train = ds_train.map(normalize_text_row, desc="normalize train text")
ds_eval = ds_eval.map(normalize_text_row, desc="normalize eval text")


def _has_text(row):
    """Return True when the row's normalized text is non-empty."""
    return len(row["text_norm"]) > 0


# Rows whose normalized text came out empty are discarded.
ds_train = ds_train.filter(_has_text, desc="drop empty train")
ds_eval = ds_eval.filter(_has_text, desc="drop empty eval")

# --- Class list comes from TRAIN ONLY, sorted so the ids are stable ------------
labels = sorted(set(ds_train["label"]))
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

def map_labels(ex):
    """Replace a row's label name with its integer id from ``label2id``.

    Raises:
        KeyError: if the row's label is not a key of ``label2id``.
    """
    label_id = label2id[ex["label"]]
    return {"label": label_id}

ds_train = ds_train.map(map_labels, desc="map train labels")

# Keep only eval rows whose labels exist in train (guard against drift).
# Dropping rows changes what the eval metrics measure, so report how many
# were removed instead of discarding them silently.
n_eval_before = len(ds_eval)
ds_eval  = ds_eval.filter(lambda ex: ex["label"] in label2id, desc="keep eval known labels")
n_eval_dropped = n_eval_before - len(ds_eval)
if n_eval_dropped:
    print(f"[data] WARNING: dropped {n_eval_dropped} eval rows whose label is not present in train")
ds_eval  = ds_eval.map(map_labels, desc="map eval labels")

# --- Tokenizer -----------------------------------------------------------------
# Load from the chosen base checkpoint; local_only=True makes from_pretrained
# read from disk only.
base_ckpt, local_only = pick_base_checkpoint()
print(f"[model] base: {base_ckpt} (local_only={local_only})")
tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=False, local_files_only=local_only)

def tok_fn(batch):
    """Tokenize a batch's ``text_norm`` values, truncating to ``MAX_LEN`` tokens."""
    texts = batch["text_norm"]
    return tok(texts, truncation=True, max_length=MAX_LEN)

# Drop every raw column except the label and the normalized text. text_norm is
# not in the removal list, so it stays next to the new token fields.
keep_cols = ("label", "text_norm")
cols_tr = [col for col in ds_train.column_names if col not in keep_cols]
cols_ev = [col for col in ds_eval.column_names if col not in keep_cols]

ds_train_tok = ds_train.map(
    tok_fn,
    batched=True,
    remove_columns=cols_tr,
    desc="tokenize train",
)
ds_eval_tok = ds_eval.map(
    tok_fn,
    batched=True,
    remove_columns=cols_ev,
    desc="tokenize eval",
)

# --- Sanity checks: both splits must now carry input_ids ------------------------
assert "input_ids" in ds_train_tok.features, "tokenization failed (train)"
assert "input_ids" in ds_eval_tok.features,  "tokenization failed (eval)"
tokenized_fields = list(ds_train_tok.features.keys())
print("✅ tokenized fields:", tokenized_fields)
print("sizes:", len(ds_train_tok), len(ds_eval_tok), "num_labels:", len(labels))

[model] base: /home/myid/bp67339/plant_disease/models/deberta_v3_base_textclf_phase2 (local_only=True)
✅ tokenized fields: ['label', 'text_norm', 'input_ids', 'token_type_ids', 'attention_mask']
sizes: 140843 7675 num_labels: 6


In [8]:
import inspect, types
from accelerate import Accelerator
import accelerate

print("accelerate:", accelerate.__version__)
print("unwrap_model signature:", inspect.signature(Accelerator.unwrap_model))

# Compatibility shim: newer transformers may call
# Accelerator.unwrap_model(..., keep_torch_compile=...), which this accelerate
# version does not accept. Patch only if unwrap_model doesn't take **kwargs.
sig = inspect.signature(Accelerator.unwrap_model)
has_var_kw = any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values())
if not has_var_kw:
    _orig_unwrap = Accelerator.unwrap_model
    # Keyword names the original method really understands (e.g. keep_fp32_wrapper).
    _supported_kw = {
        name
        for name, p in sig.parameters.items()
        if p.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
    } - {"self", "model"}

    def _unwrap_compat(self, model, *args, **kwargs):
        """Call the original unwrap_model, dropping only the kwargs it does not know."""
        # Supported keywords must still reach the original; only unknown ones
        # (e.g. keep_torch_compile from transformers) are discarded.
        kept = {k: v for k, v in kwargs.items() if k in _supported_kw}
        return _orig_unwrap(self, model, *args, **kept)

    # Assign the plain function so Python binds it to each Accelerator
    # *instance*. Wrapping it in types.MethodType(..., Accelerator) would pass
    # the class itself as `self` on every call.
    Accelerator.unwrap_model = _unwrap_compat
    print("✅ Patched accelerate.Accelerator.unwrap_model to ignore extra kwargs.")
else:
    print("✅ unwrap_model already accepts **kwargs; no patch needed.")

accelerate: 1.0.1
unwrap_model signature: (self, model, keep_fp32_wrapper: 'bool' = True)
✅ Patched accelerate.Accelerator.unwrap_model to ignore extra kwargs.



## 5) Model, Trainer, and training


In [9]:
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import torch

# ---- model --------------------------------------------------------------
# Classification head sized to the label list; the id<->name maps are passed
# through to from_pretrained alongside the label count.
model_kwargs = dict(
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    torch_dtype="auto",
    local_files_only=local_only,
)
model = AutoModelForSequenceClassification.from_pretrained(base_ckpt, **model_kwargs)
model = model.to(device)

# ---- metrics ------------------------------------------------------------
def compute_metrics(p):
    """Compute accuracy and macro-F1 for one evaluation pass.

    Args:
        p: object exposing ``predictions`` (argmax over the last axis gives
            the predicted class ids) and ``label_ids``.

    Returns:
        dict with keys "accuracy" and "f1_macro".
    """
    y_pred = p.predictions.argmax(-1)
    y_true = p.label_ids
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
    }

# Pads each batch to its longest example at collate time.
collate = DataCollatorWithPadding(tokenizer=tok)

# ---- build args dict compatibly -----------------------------------------
# Some transformers versions name the field "evaluation_strategy", others
# "eval_strategy". Both are listed here and the unsupported one is filtered
# out below.
_cuda_ok = torch.cuda.is_available()
_ampere_plus = _cuda_ok and torch.cuda.get_device_capability()[0] >= 8
candidate_args = {
    "output_dir": str(OUT_DIR),
    "learning_rate": LR,
    "per_device_train_batch_size": BATCH_T,
    "per_device_eval_batch_size": BATCH_E,
    "num_train_epochs": EPOCHS,
    # TrainingArguments has its own `seed` default that is independent of
    # SEED; pass it explicitly so changing SEED in the config cell also
    # changes the seed the Trainer uses.
    "seed": SEED,
    # both names; we'll keep whichever your version supports
    "evaluation_strategy": "epoch",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "logging_steps": LOG_STEPS,
    # mixed precision (prefer bf16 on Ampere+; else fp16 if CUDA; else off)
    "bf16": _ampere_plus,
    "fp16": _cuda_ok and not _ampere_plus,
    "report_to": [],                 # disable wandb/etc
    "save_safetensors": True,
}

# keep only fields TrainingArguments actually supports
field_names = set(getattr(TrainingArguments, "__dataclass_fields__", {}).keys())
filtered_args = {k: v for k, v in candidate_args.items() if k in field_names}

# NOTE(review): if neither eval-strategy key were supported, periodic eval
# would be off while load_best_model_at_end=True is still set; that
# combination is likely rejected by TrainingArguments. Confirm before relying
# on it.
args = TrainingArguments(**filtered_args)

# ---- trainer ------------------------------------------------------------
import inspect

# Newer transformers deprecate Trainer(tokenizer=...) in favour of
# processing_class=... (the old name emits a FutureWarning). Use whichever
# keyword the installed version exposes.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate,
    train_dataset=ds_train_tok,
    eval_dataset=ds_eval_tok,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tok},
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

`torch_dtype` is deprecated! Use `dtype` instead!
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.0152,0.151142,0.97342,0.971455
2,0.0056,0.151098,0.978893,0.97717
3,0.0,0.115406,0.984235,0.982919




TrainOutput(global_step=13206, training_loss=0.020729483354480438, metrics={'train_runtime': 4510.5979, 'train_samples_per_second': 93.675, 'train_steps_per_second': 2.928, 'total_flos': 2.341359158497128e+16, 'train_loss': 0.020729483354480438, 'epoch': 3.0})


## 6) Save artifacts (model, tokenizer, labels.json)


In [10]:

# Tokenizer files first, then the model via the trainer.
tok.save_pretrained(OUT_DIR)
trainer.save_model(OUT_DIR)

# The class names (in id order) go into labels.json next to the model.
labels_path = OUT_DIR / "labels.json"
with labels_path.open("w") as fh:
    json.dump({"labels": labels}, fh, indent=2)

print("Saved to:", OUT_DIR)
saved_names = [entry.name for entry in OUT_DIR.iterdir()]
print(saved_names)


Saved to: /home/myid/bp67339/plant_disease/models/deberta_v3_base_textclf_phase3
['checkpoint-8804', 'tokenizer_config.json', 'labels.json', 'model.safetensors', 'checkpoint-4402', 'config.json', 'checkpoint-13206', 'spm.model', 'special_tokens_map.json', 'added_tokens.json', 'training_args.bin']



## 7) Sanity check: reload and one example inference


In [11]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

# Reload tokenizer and model strictly from the saved output directory.
tok2 = AutoTokenizer.from_pretrained(OUT_DIR, use_fast=False, local_files_only=True)
mdl2 = AutoModelForSequenceClassification.from_pretrained(OUT_DIR, local_files_only=True).to(device).eval()

# Read the class names back from the saved artifacts rather than the in-memory
# `labels` list, so this check covers what a consumer of OUT_DIR will load.
with open(OUT_DIR / "labels.json") as f:
    saved_labels = json.load(f)["labels"]
config_labels = [mdl2.config.id2label[i] for i in range(mdl2.config.num_labels)]
assert saved_labels == config_labels, "labels.json disagrees with id2label in the saved config"

sample = "brown circular spots with yellow halo"
enc = tok2(sample, return_tensors="pt", truncation=True, max_length=MAX_LEN).to(device)

# Inference only: no gradients needed.
with torch.no_grad():
    logits = mdl2(**enc).logits
probs = F.softmax(logits, dim=-1).squeeze(0).tolist()

top_i = int(np.argmax(probs))
print("pred:", saved_labels[top_i], "p=", round(probs[top_i], 3))


pred: Cercospora Leaf Spot p= 1.0
