In [1]:
#%pip install pandas
#%pip install torch
#%pip install datasets
#%pip install scikit-learn
#%pip install transformers
#%pip install accelerate
#%pip install bitsandbytes
#%pip install sentence_transformers
#%pip install protobuf 
#%pip install sentencepiece 
#%pip install peft


In [2]:
# train_and_infer_fixed.py
import json
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import GroupShuffleSplit
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    BitsAndBytesConfig
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import torch.nn as nn
import os

torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ----------------------
# 1. Load data (flatten)
# ----------------------
train_path = "data/train_v2.jsonl"
test_path = "data/test_v4.jsonl"

train_rows = [json.loads(line) for line in open(train_path, "r", encoding="utf-8")]
test_rows = [json.loads(line) for line in open(test_path, "r", encoding="utf-8")]

rows = []
for item in train_rows:
    text = item["text"].strip()
    acronym = item["acronym"].strip()
    for opt, is_correct in item["options"].items():
        rows.append({
            "text": text,
            "acronym": acronym,
            "option_text": opt.strip(),
            "label": int(is_correct)
        })

df = pd.DataFrame(rows)
df["group_id"] = (df["text"].str.strip() + "||" + df["acronym"].str.strip()).factorize()[0]
print("Total binary pairs:", len(df))
print("Positive count:", int(df["label"].sum()), "Negative count:", int((1 - df["label"]).sum()))

# ----------------------
# 2. Grouped split
# ----------------------
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(df, groups=df["group_id"]))
df_train = df.iloc[train_idx].reset_index(drop=True)
df_val   = df.iloc[val_idx].reset_index(drop=True)
print("Train size:", len(df_train), "Val size:", len(df_val))



Total binary pairs: 2177
Positive count: 433 Negative count: 1744
Train size: 1715 Val size: 462


In [4]:
# ----------------------
# 3. Tokenizer & preprocess (batched)
# ----------------------
model_name = "mistralai/Mistral-7B-Instruct-v0.3"   # try base first; swap to -large if GPU allows

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

def preprocess(batch):
    # batch is a dict of lists: batch["text"], batch["acronym"], batch["option_text"], batch["label"]
    inputs = [
        f"In the context: '{t}', what does the acronym '{a}' mean? Option: {o}"
        for t, a, o in zip(batch["text"], batch["acronym"], batch["option_text"])
    ]
    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    # tokenized is dict of lists: input_ids, attention_mask, token_type_ids (maybe)
    tokenized["labels"] = batch["label"]
    return tokenized

# Create HF datasets from pandas
ds_train = Dataset.from_pandas(df_train[["text", "acronym", "option_text", "label"]])
ds_val   = Dataset.from_pandas(df_val[["text", "acronym", "option_text", "label"]])

ds_train = ds_train.map(preprocess, batched=True, remove_columns=ds_train.column_names)
ds_val   = ds_val.map(preprocess, batched=True, remove_columns=ds_val.column_names)

# sanity-check: print a single example
print("Example tokenized train sample:")
print(ds_train[0])

# set format for trainer (Trainer will convert automatically but this is explicit)
ds_train.set_format(type="torch")
ds_val.set_format(type="torch")

# ----------------------
# 4. Model (CausalLM + classification head)
# ----------------------
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, PreTrainedModel

from transformers import AutoModelForCausalLM, PreTrainedModel

class CausalLMForClassification(PreTrainedModel):
    def __init__(self, model_name, quantization_config, num_labels=2):
        # Load base causal LM with eager attention
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            offload_folder="./offload",
            torch_dtype=torch.float16,
            attn_implementation="eager",  # âœ… fix here
        )

        # Initialize PreTrainedModel using base config
        super().__init__(base_model.config)

        self.model = base_model
        hidden_size = base_model.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        # Forward through LM backbone
        outputs = self.model.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        hidden = outputs.last_hidden_state  # [batch, seq, hidden]

        # Pool last valid token
        last_token_index = attention_mask.sum(dim=1) - 1
        pooled = hidden[torch.arange(hidden.size(0)), last_token_index]

        logits = self.classifier(pooled)
        out = {"logits": logits}

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
            loss = loss_fct(logits, labels)
            out["loss"] = loss

        return out

    # âœ… Forward generation calls to base model (needed for PEFT compatibility)
    def prepare_inputs_for_generation(self, *args, **kwargs):
        return self.model.prepare_inputs_for_generation(*args, **kwargs)

    def generate(self, *args, **kwargs):
        return self.model.generate(*args, **kwargs)


# Instantiate wrapped model
model = CausalLMForClassification(model_name, quantization_config)

# Prepare for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA setup
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# ----------------------
# 5. Compute class weights for loss
# ----------------------
counts = df_train["label"].value_counts().to_dict()
n_pos = counts.get(1, 0)
n_neg = counts.get(0, 0)
pos_weight = (n_neg / (n_pos + 1e-12)) if n_pos > 0 else 1.0
print(f"pos_weight (neg/pos) = {pos_weight:.3f}")

class_weights = torch.tensor([1.0, pos_weight], dtype=torch.float)


# ----------------------
# 6. Custom Trainer (weighted loss)
# ----------------------
from transformers import Trainer

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Weighted CrossEntropyLoss for binary classification.
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs["logits"]

        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


# ----------------------
# 7. Metrics
# ----------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1m = f1_score(labels, preds, average="macro")
    # AUC
    try:
        probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()
        auc = roc_auc_score(labels, probs[:, 1])
    except Exception:
        auc = float("nan")
    return {"accuracy": acc, "f1_macro": f1m, "roc_auc": auc}

# ----------------------
# 8. TrainingArguments + Trainer
# ----------------------
training_args = TrainingArguments(
    output_dir="./results_v5",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=30,
    learning_rate=2e-5,
    weight_decay=0.01,
    seed=42,
    logging_steps=50,
    save_total_limit=3,
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# ----------------------
# 9. Train
# ----------------------
trainer.train()
trainer.save_model("./results_v5")
tokenizer.save_pretrained("./results_v5")

# ----------------------
# 10. Inference (example)
# ----------------------
model.eval()
submission = []
for item in test_rows:
    text = item["text"].strip()
    acronym = item["acronym"].strip()
    options = item["options"]  # list

    inputs = [
        f"In the context: '{text}', what does the acronym '{acronym}' mean? Option: {opt}"
        for opt in options
    ]
    enc = tokenizer(inputs, truncation=True, padding=True, max_length=256, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

    # rank descending, then filter by ascending indices + threshold example:
    ranked = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)
    # now apply your ascending order + threshold logic (example)
    threshold = 0.5
    selected = []
    last = -1
    for idx in ranked:
        if probs[idx] < threshold:
            continue
        if idx > last:
            selected.append(idx)
            last = idx
        else:
            break
    if not selected and len(ranked) > 0:
        selected = [ranked[0]]
    submission.append({"id": item["id"], "prediction": str(selected)})

pd.DataFrame(submission).to_csv("submission_v5.csv", index=False)
print("Saved submission_v5.csv")

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1715/1715 [00:00<00:00, 4575.12 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 462/462 [00:00<00:00, 7857.53 examples/s]


Example tokenized train sample:
{'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1328, 1040, 3526, 29515, 1232, 29525, 2210, 1371, 29476, 1155, 29475, 1064, 1402, 1135, 29527, 1415, 1535, 2003, 1040, 1951, 2457, 1850, 1232, 13580, 29510, 2840, 29572, 8432, 29515, 8969, 2059, 2057, 27474, 1482, 1053, 1307, 1517, 1402, 1052, 29479, 29493, 1705, 1826, 1234, 1402, 27660, 1108, 1052, 29479, 29493, 15804, 4798, 1093, 1030, 12700, 29499]

`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 3 files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [01:59<00:00, 39.90s/it] 
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:19<00:00,  6.39s/it]
CausalLMForClassification has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From ðŸ‘‰v4.50ðŸ‘ˆ onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
  trainer = WeightedTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation 

trainable params: 3,407,872 || all params: 7,251,439,618 || trainable%: 0.0470
pos_weight (neg/pos) = 4.000


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Roc Auc
1,1.6687,1.535957,0.794372,0.577505,0.664262
2,3.2392,1.722485,0.822511,0.623225,0.730989
3,0.0,,0.805195,0.446043,
4,0.0,,0.805195,0.446043,
5,0.0,,0.805195,0.446043,


RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)