In [None]:
# === CELL 1 (v24 - + Argument Generation Loss) ===
import re
import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, pipeline
from peft import LoraConfig, get_peft_model, TaskType
from random import sample, random
from transformers import AutoTokenizer as AutoTokenizerNLI, AutoModelForSequenceClassification
import wandb
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import f1_score

# Load tokenizer and paraphraser globally
# Both are created once at cell execution: `tokenizer` is shared by the metric,
# masking and training code below; `paraphraser` is only used by
# t5_paraphrase_text().
tokenizer = T5Tokenizer.from_pretrained("t5-base")
paraphraser = pipeline("text2text-generation", model="ramsrigouthamg/t5_paraphraser")

# Load NLI model
# Switched to eval mode because it is only used for scoring in the *_loss helpers.
# NOTE(review): the class order of this checkpoint should be read from
# nli_model.config.id2label rather than assumed — confirm before indexing logits.
nli_tokenizer = AutoTokenizerNLI.from_pretrained("facebook/bart-large-mnli")
nli_model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli").eval()

# Load SBERT for semantic embedding
# Used by compute_metrics() to embed predicted and reference texts.
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def distinct_ngrams(texts, n=2):
    """Return the distinct-n ratio of a collection of texts.

    Every text is split on whitespace and all of its n-grams are pooled
    across the whole collection; the result is the number of distinct
    n-grams divided by the total number of n-grams.

    Args:
        texts: Iterable of strings.
        n: N-gram length.

    Returns:
        The unique/total ratio, or 0 when no n-gram could be formed.
    """
    seen = set()
    total = 0
    for text in texts:
        words = text.split()
        for start in range(len(words) - n + 1):
            seen.add(tuple(words[start:start + n]))
            total += 1
    if total == 0:
        return 0
    return len(seen) / total

def compute_metrics(eval_pred, lam):
    """Compute text-level evaluation metrics for a Trainer evaluation pass.

    Predictions are obtained by arg-maxing ``eval_pred.predictions[0]`` over
    its last axis (presumably per-token vocabulary logits — confirm against
    the model output) and decoding with the module-level ``tokenizer``.

    Args:
        eval_pred: Object exposing ``predictions`` (indexable, element 0 is
            used) and ``label_ids``; ``-100`` entries in the labels are
            replaced by the pad token before decoding.
        lam: Weight of the semantic score in the combined score; the
            diversity score gets ``1 - lam``.

    Returns:
        dict with ``semantic_score`` (mean cosine similarity between each
        prediction and its own reference), ``diversity_score`` (distinct-2
        over all predictions), ``avg_f1`` (mean word-overlap F1) and
        ``final_score`` (``1 - (lam * semantic + (1 - lam) * diversity)``),
        all rounded to 4 decimals.
    """
    from collections import Counter

    print("🧪 Starting evaluation...")
    predictions = torch.argmax(torch.tensor(eval_pred.predictions[0]), dim=-1)
    labels = torch.tensor(eval_pred.label_ids)

    # Decode predictions and labels
    predicted_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels_filtered = [[token if token != -100 else tokenizer.pad_token_id for token in seq] for seq in labels.cpu().numpy()]
    true_texts = tokenizer.batch_decode(labels_filtered, skip_special_tokens=True)

    # Semantic similarity
    pred_emb = sbert_model.encode(predicted_texts, convert_to_tensor=True)
    true_emb = sbert_model.encode(true_texts, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(pred_emb, true_emb)
    # pytorch_cos_sim returns the full N x N matrix; only the diagonal pairs
    # prediction i with reference i, so average that instead of every
    # (mismatched) combination.
    semantic_score = cosine_scores.diagonal().mean().item()

    # Diversity score (distinct-2)
    diversity_score = distinct_ngrams(predicted_texts, n=2)

    # Word-level F1 based on multiset token overlap, so that both precision
    # (w.r.t. the prediction) and recall (w.r.t. the reference) count.
    all_f1 = []
    for pred, label in zip(predicted_texts, true_texts):
        pred_tokens = pred.split()
        label_tokens = label.split()
        if not pred_tokens:
            # Empty predictions are skipped, as before.
            continue
        overlap = sum((Counter(pred_tokens) & Counter(label_tokens)).values())
        if overlap == 0:
            all_f1.append(0.0)
            continue
        precision = overlap / len(pred_tokens)
        recall = overlap / len(label_tokens)
        all_f1.append(2 * precision * recall / (precision + recall))
    avg_f1 = sum(all_f1) / len(all_f1) if all_f1 else 0.0

    # Parenthesised on purpose: `diversity_score * 1-lam` evaluates as
    # `diversity_score - lam`, which is not a convex combination.
    final_score = (semantic_score * lam) + (diversity_score * (1 - lam))

    return {
        "semantic_score": round(semantic_score, 4),
        "diversity_score": round(diversity_score, 4),
        "avg_f1": round(avg_f1, 4),
        "final_score": round(1 - final_score, 4)
    }

@torch.no_grad()
def nli_contradiction_loss(premises, hypotheses):
    """Return ``1 - P(contradiction)`` averaged over premise/hypothesis pairs.

    Each pair is scored with the module-level ``nli_model``. The index of the
    contradiction class is read from the model config instead of being
    hard-coded, because the class order differs between MNLI checkpoints
    (for facebook/bart-large-mnli, index 2 is entailment, not contradiction).

    Runs under ``torch.no_grad()``, so the returned tensor carries no gradient.

    Args:
        premises: Iterable of premise strings.
        hypotheses: Iterable of hypothesis strings, paired with ``premises``.

    Returns:
        0-dim tensor; ``0.0`` when there are no pairs.
    """
    label2id = {name.lower(): idx for name, idx in nli_model.config.label2id.items()}
    contradiction_idx = label2id["contradiction"]
    losses = []
    for premise, hypo in zip(premises, hypotheses):
        inputs = nli_tokenizer(premise, hypo, return_tensors="pt", truncation=True, padding=True)
        inputs = inputs.to(nli_model.device)
        outputs = nli_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        contradiction_prob = probs[:, contradiction_idx]
        losses.append(1.0 - contradiction_prob.mean())
    if not losses:
        # torch.stack() raises on an empty list.
        return torch.tensor(0.0)
    return torch.stack(losses).mean()

@torch.no_grad()
def topic_relevance_loss(topics, generations):
    """Return ``1 - P(entailment)`` averaged over topic/generation pairs.

    The topic is used as the NLI premise and the generated text as the
    hypothesis, scored with the module-level ``nli_model``. The entailment
    index is looked up in the model config rather than hard-coded, because
    for facebook/bart-large-mnli index 0 is contradiction, not entailment.

    Runs under ``torch.no_grad()``, so the returned tensor carries no gradient.

    Args:
        topics: Iterable of topic strings.
        generations: Iterable of generated strings, paired with ``topics``.

    Returns:
        0-dim tensor; ``0.0`` when there are no pairs.
    """
    label2id = {name.lower(): idx for name, idx in nli_model.config.label2id.items()}
    entail_idx = label2id["entailment"]
    losses = []
    for topic, gen in zip(topics, generations):
        inputs = nli_tokenizer(topic, gen, return_tensors="pt", truncation=True, padding=True)
        inputs = inputs.to(nli_model.device)
        outputs = nli_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        entail_prob = probs[:, entail_idx]
        losses.append(1.0 - entail_prob.mean())
    if not losses:
        # torch.stack() raises on an empty list.
        return torch.tensor(0.0)
    return torch.stack(losses).mean()

@torch.no_grad()
def semantic_similarity_loss(refs, hypos):
    """Return ``1 - mean P(entailment)`` over reference/hypothesis pairs.

    Entailment probability from the module-level ``nli_model`` is used as a
    similarity proxy. The entailment index is read from the model config
    rather than hard-coded, because for facebook/bart-large-mnli index 0 is
    contradiction, not entailment.

    Runs under ``torch.no_grad()``, so the returned tensor carries no gradient.

    Args:
        refs: Iterable of reference strings (NLI premises).
        hypos: Iterable of hypothesis strings, paired with ``refs``.

    Returns:
        0-dim tensor (low when the pairs entail each other); ``0.0`` when
        there are no pairs.
    """
    label2id = {name.lower(): idx for name, idx in nli_model.config.label2id.items()}
    entail_idx = label2id["entailment"]
    similarities = []
    for r, h in zip(refs, hypos):
        inputs = nli_tokenizer(r, h, return_tensors="pt", truncation=True, padding=True)
        inputs = inputs.to(nli_model.device)
        outputs = nli_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        similarities.append(probs[:, entail_idx].mean())
    if not similarities:
        # torch.stack() raises on an empty list.
        return torch.tensor(0.0)
    return 1.0 - torch.stack(similarities).mean()

# === New: Argument generation detection using simple keyword pattern ===
def argument_presence_loss(paragraphs):
    """Penalise paragraphs that contain none of a fixed list of reasoning cues.

    Each paragraph is lower-cased and searched (substring match) for the cue
    phrases below; it contributes 0.0 when at least one is found and 1.0
    otherwise.

    Args:
        paragraphs: Iterable of strings.

    Returns:
        0-dim tensor holding the mean penalty over all paragraphs.
    """
    cue_phrases = ("because", "as a result", "due to", "this means", "this is because", "for example", "for instance")
    penalties = []
    for paragraph in paragraphs:
        lowered = paragraph.lower()
        has_cue = False
        for phrase in cue_phrases:
            if phrase in lowered:
                has_cue = True
                break
        penalties.append(torch.tensor(float(not has_cue)))
    return torch.stack(penalties).mean()

def lexical_diversity_loss(labels, pad_token_id=0):
    """Penalise repeated token ids within each sequence.

    For every row the penalty is ``1 - unique_ids / total_ids`` computed over
    the non-padding positions; rows with no non-padding token contribute 0.

    Args:
        labels: 2-D tensor (or nested list) of token ids, one row per sequence.
        pad_token_id: Id that is ignored when counting tokens.

    Returns:
        0-dim tensor on the device of ``labels`` with the mean penalty;
        ``0.0`` when there are no rows.
    """
    labels = torch.as_tensor(labels)
    losses = []
    for seq in labels:
        # Convert to Python ints first: 0-dim tensors hash by identity, so a
        # set of them never collapses equal ids and the penalty would always
        # come out as 0.
        ids = [t for t in seq.tolist() if t != pad_token_id]
        total = len(ids)
        penalty = 1.0 - len(set(ids)) / total if total > 0 else 0.0
        losses.append(torch.tensor(penalty, device=labels.device))
    if not losses:
        # torch.stack() raises on an empty list.
        return torch.tensor(0.0, device=labels.device)
    return torch.stack(losses).mean()

def repetition_overlap_loss(body1s, body2s):
    """Measure how much of each second paragraph's vocabulary reuses the first.

    Both texts are lower-cased and split on whitespace; the per-pair score is
    the number of shared distinct words divided by the number of distinct
    words in the second text (denominator floored at 1).

    Args:
        body1s: Iterable of first-paragraph strings.
        body2s: Iterable of second-paragraph strings, paired with ``body1s``.

    Returns:
        0-dim tensor with the mean overlap ratio across pairs.
    """
    def _overlap_ratio(first, second):
        vocab_first = set(first.lower().split())
        vocab_second = set(second.lower().split())
        shared = vocab_first.intersection(vocab_second)
        return len(shared) / max(1, len(vocab_second))

    ratios = [torch.tensor(_overlap_ratio(first, second)) for first, second in zip(body1s, body2s)]
    return torch.stack(ratios).mean()

def ngram_overlap_loss(sequences, n=3):
    """Penalise repeated n-grams inside each text.

    Each text is lower-cased and split on whitespace. Its penalty is
    ``1 - unique_ngrams / total_ngrams``, so a text without any repeated
    n-gram scores 0. The ratio is normalised by the number of n-grams (not
    the number of tokens), matching ``distinct_ngrams``; texts too short to
    contain a single n-gram have nothing to repeat and also score 0.

    Args:
        sequences: Iterable of strings.
        n: N-gram length.

    Returns:
        0-dim CPU tensor with the mean penalty; ``0.0`` when ``sequences``
        is empty.
    """
    losses = []
    for seq in sequences:
        tokens = seq.lower().split()
        total = len(tokens) - n + 1
        if total <= 0:
            penalty = 0.0
        else:
            unique = len({tuple(tokens[i:i + n]) for i in range(total)})
            penalty = 1.0 - unique / total
        losses.append(torch.tensor(penalty, device='cpu'))
    if not losses:
        # torch.stack() raises on an empty list.
        return torch.tensor(0.0, device='cpu')
    return torch.stack(losses).mean()

@torch.no_grad()
def argument_distance_loss(body1s, body2s):
    """Penalise second paragraphs that merely restate the first one.

    Returns the mean NLI entailment probability of ``body2`` given ``body1``
    (scored with the module-level ``nli_model``), so higher similarity gives a
    higher loss. The entailment index is read from the model config because
    the class order differs between MNLI checkpoints.

    Runs under ``torch.no_grad()``, so the returned tensor carries no gradient.

    Args:
        body1s: Iterable of first-paragraph strings (NLI premises).
        body2s: Iterable of second-paragraph strings, paired with ``body1s``.

    Returns:
        0-dim tensor; ``0.0`` when there are no pairs.
    """
    label2id = {name.lower(): idx for name, idx in nli_model.config.label2id.items()}
    entail_idx = label2id["entailment"]
    similarities = []
    for b1, b2 in zip(body1s, body2s):
        inputs = nli_tokenizer(b1, b2, return_tensors="pt", truncation=True, padding=True)
        inputs = inputs.to(nli_model.device)
        probs = torch.softmax(nli_model(**inputs).logits, dim=1)
        similarities.append(probs[:, entail_idx].mean())
    if not similarities:
        # torch.stack() raises on an empty list.
        return torch.tensor(0.0)
    return torch.stack(similarities).mean()  # Higher similarity → higher loss

def dynamic_mask_input(text, tokenizer, mask_rate=0.15):
    """Randomly replace a fraction of the tokens of ``text`` with a sentinel.

    The text is tokenized with ``tokenizer.tokenize``; texts with fewer than
    four tokens are returned untouched. Otherwise
    ``max(1, int(len(tokens) * mask_rate))`` randomly chosen positions are
    overwritten with ``<extra_id_0>`` and the tokens are joined back with
    ``tokenizer.convert_tokens_to_string``.

    Args:
        text: Input string.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_string``.
        mask_rate: Fraction of tokens to mask.

    Returns:
        The masked string, or ``text`` itself when it is too short.
    """
    pieces = tokenizer.tokenize(text)
    piece_count = len(pieces)
    if piece_count < 4:
        return text
    mask_count = max(1, int(piece_count * mask_rate))
    # sample() draws distinct positions, so exactly mask_count tokens change.
    chosen = set(sample(range(piece_count), mask_count))
    masked = ["<extra_id_0>" if pos in chosen else piece for pos, piece in enumerate(pieces)]
    return tokenizer.convert_tokens_to_string(masked)

def t5_paraphrase_text(text):
    """Paraphrase ``text`` with the module-level ``paraphraser`` pipeline.

    A single sampled candidate of at most 128 tokens is requested.

    Args:
        text: String to paraphrase.

    Returns:
        The generated paraphrase, or ``text`` unchanged when the pipeline
        returns nothing.
    """
    candidates = paraphraser(
        f"paraphrase: {text} </s>",
        max_length=128,
        num_return_sequences=1,
        do_sample=True,
    )
    if not candidates:
        return text
    return candidates[0]["generated_text"]

# === Load and prepare dataset ===
# Only the "train" split is loaded. The filtering code below reads the "band",
# "essay" and "prompt" fields of each example.
raw_dataset = load_dataset("chillies/IELTS-writing-task-2-evaluation", split="train")

def is_valid(example):
    """Tell whether an example is a high-band essay that is long enough.

    The band is parsed by stripping every character except digits and dots
    from ``example["band"]``. An example is kept when the band is at least
    7.0 and the essay has more than 220 whitespace-separated words.

    Args:
        example: Mapping with ``"band"`` and ``"essay"`` entries.

    Returns:
        ``True`` or ``False``; malformed examples (missing keys, non-string
        values, unparsable band) yield ``False`` instead of raising.
    """
    try:
        band = float(re.sub(r"[^\d.]", "", example["band"]))
        essay = example["essay"]
    except (KeyError, TypeError, ValueError):
        # Missing field, non-string band, or a band with no usable number.
        return False
    if not isinstance(essay, str):
        return False
    return band >= 7.0 and len(essay.split()) > 220

# Keep only the examples accepted by is_valid(), preserving dataset order.
filtered = list(filter(is_valid, raw_dataset))

def split_paragraphs_flex(essay):
    """Split an essay into intro, two body paragraphs and conclusion.

    Paragraphs are separated by one or more blank lines (two or more
    consecutive newlines). The first three paragraphs and the last one are
    returned; any paragraphs in between are ignored.

    Args:
        essay: Full essay text.

    Returns:
        ``(intro, body1, body2, conclusion)``, or ``None`` when the essay has
        fewer than four non-empty paragraphs.
    """
    paras = [p.strip() for p in re.split(r"\n{2,}", essay.strip()) if p.strip()]
    # Guard first: a conditional expression inside the tuple would bind only
    # to its last element and never produce None for the whole result.
    if len(paras) < 4:
        return None
    return paras[0], paras[1], paras[2], paras[-1]

# Build the training records: one dict per essay that splits cleanly into
# intro / body1 / body2 / conclusion and passes the length and overlap checks.
split_data = []
for ex in filtered:
    try:
        result = split_paragraphs_flex(ex["essay"])
        if result is None:
            continue
        intro, body1, body2, conclusion = result
        # Minimum word counts per paragraph, and the intro must not simply
        # copy the first 30 characters of the prompt.
        if all(len(p.split()) > t for p, t in zip([intro, body1, body2, conclusion], [40, 60, 70, 35])) and ex["prompt"][:30] not in intro:
            set1, set2 = set(body1.lower().split()), set(body2.lower().split())
            # Drop essays whose second body paragraph mostly reuses the
            # vocabulary of the first one.
            if len(set1 & set2) / max(1, len(set2)) < 0.7:
                split_data.append({
                    "prompt": ex["prompt"].strip(),
                    "intro": intro.strip(),
                    "body1": body1.strip(),
                    "body2": body2.strip(),
                    "conclusion": conclusion.strip()
                })
    except (AttributeError, IndexError, KeyError, TypeError):
        # Best-effort: skip malformed examples (too few paragraphs, missing or
        # non-string fields) without swallowing KeyboardInterrupt/SystemExit.
        continue

print("\n📊 Filtered Samples:", len(split_data))

# === Define train function ===
def train_paragraph_model(field, save_dir, max_target_length=256):
    """Fine-tune a LoRA-adapted t5-base to generate one essay paragraph type.

    Builds instruction-style inputs from the module-level ``split_data``
    (with random masking / paraphrasing of parts of the input), tokenizes
    them, trains for four epochs with a custom loss, logs to Weights & Biases
    and saves the adapter and tokenizer to ``save_dir``.

    Args:
        field: Paragraph to learn; one of ``"intro"``, ``"body1"``,
            ``"body2"`` or ``"conclusion"``.
        save_dir: Output directory for checkpoints, logs and the final model.
        max_target_length: Maximum number of target tokens (targets are
            padded/truncated to this length).

    Raises:
        ValueError: If ``field`` is not one of the supported paragraph names.
    """
    valid_fields = ("intro", "body1", "body2", "conclusion")
    if field not in valid_fields:
        # Without this check an unknown field would leave `input_text`
        # undefined (or stale from a previous iteration) in the loop below.
        raise ValueError(f"Unknown field {field!r}; expected one of {valid_fields}")

    wandb.init(project="nlp_project2", name=f"{field}_v24")

    print(f"\n🚀 Training for: {field.upper()}", flush=True)
    data = []
    for ex in split_data:
        if len(ex["prompt"]) < 10 or len(ex[field]) < 30:
            continue
        # Half of the prompts get random tokens replaced by a sentinel.
        prompt = dynamic_mask_input(ex["prompt"], tokenizer) if random() < 0.5 else ex["prompt"]
        if field == "intro":
            input_text = f"Write a short and clear INTRODUCTION:\n\n{prompt}\n\n- Paraphrase topic\n- State opinion\n- Brief background"
        elif field == "body1":
            input_text = f"Write the FIRST BODY PARAGRAPH for:\n\n{prompt}\n\n- Clear argument\n- Specific example\n- Logical explanation"
        elif field == "body2":
            intro = dynamic_mask_input(ex["intro"], tokenizer) if random() < 0.5 else ex["intro"]
            body1_masked = dynamic_mask_input(ex["body1"], tokenizer) if random() < 0.3 else ex["body1"]
            topic_masked = dynamic_mask_input(prompt, tokenizer) if random() < 0.3 else prompt
            input_text = (
                f"Write the SECOND BODY PARAGRAPH that presents a CONTRASTING perspective.\n\n"
                f"TOPIC: {topic_masked}\n\nINTRO: {intro}\n\nBODY 1: {body1_masked}\n\n"
                "Requirements:\n- Start with a contrast linker\n- Opposing idea\n- Specific example\n- Avoid repeating Body 1"
            )
        elif field == "conclusion":
            intro = t5_paraphrase_text(ex["intro"]) if random() < 0.5 else ex["intro"]
            input_text = (
                f"Write a CONCLUSION:\n\nTOPIC: {prompt}\n\nINTRO (paraphrased): {intro}\n\n"
                "Instructions:\n- Restate opinion\n- Summarise main points\n- End strongly"
            )
        data.append({
            "input_text": input_text,
            "target_text": ex[field],
            "intro": ex["intro"],
            "body1": ex["body1"],
            "body2": ex["body2"],
            "topic": ex["prompt"]
        })

    # Fixed seed so every run uses the same 90/10 train/test split.
    dataset = Dataset.from_list(data).train_test_split(test_size=0.1, seed=42)

    def tokenize_fn(batch):
        """Tokenize inputs/targets and attach the extra columns read by the loss."""
        inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=512)
        targets = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=max_target_length)
        # Padding positions keep the pad id (not -100); dpo_loss masks them
        # explicitly via pad_token_id.
        inputs["labels"] = targets["input_ids"]
        if field in ["body2", "conclusion"]:
            intros = tokenizer(batch["intro"], padding="max_length", truncation=True, max_length=256)
            inputs["intro"] = intros["input_ids"]
        if field in ["body1", "body2"]:
            inputs["body1_text"] = batch["body1"]
        if field == "body2":
            inputs["body2_text"] = batch["body2"]
            inputs["topic"] = batch["topic"]
        return inputs

    tokenized = dataset.map(tokenize_fn, batched=True)

    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    lora = LoraConfig(r=32, lora_alpha=64, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type=TaskType.SEQ_2_SEQ_LM)
    model = get_peft_model(model, lora)

    def dpo_loss(logits, labels, intros=None, body1_text=None, body2_text=None, topic_text=None, pad_token_id=0):
        """Token cross-entropy plus weighted auxiliary penalties.

        The base term is cross-entropy over non-padding label positions. The
        auxiliary terms are only added when the corresponding text inputs are
        not None; they are computed from reference texts, not from the
        model's logits.

        NOTE(review): with the Trainer default ``remove_unused_columns=True``
        the extra dataset columns ("intro", "body1_text", "body2_text",
        "topic") appear to be dropped before batching, in which case every
        term except the cross-entropy and the lexical one is 0 — confirm.
        NOTE(review): ``intros`` would arrive as token ids, while
        semantic_similarity_loss() feeds its arguments to a tokenizer as
        text — confirm before enabling.
        NOTE(review): ``l_topic`` is scaled by 0.9 here and again in the
        weighted sum (effective weight 0.81) — confirm this is intended.
        """
        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)
        mask = labels != pad_token_id
        base = F.cross_entropy(logits[mask], labels[mask]) if mask.any() else torch.tensor(0.0, device=logits.device)
        # The whole (flattened) batch is treated as a single sequence here.
        l_lex = lexical_diversity_loss(labels.view(1, -1), pad_token_id)
        l_contra = nli_contradiction_loss(body1_text, body2_text) if body1_text is not None else 0.0
        l_topic = topic_relevance_loss(topic_text, body2_text) * 0.9 if topic_text is not None else 0.0
        l_rep = repetition_overlap_loss(body1_text, body2_text) if body1_text is not None else 0.0
        l_ngram = ngram_overlap_loss(body2_text) if body2_text is not None else 0.0
        l_arg = argument_distance_loss(body1_text, body2_text) if body1_text is not None else 0.0
        l_sem = semantic_similarity_loss(intros, body2_text) if intros is not None and body2_text is not None else 0.0
        l_gen_b2 = argument_presence_loss(body2_text) if body2_text is not None else 0.0
        l_gen_b1 = argument_presence_loss(body1_text) if body1_text is not None else 0.0
        return base + 0.2 * l_lex + 0.7 * l_contra + 0.9 * l_topic + 0.5 * l_rep + 0.4 * l_ngram + 0.4 * l_arg + 0.4 * l_sem + 0.5 * l_gen_b2 + 0.3 * l_gen_b1

    class CustomTrainer(Trainer):
        """Trainer that replaces the model's built-in loss with dpo_loss."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.get("labels")
            intros = inputs.get("intro")
            body1_text = inputs.get("body1_text")
            body2_text = inputs.get("body2_text")
            topic_text = inputs.get("topic")
            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels)
            loss = dpo_loss(outputs.logits, labels, intros, body1_text, body2_text, topic_text)
            return (loss, outputs) if return_outputs else loss

    args = TrainingArguments(
        output_dir=save_dir,
        report_to=["wandb"],
        run_name=f"{field}_v24",
        logging_dir=f"{save_dir}/logs",
        logging_steps=10,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-4,
        weight_decay=0.01,
        warmup_steps=100,
        num_train_epochs=4,
        fp16=True
    )

    trainer = CustomTrainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        compute_metrics=lambda p: compute_metrics(p,lam = 0.5)
    )

    trainer.train()
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"✅ Saved model to: {save_dir}", flush=True)

    # Close the wandb run opened above so the last run is finalised too and
    # the next call starts from a clean state.
    wandb.finish()

# === Train all paragraph models for v24 ===
# One independent model is trained and saved per paragraph type; each call
# passes its own max_target_length, which caps the tokenized target length.
train_paragraph_model("intro", "./t5_intro_lora_v24", max_target_length=160)
train_paragraph_model("body1", "./t5_body1_lora_v24", max_target_length=240)
train_paragraph_model("body2", "./t5_body2_lora_v24", max_target_length=288)
train_paragraph_model("conclusion", "./t5_conclusion_lora_v24", max_target_length=96)





You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0



📊 Filtered Samples: 419


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: st124689 (binit-ait) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin



🚀 Training for: INTRO


Map:   0%|          | 0/377 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Semantic Score,Diversity Score,Avg F1,Final Score
1,3.2599,2.719442,0.2494,0.5034,0.7453,0.8719
2,2.6592,2.641587,0.2508,0.4917,0.7512,0.8829
3,2.7836,2.606685,0.2397,0.4661,0.7574,0.9141
4,2.7366,2.594213,0.241,0.4568,0.7547,0.9227


🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
✅ Saved model to: ./t5_intro_lora_v24


0,1
eval/avg_f1,▁▄█▆
eval/diversity_score,█▆▂▁
eval/final_score,▁▃▇█
eval/loss,█▄▂▁
eval/runtime,▃▁█▆
eval/samples_per_second,▅█▁▃
eval/semantic_score,▇█▁▂
eval/steps_per_second,▅█▁▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█

0,1
eval/avg_f1,0.7547
eval/diversity_score,0.4568
eval/final_score,0.9227
eval/loss,2.59421
eval/runtime,23.4873
eval/samples_per_second,1.788
eval/semantic_score,0.241
eval/steps_per_second,0.894
total_flos,934702926004224.0
train/epoch,4.0



🚀 Training for: BODY1


Map:   0%|          | 0/377 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Semantic Score,Diversity Score,Avg F1,Final Score
1,3.4603,3.342158,0.209,0.728,0.7308,0.6675
2,3.2788,3.281767,0.2165,0.7259,0.7366,0.6658
3,3.4103,3.261563,0.2192,0.7213,0.7362,0.6691
4,3.3529,3.253983,0.2166,0.7244,0.7398,0.6673


🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
✅ Saved model to: ./t5_body1_lora_v24


0,1
eval/avg_f1,▁▆▅█
eval/diversity_score,█▆▁▄
eval/final_score,▅▁█▄
eval/loss,█▃▂▁
eval/runtime,█▁▂▄
eval/samples_per_second,▁█▆▃
eval/semantic_score,▁▆█▆
eval/steps_per_second,▁█▆▃
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇████
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇█████

0,1
eval/avg_f1,0.7398
eval/diversity_score,0.7244
eval/final_score,0.6673
eval/loss,3.25398
eval/runtime,25.6383
eval/samples_per_second,1.638
eval/semantic_score,0.2166
eval/steps_per_second,0.819
total_flos,934702926004224.0
train/epoch,4.0



🚀 Training for: BODY2


Map:   0%|          | 0/377 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Semantic Score,Diversity Score,Avg F1,Final Score
1,3.4062,3.230798,0.248,0.7698,0.7263,0.6062
2,3.2856,3.181254,0.2406,0.7284,0.7276,0.6513
3,3.2824,3.161742,0.2444,0.7232,0.7272,0.6546
4,3.3957,3.150816,0.2449,0.7441,0.7284,0.6335


🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
✅ Saved model to: ./t5_body2_lora_v24


0,1
eval/avg_f1,▁▅▄█
eval/diversity_score,█▂▁▄
eval/final_score,▁██▅
eval/loss,█▄▂▁
eval/runtime,█▁▅▄
eval/samples_per_second,▁█▃▄
eval/semantic_score,█▁▅▅
eval/steps_per_second,▁█▃▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/avg_f1,0.7284
eval/diversity_score,0.7441
eval/final_score,0.6335
eval/loss,3.15082
eval/runtime,29.7689
eval/samples_per_second,1.411
eval/semantic_score,0.2449
eval/steps_per_second,0.705
total_flos,934702926004224.0
train/epoch,4.0



🚀 Training for: CONCLUSION


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Map:   0%|          | 0/377 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Semantic Score,Diversity Score,Avg F1,Final Score
1,3.1905,2.887333,0.1978,0.4602,0.7351,0.9409
2,3.2018,2.841069,0.2005,0.458,0.7404,0.9418
3,2.9138,2.824035,0.204,0.4642,0.7403,0.9338
4,3.009,2.816853,0.203,0.4632,0.7418,0.9353


🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
🧪 Starting evaluation...
✅ Saved model to: ./t5_conclusion_lora_v24
