In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel, get_scheduler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
import nltk
from nltk.corpus import wordnet
import random
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# ========== Step 1. Load data ==========
# Read the training and test CSVs from the Colab workspace in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)
# Quick look at the training schema and its first rows.
print(df_train.columns)
print(df_train.head())

Index(['Question', 'label'], dtype='object')
                                            Question  label
0  A solitaire game is played as follows.  Six di...      3
1  2. The school table tennis championship was he...      5
2  Given that $x, y,$ and $z$ are real numbers th...      0
3  $25 \cdot 22$ Given three distinct points $P\l...      1
4  I am thinking of a five-digit number composed ...      5


In [None]:
import random
import pandas as pd
import torch
from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the paraphrase model. FP16 on GPU is used when CUDA is available;
# otherwise the model stays in full precision on CPU so the cell still runs
# (slowly) instead of crashing on `.cuda()`.
model_name = "eugenesiow/bart-paraphrase"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
if torch.cuda.is_available():
    # Half precision is only applied together with the move to the GPU.
    model = model.half().cuda()
model.eval()

# Synonym replacement via WordNet
def synonym_replacement(text, n=2):
    """Replace up to ``n`` distinct alphabetic words of ``text`` with WordNet synonyms.

    The text is split on whitespace; every occurrence of a chosen word is
    replaced by the same randomly picked synonym. Tokens that are not purely
    alphabetic (numbers, LaTeX fragments, punctuation-attached words) are never
    touched. Only single-word, purely alphabetic lemmas are used as synonyms.

    Args:
        text: Input string.
        n: Maximum number of distinct words to replace. ``n <= 0`` replaces nothing.

    Returns:
        The words joined by single spaces, with the replacements applied.
    """
    words = text.split()
    if n <= 0 or not words:
        # Nothing to replace; still return the whitespace-normalised text.
        return ' '.join(words)

    # Sort before shuffling: the iteration order of a set of strings is not
    # stable across interpreter runs, so shuffling a sorted list makes the
    # outcome depend only on the state of `random`.
    candidates = sorted({w for w in words if w.isalpha()})
    random.shuffle(candidates)

    new_words = list(words)
    num_replaced = 0
    for word in candidates:
        word_lower = word.lower()
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace("_", " ").lower()
                # Compare case-insensitively so that "Given" -> "given" is not
                # counted as a replacement; multi-word lemmas fail isalpha().
                if synonym != word_lower and synonym.isalpha():
                    synonyms.add(synonym)
        if not synonyms:
            continue
        # sorted() for the same reproducibility reason as above.
        synonym = random.choice(sorted(synonyms))
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

# Batched paraphrase generation
def batch_paraphrase(texts, max_length=128):
    """Paraphrase a batch of texts with the globally loaded seq2seq model.

    Each text is prefixed with ``"paraphrase: "``, tokenised (padded and
    truncated to ``max_length`` tokens), and decoded with 5-beam search, one
    output per input.

    Args:
        texts: Iterable of input strings.
        max_length: Token limit used both for input truncation and for generation.

    Returns:
        A list of decoded strings, one per input; ``[]`` for empty input.
    """
    texts = list(texts)
    if not texts:
        # Avoid handing an empty batch to the tokenizer/model.
        return []

    prompts = [f"paraphrase: {t}" for t in texts]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(model.device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=5,
            num_return_sequences=1,
            do_sample=False
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Data augmentation pipeline
def augment_dataframe(df, batch_size=16):
    """Return ``df`` followed by a synonym-replaced copy and a paraphrased copy.

    Reads the ``Question`` and ``label`` columns. For every row one
    synonym-replaced question (``n=2``) is produced, then the questions are
    paraphrased in batches of ``batch_size``. If a paraphrase batch raises, the
    error is printed and the unmodified questions of that batch are used
    instead. Labels are carried over unchanged.

    Args:
        df: DataFrame with ``Question`` and ``label`` columns.
        batch_size: Number of questions per paraphrase call.

    Returns:
        A new DataFrame: the original rows, then the synonym rows, then the
        paraphrase rows, with a fresh integer index.
    """
    all_questions = df["Question"].tolist()
    all_labels = df["label"].tolist()

    # Pass 1: synonym replacement, one augmented row per source row.
    augmented_rows = [
        {"Question": synonym_replacement(question, n=2), "label": label}
        for question, label in tqdm(
            zip(all_questions, all_labels), total=len(df), desc="🔁 同义词增强"
        )
    ]

    # Pass 2: paraphrasing, processed batch by batch.
    for start in tqdm(range(0, len(df), batch_size), desc="🔄 语义改写增强"):
        stop = start + batch_size
        questions = all_questions[start:stop]
        labels = all_labels[start:stop]
        try:
            paraphrased = batch_paraphrase(questions)
        except Exception as err:
            print(f"paraphrasing batch failed at [{start}-{start+batch_size}]: {err}")
            paraphrased = questions  # fall back to the unmodified questions

        augmented_rows.extend(
            {"Question": new_q, "label": label}
            for new_q, label in zip(paraphrased, labels)
        )

    df_aug = pd.DataFrame(augmented_rows)
    return pd.concat([df, df_aug], ignore_index=True)

# Example usage: augment the training frame and persist the result.
# Seed Python's RNG first: synonym_replacement draws from `random`, so without
# a fixed seed every run would write a different augmented CSV.
random.seed(42)
df = df_train
df_augmented = augment_dataframe(df, batch_size=16)
df_augmented.to_csv("augmented_math_questions.csv", index=False, encoding="utf-8")


tokenizer_config.json:   0%|          | 0.00/332 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

🔁 同义词增强: 100%|██████████| 10189/10189 [00:01<00:00, 5869.72it/s]
🔄 语义改写增强: 100%|██████████| 637/637 [36:37<00:00,  3.45s/it]


In [None]:
import pandas as pd

# Reload the augmented data saved by the augmentation cell (CSV, UTF-8).
df = pd.read_csv("augmented_math_questions.csv")  # adjust the path if the file lives elsewhere

# Strip a leading "Paraphrase:" marker (case-insensitive), then trim whitespace.
raw_questions = df["Question"]
unprefixed = raw_questions.str.replace(r"(?i)^paraphrase:\s*", "", regex=True)
df["Question"] = unprefixed.str.strip()

# Persist the cleaned frame.
df.to_csv("augmented_math_questions_cleaned.csv", index=False, encoding="utf-8")

print("✅ 清理完成，保存至 augmented_math_questions_cleaned.csv")


✅ 清理完成，保存至 augmented_math_questions_cleaned.csv


In [None]:
# ========== Step 2. Model / training hyperparameters ==========
# Backbone checkpoint and the device tensors and the model are moved to.
MODEL_NAME = "microsoft/deberta-v3-base"
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# Tokenisation and batching.
MAX_LEN = 128     # max_length passed to the tokenizer in MathDataset
BATCH_SIZE = 32   # batch size of every DataLoader

# Optimisation schedule.
EPOCHS = 30       # upper bound on training epochs
PATIENCE = 3      # epochs without validation improvement before early stopping
LAMBDA = 0.1      # weight of the supervised contrastive term in the total loss

# Not referenced by any cell visible in this notebook.
FOLDS = 5

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token to the notebook. The token is read from the
# HF_TOKEN environment variable, with an interactive prompt as fallback.
# NOTE(review): the token that was previously hard-coded in this cell has been
# exposed in the notebook source and should be revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(hf_token)


In [None]:
# Tokenizer for the classifier backbone (MODEL_NAME). This rebinds the global
# `tokenizer` name, which an earlier cell bound to the paraphrase model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class MathDataset(Dataset):
    """Dataset of prompt-prefixed math questions, tokenised on access.

    Every question is prefixed with a fixed classification instruction at
    construction time. Items are encoded lazily with the global ``tokenizer``,
    padded/truncated to ``MAX_LEN``; a ``labels`` tensor is included only when
    labels were supplied.
    """

    def __init__(self, questions, labels=None):
        instruction = "Classify the topic of this math problem: "
        self.questions = [instruction + text for text in questions]
        self.labels = labels

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        encoded = tokenizer(
            self.questions[idx],
            padding='max_length',
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch axis of size 1; drop it.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample



In [None]:
class MathClassifier(nn.Module):
    """Transformer backbone with a two-layer classification head.

    The backbone is loaded from ``MODEL_NAME``. The head maps the hidden state
    at sequence position 0 to ``num_classes`` logits.

    Args:
        num_classes: Number of output classes (default 8, as before).
        dropout: Dropout probability inside the head (default 0.3, as before).
    """

    def __init__(self, num_classes=8, dropout=0.3):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(MODEL_NAME)
        hidden_size = self.backbone.config.hidden_size
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(logits, features)`` for a batch.

        ``features`` is the backbone's last hidden state at position 0 (the
        [CLS] slot); it is returned alongside the logits so callers can feed it
        to an auxiliary loss.
        """
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0]  # [CLS] token
        logits = self.fc(cls)
        return logits, cls


In [None]:
# Sanity check: (rows, columns) of the frame that the next cell trains on.
df.shape

(30567, 2)

In [None]:
# ========== Step 5. Hold-out split + training setup ==========

# Seed the global RNGs. Only the split below had a fixed generator; DataLoader
# shuffling, dropout and the classifier-head initialisation all draw from the
# global generators, so runs were not repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

all_questions = df["Question"].tolist()
all_labels = df["label"].tolist()
all_dataset = MathDataset(all_questions, all_labels)

# Simple split: 90% training / 10% validation.
# NOTE(review): `df` contains augmented rewrites of the training questions (see
# the augmentation cell), so a row-level random split can put a question in
# train and its rewrite in validation. Validation accuracy is then likely
# optimistic; consider splitting by source question instead.
train_size = int(0.9 * len(all_dataset))
val_size = len(all_dataset) - train_size
train_ds, val_ds = torch.utils.data.random_split(all_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(SEED))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

model = MathClassifier().to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
loss_cls = nn.CrossEntropyLoss(label_smoothing=0.1)
# Linear decay over the maximum possible number of optimisation steps, no warm-up.
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=EPOCHS * len(train_loader))

def supervised_contrastive_loss(features, labels, temperature=0.07):
    """Supervised contrastive loss over one batch of feature vectors.

    Features are L2-normalised; for every anchor, all other samples with the
    same label are positives and all other samples form the contrast set.
    Anchors without any positive contribute zero and are still counted in the
    batch mean.

    Args:
        features: 2-D tensor, one feature vector per row.
        labels: Tensor with one label per row of ``features``.
        temperature: Divisor applied to the cosine similarities (default 0.07).

    Returns:
        Scalar tensor with the mean loss over the batch. A batch with fewer
        than two samples has no pairs and yields zero.
    """
    batch_size = features.size(0)
    if batch_size < 2:
        # No pairs to contrast; return a zero that stays attached to the graph.
        return features.sum() * 0.0

    # Work on the device of the inputs rather than a global device constant.
    device = features.device
    features = nn.functional.normalize(features, dim=1)
    labels = labels.contiguous().view(-1, 1).to(device)

    # positive_mask[i, j] == 1 when samples i and j share a label.
    positive_mask = torch.eq(labels, labels.T).float()
    logits = torch.matmul(features, features.T) / temperature
    # Subtract the row-wise maximum before exponentiating. This leaves the
    # log-softmax unchanged but prevents overflow for small temperatures.
    logits = logits - logits.max(dim=1, keepdim=True).values.detach()

    # Exclude each anchor's similarity with itself from positives and contrasts.
    not_self = torch.ones_like(positive_mask) - torch.eye(batch_size, device=device)
    positive_mask = positive_mask * not_self
    exp_logits = torch.exp(logits) * not_self
    log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True) + 1e-12)
    # Average log-probability over each anchor's positives; the epsilon keeps
    # anchors without positives at 0 instead of dividing by zero.
    mean_log_prob_pos = (positive_mask * log_prob).sum(1) / (positive_mask.sum(1) + 1e-12)
    return -mean_log_prob_pos.mean()

# Best validation accuracy so far, and the number of consecutive epochs that
# failed to beat it.
best_acc = 0
patience_counter = 0

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    total_loss = 0
    preds = []
    trues = []
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1} Train")
    for batch in progress:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        logits, features = model(input_ids, attention_mask)
        # Total loss = cross-entropy + LAMBDA * supervised contrastive term.
        ce_term = loss_cls(logits, labels)
        contrastive_term = supervised_contrastive_loss(features, labels)
        loss = ce_term + LAMBDA * contrastive_term

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

        total_loss += loss.item()
        preds += torch.argmax(logits, dim=1).cpu().tolist()
        trues += labels.cpu().tolist()

    train_acc = accuracy_score(trues, preds)
    # total_loss is the sum of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1} Train Loss: {total_loss:.4f}, Acc: {train_acc:.4f}")

    # ---- validation pass ----
    model.eval()
    val_preds = []
    val_trues = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            logits, _ = model(input_ids, attention_mask)
            val_preds += torch.argmax(logits, dim=1).cpu().tolist()
            val_trues += labels.cpu().tolist()

    val_acc = accuracy_score(val_trues, val_preds)
    print(f"Epoch {epoch+1} Val Acc: {val_acc:.4f}")

    # ---- checkpointing / early stopping ----
    if not (val_acc > best_acc):
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping triggered.")
            break
    else:
        # New best: reset the patience counter and overwrite the checkpoint.
        best_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pt")


Epoch 1 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 1 Train Loss: 1199.2723, Acc: 0.7092
Epoch 1 Val Acc: 0.8413


Epoch 2 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 2 Train Loss: 856.8987, Acc: 0.8770
Epoch 2 Val Acc: 0.8947


Epoch 3 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 3 Train Loss: 734.9760, Acc: 0.9260
Epoch 3 Val Acc: 0.9339


Epoch 4 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 4 Train Loss: 665.4672, Acc: 0.9552
Epoch 4 Val Acc: 0.9467


Epoch 5 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 5 Train Loss: 633.7081, Acc: 0.9674
Epoch 5 Val Acc: 0.9522


Epoch 6 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 6 Train Loss: 612.2776, Acc: 0.9758
Epoch 6 Val Acc: 0.9552


Epoch 7 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 7 Train Loss: 593.4011, Acc: 0.9827
Epoch 7 Val Acc: 0.9627


Epoch 8 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 8 Train Loss: 588.5134, Acc: 0.9850
Epoch 8 Val Acc: 0.9598


Epoch 9 Train: 100%|██████████| 860/860 [03:14<00:00,  4.41it/s]


Epoch 9 Train Loss: 579.4239, Acc: 0.9883
Epoch 9 Val Acc: 0.9702


Epoch 10 Train: 100%|██████████| 860/860 [03:14<00:00,  4.41it/s]


Epoch 10 Train Loss: 572.7329, Acc: 0.9904
Epoch 10 Val Acc: 0.9679


Epoch 11 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 11 Train Loss: 569.3593, Acc: 0.9923
Epoch 11 Val Acc: 0.9738


Epoch 12 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 12 Train Loss: 565.1066, Acc: 0.9935
Epoch 12 Val Acc: 0.9621


Epoch 13 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 13 Train Loss: 564.3388, Acc: 0.9935
Epoch 13 Val Acc: 0.9660


Epoch 14 Train: 100%|██████████| 860/860 [03:15<00:00,  4.41it/s]


Epoch 14 Train Loss: 561.5693, Acc: 0.9944
Epoch 14 Val Acc: 0.9728
Early stopping triggered.


In [None]:
# ========== Step 8. Final prediction ==========
# NOTE(review): at this point in the notebook best_model.pt is the checkpoint
# written by the Step 5 loop. The Step 7 cell overwrites it later, and the last
# cell repeats this prediction and overwrites submission.csv.
print("🧠 Step 8: 使用增强模型预测 test.csv...")
# map_location keeps the load working when the checkpoint was saved on a
# different device than the one in use now (same as the pseudo-label cell).
model.load_state_dict(torch.load("/content/best_model.pt", map_location=DEVICE))
model.eval()

test_dataset = MathDataset(df_test['Question'].tolist())
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

final_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        outputs = model(input_ids, attention_mask)
        # The model may return (logits, features); keep only the logits.
        logits = outputs[0] if isinstance(outputs, tuple) else outputs

        final_preds.extend(logits.argmax(dim=1).cpu().tolist())

# The submission id is the row index of df_test.
submission = pd.DataFrame({"id": df_test.index, "label": final_preds})
submission.to_csv("submission.csv", index=False)
print("✅ 提交文件已生成 submission.csv")

🧠 Step 8: 使用增强模型预测 test.csv...


Predicting: 100%|██████████| 96/96 [00:07<00:00, 12.07it/s]

✅ 提交文件已生成 submission.csv





In [None]:
# ========== Step 6. Generate pseudo labels with best_model.pt ==========
# Confidence a test prediction must exceed to be kept as a pseudo label.
PSEUDO_LABEL_THRESHOLD = 0.95

# The message names the checkpoint that is actually loaded below.
print("🔍 Step 6: 使用 best_model.pt 生成伪标签...")
model.load_state_dict(torch.load("/content/best_model.pt", map_location=DEVICE))
model.eval()

pseudo_dataset = MathDataset(df_test['Question'].tolist(), labels=None)  # no labels for the test set
pseudo_loader = DataLoader(pseudo_dataset, batch_size=BATCH_SIZE)

final_preds = []
probs = []

with torch.no_grad():
    for batch in tqdm(pseudo_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        logits, _ = model(input_ids, attention_mask)  # model returns (logits, features)
        prob = torch.softmax(logits, dim=1)
        probs.extend(prob.cpu().tolist())
        final_preds.extend(prob.argmax(dim=1).cpu().tolist())

# Store the predicted class and its softmax confidence on the test frame, then
# keep only the confident rows.
df_test["label"] = final_preds
df_test["max_prob"] = [np.max(p) for p in probs]
filtered = df_test[df_test["max_prob"] > PSEUDO_LABEL_THRESHOLD]

# Append the confident pseudo-labelled test rows to the training frame.
aug_df = pd.concat([df, filtered[["Question", "label"]]], ignore_index=True)
print("✅ 伪标签生成完毕，增强后的训练集样本数:", len(aug_df))


🔍 Step 6: 使用 best_mathbert.pt 生成伪标签...


Predicting:   0%|          | 0/191 [00:00<?, ?it/s]

✅ 伪标签生成完毕，增强后的训练集样本数: 20400


In [None]:
# Number of test rows that passed the pseudo-label confidence filter (rows, columns).
filtered.shape

(22, 4)

In [None]:
# ========== Step 7. Retrain on the pseudo-label-augmented training set ==========
print("💪 Step 7: 使用伪标签增强重新训练...")

# NOTE(review): this 90/10 split is drawn over `aug_df`, whose length differs
# from the frame split in Step 5, so rows the loaded checkpoint was already
# trained on can presumably end up in this validation set. Treat the
# validation accuracy reported below as optimistic.
dataset = MathDataset(aug_df["Question"].tolist(), aug_df["label"].tolist())
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)


# Warm-start from the best checkpoint saved so far. map_location keeps the load
# working when the checkpoint was saved on a different device.
model.load_state_dict(torch.load("/content/best_model.pt", map_location=DEVICE))
loss_cls = nn.CrossEntropyLoss(label_smoothing=0.1)
# Fresh optimizer and linear schedule for the second stage, at a lower learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=EPOCHS * len(train_loader))

# Early-stopping state for the second training stage.
best_acc = 0
patience_counter = 0

for epoch in range(EPOCHS):
    # Training pass over the augmented training split.
    model.train()
    total_loss = 0
    preds, trues = [], []
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} Train")
    for batch in train_bar:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        logits, features = model(input_ids, attention_mask)
        # Cross-entropy plus the LAMBDA-weighted supervised contrastive term.
        classification_loss = loss_cls(logits, labels)
        contrastive_loss = supervised_contrastive_loss(features, labels)
        loss = classification_loss + LAMBDA * contrastive_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # per-batch learning-rate update

        total_loss += loss.item()
        batch_preds = logits.argmax(dim=1).cpu().tolist()
        preds.extend(batch_preds)
        trues.extend(labels.cpu().tolist())

    train_acc = accuracy_score(trues, preds)
    # The printed loss is the sum over all batches of the epoch.
    print(f"Epoch {epoch+1} Train Loss: {total_loss:.4f}, Acc: {train_acc:.4f}")

    # Validation pass.
    model.eval()
    val_preds, val_trues = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            logits, _ = model(input_ids, attention_mask)
            val_preds.extend(logits.argmax(dim=1).cpu().tolist())
            val_trues.extend(labels.cpu().tolist())

    val_acc = accuracy_score(val_trues, val_preds)
    print(f"Epoch {epoch+1} Val Acc: {val_acc:.4f}")

    # Keep the checkpoint with the best validation accuracy; stop after
    # PATIENCE consecutive epochs without improvement.
    improved = val_acc > best_acc
    if improved:
        best_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping triggered.")
            break


💪 Step 7: 使用伪标签增强重新训练...


Epoch 1 Train:   0%|          | 0/1148 [00:00<?, ?it/s]

Epoch 1 Train Loss: 745.9100, Acc: 0.9724
Epoch 1 Val Acc: 0.9794


Epoch 2 Train:   0%|          | 0/1148 [00:00<?, ?it/s]

Epoch 2 Train Loss: 717.0124, Acc: 0.9797
Epoch 2 Val Acc: 0.9770


Epoch 3 Train:   0%|          | 0/1148 [00:00<?, ?it/s]

Epoch 3 Train Loss: 708.3229, Acc: 0.9829
Epoch 3 Val Acc: 0.9863


Epoch 4 Train:   0%|          | 0/1148 [00:00<?, ?it/s]

Epoch 4 Train Loss: 705.8056, Acc: 0.9832
Epoch 4 Val Acc: 0.9882


Epoch 5 Train:   0%|          | 0/1148 [00:00<?, ?it/s]

Epoch 5 Train Loss: 691.6933, Acc: 0.9879
Epoch 5 Val Acc: 0.9833


Epoch 6 Train:   0%|          | 0/1148 [00:00<?, ?it/s]

Epoch 6 Train Loss: 689.0345, Acc: 0.9885
Epoch 6 Val Acc: 0.9877


Epoch 7 Train:   0%|          | 0/1148 [00:00<?, ?it/s]

Epoch 7 Train Loss: 677.4004, Acc: 0.9917
Epoch 7 Val Acc: 0.9863
Early stopping triggered.


In [None]:
# ========== Step 8. Final prediction ==========
print("🧠 Step 8: 使用增强模型预测 test.csv...")
# Load the best checkpoint on disk. map_location keeps the load working when
# the checkpoint was saved on a different device than the one in use now
# (same as the pseudo-label cell).
model.load_state_dict(torch.load("/content/best_model.pt", map_location=DEVICE))
model.eval()

test_dataset = MathDataset(df_test['Question'].tolist())
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

final_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        outputs = model(input_ids, attention_mask)
        # The model may return (logits, features); keep only the logits.
        logits = outputs[0] if isinstance(outputs, tuple) else outputs

        final_preds.extend(logits.argmax(dim=1).cpu().tolist())

# The submission id is the row index of df_test; this overwrites any earlier submission.csv.
submission = pd.DataFrame({"id": df_test.index, "label": final_preds})
submission.to_csv("submission.csv", index=False)
print("✅ 提交文件已生成 submission.csv")

🧠 Step 8: 使用增强模型预测 test.csv...


Predicting:   0%|          | 0/191 [00:00<?, ?it/s]

✅ 提交文件已生成 submission.csv
