In [None]:
# === 单元格 1: 导入、设置与路径定义 ===

import pandas as pd
import numpy as np
import os
import torch
import shutil
import gc
import joblib 
import lightgbm as lgb
from scipy.sparse import hstack, load_npz
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, 
    TrainingArguments
)
from peft import PeftModel
from datasets import Dataset

# Disable wandb by setting its WANDB_MODE environment variable.
os.environ["WANDB_MODE"] = "disabled"

# Memory clean-up helper
def clear_memory():
    """Run the garbage collector and, when CUDA is available, empty its cache.

    Returns:
        None.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

print("🚀 Kaggle 推理 Notebook 启动！")


def _child_of(parent):
    """Return a function that joins a single name onto ``parent``."""
    def join(name):
        return os.path.join(parent, name)
    return join


# --- 1. Competition data paths ---
COMP_DIR = "/kaggle/input/llm-classification-finetuning"
_in_comp = _child_of(COMP_DIR)
TEST_FILE = _in_comp("test.csv")
SAMPLE_FILE = _in_comp("sample_submission.csv")

# --- 2. Your dataset path ---
# (matches the llm-proj2-team8 name in your screenshot)
YOUR_DATASET_NAME = "llm-proj2-team8"
DATASET_DIR = os.path.join("/kaggle/input", YOUR_DATASET_NAME)
_in_dataset = _child_of(DATASET_DIR)

# --- 3. Path corrections ---
# (all paths below were corrected to match your screenshot)

# A. Trained model artifacts (inside the 'output/' subfolder)
OUTPUT_DIR = _in_dataset("output")
_in_output = _child_of(OUTPUT_DIR)
MODEL_A_PATH = _in_output("model_A_lgbm_ngram.txt")
MODEL_C_PATH = _in_output("model_C_lgbm_ngram.txt")
ADAPTER_B_PATH = _in_output("model_B_deberta_lora")
ADAPTER_D_PATH = _in_output("model_D_roberta_lora")
ADAPTER_E_PATH = _in_output("model_E_deberta_base_lora")

TEMP_B_PATH = _in_output("temp_B.npy")
TEMP_D_PATH = _in_output("temp_D.npy")
TEMP_E_PATH = _in_output("temp_E.npy")
VECTORIZER_PATH = _in_output("vectorizer_ngram.joblib")

# Please confirm: this assumes the ngram version was uploaded; if it was not,
# change the file name to 'ensemble_weights_5model.npy'.
WEIGHTS_PATH = _in_output("ensemble_weights_5model_ngram.npy")

# B. Base model paths (inside the 'model/' subfolder)
BASE_MODEL_DIR = _in_dataset("model")
_in_base_models = _child_of(BASE_MODEL_DIR)
BASE_MINILM_PATH = _in_base_models("sentencetransformersallminilml6v2")
BASE_E5_PATH = _in_base_models("e5-small-v2")
BASE_B_PATH = _in_base_models("deberta-v3-small")
BASE_D_PATH = _in_base_models("roberta-transformers-pytorch")
BASE_E_PATH = _in_base_models("deberta-v3-base")

print("✅ 所有路径定义完毕。")

In [None]:
# === Cell 2: Load data and feature engineering (LGBM A/C) ===

print("⏳ 正在加载 test.csv...")
# Read the competition test set and the sample submission in one pass.
test_df, sample_df = (
    pd.read_csv(csv_path) for csv_path in (TEST_FILE, SAMPLE_FILE)
)

print("⏳ 正在为 LGBM A/C 进行特征工程...")

# 3.1 Basic bias features
def create_base_features(df):
    """Add the text, length and lexical-diversity columns to ``df`` in place.

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b``
            columns.

    Returns:
        The same ``df`` object, extended with ``text_a``, ``text_b``,
        ``combined_for_embedding``, ``resp_a_len``, ``resp_b_len``,
        ``len_diff``, ``len_ratio``, ``lexical_a``, ``lexical_b`` and
        ``lexical_diff``.
    """
    def unique_token_ratio(value):
        # Share of distinct whitespace-separated tokens; the 1e-6 keeps the
        # division defined for an empty string.
        tokens = str(value).split()
        return len(set(tokens)) / (len(tokens) + 1e-6)

    prompt = df['prompt']
    answer_a = df['response_a']
    answer_b = df['response_b']

    # Prompt + response texts, and the pair text that is fed to the embedders.
    df['text_a'] = prompt + " " + answer_a
    df['text_b'] = prompt + " " + answer_b
    df['combined_for_embedding'] = df['text_a'] + " [SEP] " + df['text_b']

    # Character-length features.
    df["resp_a_len"] = answer_a.str.len()
    df["resp_b_len"] = answer_b.str.len()
    df["len_diff"] = df["resp_a_len"] - df["resp_b_len"]
    df["len_ratio"] = df["resp_a_len"] / (df["resp_b_len"] + 1e-6)

    # Lexical-diversity features.
    df["lexical_a"] = answer_a.apply(unique_token_ratio)
    df["lexical_b"] = answer_b.apply(unique_token_ratio)
    df["lexical_diff"] = df["lexical_a"] - df["lexical_b"]
    return df

test_df = create_base_features(test_df)
print("  ...基础偏置特征完成。")

# 3.2 Embeddings and cosine similarity
print("  ...正在生成 MiniLM 嵌入 (用于 A 和 4号特征)")
# Use the GPU when one is visible, otherwise fall back to CPU instead of
# failing on a hard-coded 'cuda' device.
model_minilm = SentenceTransformer(
    BASE_MINILM_PATH,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
test_emb_minilm = model_minilm.encode(
    test_df['combined_for_embedding'].tolist(),
    show_progress_bar=True, batch_size=128, convert_to_numpy=True
)
resp_a_emb_test = model_minilm.encode(test_df['response_a'].tolist(), batch_size=128)
resp_b_emb_test = model_minilm.encode(test_df['response_b'].tolist(), batch_size=128)

# Row-wise cosine similarity between the two response embeddings, computed in
# one vectorised pass instead of one sklearn call per row. Zero-norm rows get a
# norm of 1 (similarity 0), mirroring sklearn's cosine_similarity; results
# match the per-row version up to floating-point rounding.
norm_a = np.linalg.norm(resp_a_emb_test, axis=1)
norm_b = np.linalg.norm(resp_b_emb_test, axis=1)
norm_a[norm_a == 0] = 1.0
norm_b[norm_b == 0] = 1.0
test_df['cosine_similarity'] = (
    np.einsum('ij,ij->i', resp_a_emb_test, resp_b_emb_test) / (norm_a * norm_b)
)
del model_minilm, resp_a_emb_test, resp_b_emb_test, norm_a, norm_b; clear_memory()
print("  ...MiniLM 嵌入完成。")

print("  ...正在生成 E5 嵌入 (用于 C)")
# Use the GPU when one is visible, otherwise fall back to CPU instead of
# failing on a hard-coded 'cuda' device.
model_e5 = SentenceTransformer(
    BASE_E5_PATH,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
# Embed the combined "prompt + response A [SEP] prompt + response B" text.
test_emb_e5 = model_e5.encode(
    test_df["combined_for_embedding"].tolist(),
    batch_size=128, show_progress_bar=True, convert_to_numpy=True
)
# Release the encoder before the next model is loaded.
del model_e5; clear_memory()
print("  ...E5 嵌入完成。")

# 3.3 N-gram features
print("  ...正在生成 N-gram 差异特征")
# NOTE: joblib.load unpickles the file; only load vectorizers from a trusted source.
vectorizer = joblib.load(VECTORIZER_PATH)
test_ngram_a = vectorizer.transform(test_df['response_a'].astype(str))
test_ngram_b = vectorizer.transform(test_df['response_b'].astype(str))
test_ngram_diff = (test_ngram_a - test_ngram_b)
# Only the difference matrix is used below; release the vectorizer and the
# per-response matrices like every other intermediate in this notebook.
del vectorizer, test_ngram_a, test_ngram_b
print("  ...N-gram 特征完成。")

# 3.4 Stack the features
# Missing values in the four hand-crafted features are replaced by 0.
all_4_features_test = test_df[["len_diff", "len_ratio", "lexical_diff", "cosine_similarity"]].fillna(0).values

# Model A uses the MiniLM embeddings, model C the E5 embeddings; both share
# the four hand-crafted features and the n-gram difference block.
X_test_A = hstack([test_emb_minilm, all_4_features_test, test_ngram_diff]).tocsr()
X_test_C = hstack([test_emb_e5, all_4_features_test, test_ngram_diff]).tocsr()
print("  ...特征堆叠完成。")

del test_emb_minilm, test_emb_e5, all_4_features_test, test_ngram_diff; clear_memory()
print("✅ LGBM A/C 特征工程完毕。")

In [None]:
# === Cell 3: LGBM A/C inference ===

def _lgbm_predict(model_file, features):
    """Load the LightGBM booster saved at ``model_file`` and predict on ``features``."""
    return lgb.Booster(model_file=model_file).predict(features)


print("⏳ 正在使用 LGBM A/C 进行推理...")
preds_A = _lgbm_predict(MODEL_A_PATH, X_test_A)
preds_C = _lgbm_predict(MODEL_C_PATH, X_test_C)

# The feature matrices are no longer needed once both predictions exist.
del X_test_A, X_test_C
clear_memory()
print("✅ LGBM A/C 推理完毕。")

In [None]:
# === Cell 4: Transformer B/D/E inference ===

print("⏳ 正在为 Transformers B/D/E 准备数据...")
test_dataset = Dataset.from_pandas(test_df[['id', 'prompt', 'response_a', 'response_b']])

# Half-precision evaluation is only requested when CUDA is available, so the
# notebook still runs (in full precision) on a machine without a GPU.
_use_fp16 = torch.cuda.is_available()
training_args = TrainingArguments(
    output_dir="./infer_results",
    per_device_eval_batch_size=16,
    dataloader_num_workers=0,
    fp16=_use_fp16,
    fp16_full_eval=_use_fp16,
    report_to=[]
)


def _predict_lora_logits(tag, label, base_path, adapter_path, dataset, args, max_length=256):
    """Run one LoRA-adapted sequence classifier over ``dataset`` and return its logits.

    Args:
        tag: Short model identifier used in progress messages (e.g. ``"B"``).
        label: Human-readable base-model name used in progress messages.
        base_path: Local directory of the base model and its tokenizer.
        adapter_path: Local directory of the LoRA adapter.
        dataset: ``datasets.Dataset`` with ``prompt``, ``response_a`` and
            ``response_b`` columns.
        args: ``TrainingArguments`` used to build the ``Trainer``.
        max_length: Token length every example is truncated/padded to.

    Returns:
        The ``predictions`` field of ``Trainer.predict`` (the raw logits).
    """
    print(f"  ...正在使用 LoRA {tag} ({label}) 进行推理")
    tokenizer = AutoTokenizer.from_pretrained(base_path, local_files_only=True)

    def preprocess(examples):
        # Build the single-sequence text for each (prompt, A, B) triple.
        texts = [
            f"问题: {p} [SEP] A: {a} [SEP] B: {b}"
            for p, a, b in zip(examples["prompt"], examples["response_a"], examples["response_b"])
        ]
        return tokenizer(texts, truncation=True, padding="max_length", max_length=max_length)

    tokenized = dataset.map(preprocess, batched=True, desc=f"Tokenizing {tag}")

    base_model = AutoModelForSequenceClassification.from_pretrained(
        base_path, num_labels=3, local_files_only=True
    )
    model = PeftModel.from_pretrained(base_model, adapter_path)
    # NOTE(review): newer transformers releases appear to rename `tokenizer=`
    # to `processing_class=` - confirm against the installed version.
    trainer = Trainer(model=model, args=args, tokenizer=tokenizer)
    logits = trainer.predict(tokenized).predictions

    # Drop the model objects before emptying the CUDA cache so their memory
    # can be reclaimed before the next model is loaded.
    del base_model, model, trainer, tokenized
    clear_memory()
    print(f"  ...模型 {tag} 推理完毕。")
    return logits


# --- Model B (DeBERTa-small) ---
logits_B = _predict_lora_logits(
    "B", "DeBERTa-small", BASE_B_PATH, ADAPTER_B_PATH, test_dataset, training_args
)

# --- Model D (RoBERTa-base) ---
logits_D = _predict_lora_logits(
    "D", "RoBERTa-base", BASE_D_PATH, ADAPTER_D_PATH, test_dataset, training_args
)

# --- Model E (DeBERTa-base) ---
logits_E = _predict_lora_logits(
    "E", "DeBERTa-base", BASE_E_PATH, ADAPTER_E_PATH, test_dataset, training_args
)

print("✅ Transformer B/D/E 推理完毕。")

In [None]:
# === Cell 5: Calibrate, ensemble and write submission.csv ===

print("⏳ 正在校准和集成所有 5 个模型...")

def temperature_scale(logits, T):
    """Convert logits to probabilities with temperature scaling.

    The logits are promoted to float64 before the division and softmax, so
    half-precision logits are neither softmaxed in float16 nor rejected by
    CPU builds without a half-precision softmax.

    Args:
        logits: Array-like of logits; softmax is taken over the last axis.
        T: Temperature (scalar or broadcastable array); must be positive.

    Returns:
        A float64 ``numpy.ndarray`` of probabilities with the shape of ``logits``.

    Raises:
        ValueError: If any temperature is zero or negative.
    """
    temperature = np.asarray(T, dtype=np.float64)
    if np.any(temperature <= 0):
        raise ValueError(f"temperature must be positive, got {T!r}")
    logits_T = np.asarray(logits, dtype=np.float64) / temperature
    return torch.softmax(torch.as_tensor(logits_T), dim=-1).numpy()

# Load the temperatures (first element of each saved array)
T_B = np.load(TEMP_B_PATH)[0]
T_D = np.load(TEMP_D_PATH)[0]
T_E = np.load(TEMP_E_PATH)[0]
print(f"  ...已加载温度: T_B={T_B:.3f}, T_D={T_D:.3f}, T_E={T_E:.3f}")

# Calibrate the transformer logits
preds_B = temperature_scale(logits_B, T_B)
preds_D = temperature_scale(logits_D, T_D)
preds_E = temperature_scale(logits_E, T_E)
print("  ...B, D, E 校准完毕。")

# Load the ensemble weights
W = np.load(WEIGHTS_PATH) # [wA, wB, wC, wD, wE]
# Fail early when the weight file does not hold exactly one weight per model,
# rather than silently ignoring extras or failing on a later index.
if W.shape != (5,):
    raise ValueError(f"expected 5 ensemble weights, got array of shape {W.shape}")
print(f"  ...已加载 N-gram 5 模型权重:")
print(f"    A-Ng: {W[0]:.4f}, B: {W[1]:.4f}, C-Ng: {W[2]:.4f}, D: {W[3]:.4f}, E: {W[4]:.4f}")

# Final ensemble (per the original author, preds_A and preds_C are already
# probabilities and need no calibration)
final_preds = (
    (preds_A * W[0]) + (preds_B * W[1]) +
    (preds_C * W[2]) + (preds_D * W[3]) +
    (preds_E * W[4])
)
print("  ...集成完毕。")

# === 7. Create the submission file ===
print("⏳ 正在创建 submission.csv...")
prob_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
submission_df = pd.DataFrame(final_preds, columns=prob_columns)
# Attach ids positionally so the result does not depend on test_df's index.
submission_df.insert(0, 'id', test_df['id'].to_numpy())

# Clip first to keep every probability away from 0 and 1 (avoids LogLoss
# errors), then renormalise so each row still sums to 1. Doing it in the
# opposite order lets the clip break the sum-to-1 guarantee.
clipped = submission_df[prob_columns].clip(1e-7, 1 - 1e-7)
submission_df[prob_columns] = clipped.div(clipped.sum(axis=1), axis=0)

submission_df.to_csv('submission.csv', index=False)

print("✅ 提交文件创建成功!")
print(submission_df.head())