In [None]:
!pip install torch transformers datasets scikit-learn scipy requests

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time
import os
from scipy.stats import spearmanr
import requests
import zipfile

# -------------------- 1. Environment setup --------------------
# Select the compute device once; the rest of the script reads `device`.
print("----- 1. 环境准备 -----")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
if device == "cuda":
    # Report the name and total memory of GPU 0.
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory available: {gpu_properties.total_memory / 1024**3:.2f} GB")

# -------------------- 2. Load the pretrained RWKV model and tokenizer --------------------
print("----- 2. 加载预训练 RWKV 模型和 Tokenizer -----")
model_name = "RWKV/v6-Finch-1B6-HF"
# The tokenizer must match the checkpoint: ids produced by an unrelated
# vocabulary (e.g. GPT-2 BPE) would index arbitrary rows of this model's
# embedding table. Per the model card, the repository ships its own tokenizer,
# loaded with trust_remote_code=True.
# NOTE(review): confirm against the model card for the pinned revision.
tokenizer_name = model_name

try:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
    # Padding to a fixed length needs a pad token; fall back to EOS only when
    # the tokenizer does not define one itself.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # Half precision is only worthwhile on GPU; use float32 on the CPU fallback.
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        trust_remote_code=True
    )
    model.to(device)
    model.eval()  # inference only: disable dropout and similar training behaviour
    print("RWKV 模型和 Tokenizer 使用 Transformers 库加载成功！")

except Exception as e:
    # Print guidance for the notebook user, then re-raise with the original traceback.
    print(f"Transformers 库加载 RWKV 模型失败: {e}")
    print("请检查 RWKV 模型是否需要使用官方库加载，并根据 RWKV 官方库的文档修改代码。")
    print("Tokenizer 可能也需要根据 RWKV 模型进行调整。")
    raise

# -------------------- 3. Dataset preparation --------------------
print("----- 3. 数据集准备 -----")
dataset_name, dataset_config_name = 'glue', 'mrpc'
dataset = load_dataset(dataset_name, dataset_config_name)

# The validation split is used in full; only the first 1000 training pairs are kept.
validation_dataset = dataset['validation']
train_dataset = dataset['train'].select(range(1000))

print(f"数据集 {dataset_name}/{dataset_config_name} 加载完成！")
print(f"训练集大小: {len(train_dataset)}, 验证集大小: {len(validation_dataset)}")

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides "sentence1" and "sentence2".

    Returns:
        The tokenizer output for the pairs, truncated and padded to 128 tokens.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

# Tokenize both splits in batched mode, training split first.
tokenized_train_dataset, tokenized_validation_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset)
)

print("数据集 Tokenization 完成！")

# -------------------- 4. RWKV sentence embedding function (selectable layer) --------------------
print("----- 4. RWKV 句子嵌入函数 (支持不同层) -----")
def get_rwkv_embedding(text, model, tokenizer, device="cpu", layer_index='last'):
    """Embed ``text`` by mean-pooling the model's hidden states over tokens.

    Args:
        text: Input passed straight to ``tokenizer`` (truncated to 128 tokens).
        model: Callable returning an object with a ``hidden_states`` sequence
            when invoked with ``output_hidden_states=True``.
        tokenizer: Callable producing the model inputs; its result must
            support ``.to(device)`` and ``**`` unpacking.
        device: Device the tokenized inputs are moved to.
        layer_index: Which hidden states to pool:
            ``'last'`` - the final entry of ``hidden_states``;
            ``'average'`` - element-wise mean of ``hidden_states[1:]``;
            ``'concat'`` - ``hidden_states[1:]`` concatenated on the feature axis;
            an ``int`` - ``hidden_states[layer_index]``.
            Any other value falls back to the final entry.

    Returns:
        A float32 NumPy array holding the pooled embedding (singleton
        dimensions squeezed away).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states

        # hidden_states[0] is skipped by 'average'/'concat'
        # (presumably the embedding output - TODO confirm for this model).
        if layer_index == 'average':
            layer_hidden_state = torch.stack(hidden_states[1:]).mean(dim=0)
        elif layer_index == 'concat':
            layer_hidden_state = torch.cat(hidden_states[1:], dim=-1)
        elif isinstance(layer_index, int):
            layer_hidden_state = hidden_states[layer_index]
        else:
            # 'last' and any unrecognised value both use the final entry.
            layer_hidden_state = hidden_states[-1]

        # Pool in float32 so half-precision activations do not lose accuracy.
        layer_hidden_state = layer_hidden_state.float()

        attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
        if attention_mask is None:
            sentence_embedding = layer_hidden_state.mean(dim=1)
        else:
            # Exclude padded positions so batched inputs of different lengths
            # are averaged over their real tokens only.
            weights = attention_mask.to(layer_hidden_state.dtype).unsqueeze(-1)
            token_counts = weights.sum(dim=1).clamp(min=1.0)
            sentence_embedding = (layer_hidden_state * weights).sum(dim=1) / token_counts

        sentence_embedding = sentence_embedding.squeeze()
    return sentence_embedding.cpu().numpy()

print("RWKV 句子嵌入函数定义完成 (支持不同层)！")

# -------------------- 5. Baseline sentence embedding (averaged GloVe vectors) - loaded manually --------------------
print("----- 5. 基线句子嵌入函数 (平均 GloVe 词向量) -----")
glove_file_path = "glove.6B.50d.txt"  # the file must already be uploaded to the Colab environment
glove_embeddings = {}

print(f"尝试加载 GloVe 词向量: {glove_file_path}")
try:
    with open(glove_file_path, encoding="utf8") as glove_file:
        for raw_line in glove_file:
            # Each line is "<word> <v1> <v2> ..." separated by whitespace.
            fields = raw_line.split()
            glove_embeddings[fields[0]] = np.asarray(fields[1:], "float32")
    print(f"GloVe 词向量加载完成，词汇量: {len(glove_embeddings)}")
except FileNotFoundError:
    print(f"GloVe 词向量文件未找到: {glove_file_path}. 请手动下载 glove.6B.50d.txt 并上传到 Colab 环境。基线方法将无法正常工作。")
    glove_embeddings = {}
except Exception as e:
    # Any other failure discards a partially loaded table.
    print(f"加载 GloVe 词向量文件失败: {e}")
    glove_embeddings = {}


def get_baseline_embedding(text, glove_embeddings):
    """Embed ``text`` as the mean of its tokens' word vectors.

    The text is lower-cased and split on whitespace. Tokens missing from
    ``glove_embeddings`` contribute a zero vector.

    Args:
        text: Sentence to embed.
        glove_embeddings: Mapping from word to a 1-D vector. All vectors are
            expected to share one length.

    Returns:
        A 1-D NumPy array. Its length is taken from the vectors in
        ``glove_embeddings`` (50 when the mapping is empty). An all-zero
        vector is returned when ``text`` has no tokens.
    """
    # Infer the dimensionality from the table instead of assuming 50, so the
    # function also works with other embedding sizes.
    if glove_embeddings:
        embedding_dim = len(next(iter(glove_embeddings.values())))
    else:
        embedding_dim = 50
    zero_vector = np.zeros(embedding_dim)

    tokens = text.lower().split()
    if not tokens:
        return zero_vector
    word_vectors = [glove_embeddings.get(token, zero_vector) for token in tokens]
    return np.mean(word_vectors, axis=0)

print("基线句子嵌入函数定义完成！")

# -------------------- 6. Semantic similarity function --------------------
print("----- 6. 语义相似度计算函数 -----")
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity between two 1-D embeddings.

    Each embedding is wrapped as a single-row matrix, so the pairwise result
    is 1x1 and its only entry is returned.
    """
    pairwise_scores = cosine_similarity([embedding1], [embedding2])
    return pairwise_scores[0][0]

print("余弦相似度计算函数定义完成！")

# -------------------- 7. Experiment run and evaluation (training set) - layer exploration --------------------
print("----- 7. 实验执行与评估 (训练集) - Layer Exploration -----")

# Hidden-state indices evaluated for RWKV.
layers_to_test = [1, 3, 5, 7, 9, 11]

# Per-layer RWKV results, keyed by layer index.
rwkv_spearman_train_layers = {}
rwkv_inference_times_train_layers = {}
gpu_memory_rwkv_train_layers = {}

# Per-pair baseline measurements.
baseline_similarities_train = []
baseline_inference_times_train = []
gpu_memory_baseline_train = []

# Training labels are gathered once, before the layer loop, and shared by
# every layer and by the baseline.
labels_train = [row['label'] for row in tokenized_train_dataset]


# CUDA peak-memory statistics only exist on a GPU; on the CPU fallback the
# torch.cuda calls would raise, so memory is recorded as 0 MB there.
track_gpu_memory = device == "cuda"

for layer_index in layers_to_test:
    print(f"--- Evaluating RWKV Layer: {layer_index} ---")
    # Fresh accumulators for every layer.
    rwkv_similarities_train_layer = []
    rwkv_inference_times_train_layer = []
    gpu_memory_rwkv_train_layer = []

    start_time_rwkv_train_layer = time.time()
    # Iterate the raw rows directly: each row is fetched once and the unused
    # tokenized copy is no longer looked up.
    for example in train_dataset:
        sentence1 = example['sentence1']
        sentence2 = example['sentence2']

        # RWKV - specific layer: embed both sentences and compare them.
        if track_gpu_memory:
            torch.cuda.reset_peak_memory_stats(0)
        start_inference_rwkv = time.time()
        rwkv_embedding1_layer = get_rwkv_embedding(sentence1, model, tokenizer, device, layer_index=layer_index)
        rwkv_embedding2_layer = get_rwkv_embedding(sentence2, model, tokenizer, device, layer_index=layer_index)
        rwkv_similarity_layer = calculate_similarity(rwkv_embedding1_layer, rwkv_embedding2_layer)
        end_inference_rwkv = time.time()

        rwkv_similarities_train_layer.append(rwkv_similarity_layer)
        rwkv_inference_times_train_layer.append(end_inference_rwkv - start_inference_rwkv)
        # Peak allocation for this pair, in MB.
        gpu_memory_rwkv_train_layer.append(
            torch.cuda.max_memory_allocated(0) / 1024**2 if track_gpu_memory else 0.0
        )

    end_time_rwkv_train_layer = time.time()
    total_rwkv_time_train_layer = end_time_rwkv_train_layer - start_time_rwkv_train_layer

    # Spearman correlation between this layer's similarities and the labels.
    spearman_layer_train, _ = spearmanr(rwkv_similarities_train_layer, labels_train)
    rwkv_spearman_train_layers[layer_index] = spearman_layer_train
    rwkv_inference_times_train_layers[layer_index] = np.mean(rwkv_inference_times_train_layer)
    gpu_memory_rwkv_train_layers[layer_index] = np.mean(gpu_memory_rwkv_train_layer)

    print(f"RWKV (Layer {layer_index}) 训练集推理时间: {total_rwkv_time_train_layer:.4f} 秒")
    print(f"Spearman 相关系数 (RWKV - Layer {layer_index}, 训练集): {spearman_layer_train:.4f}")


# Baseline (averaged GloVe vectors) on the training set.
# CUDA peak-memory statistics only exist on a GPU; on the CPU fallback the
# torch.cuda calls would raise, so memory is recorded as 0 MB there.
track_gpu_memory = device == "cuda"

start_time_baseline_train = time.time()
# Iterate the raw rows directly; the tokenized copy and the label were never
# used in this loop.
for example in train_dataset:
    sentence1 = example['sentence1']
    sentence2 = example['sentence2']

    if track_gpu_memory:
        torch.cuda.reset_peak_memory_stats(0)
    start_inference_baseline = time.time()
    baseline_embedding1 = get_baseline_embedding(sentence1, glove_embeddings)
    baseline_embedding2 = get_baseline_embedding(sentence2, glove_embeddings)
    baseline_similarity = calculate_similarity(baseline_embedding1, baseline_embedding2)
    end_inference_baseline = time.time()

    baseline_similarities_train.append(baseline_similarity)
    baseline_inference_times_train.append(end_inference_baseline - start_inference_baseline)
    # Peak allocation for this pair, in MB.
    gpu_memory_baseline_train.append(
        torch.cuda.max_memory_allocated(0) / 1024**2 if track_gpu_memory else 0.0
    )
end_time_baseline_train = time.time()
total_baseline_time_train = end_time_baseline_train - start_time_baseline_train

print(f"Baseline 训练集推理时间: {total_baseline_time_train:.4f} 秒")


# ----- 7. Experiment run and evaluation (validation set) - layer exploration -----
print("----- 7. 实验执行与评估 (验证集) - Layer Exploration -----")

# Per-layer RWKV results, keyed by layer index.
rwkv_spearman_val_layers = {}
rwkv_inference_times_val_layers = {}
gpu_memory_rwkv_val_layers = {}

# Per-pair baseline measurements.
baseline_similarities_val = []
baseline_inference_times_val = []
gpu_memory_baseline_val = []

# Validation labels are gathered once, before the layer loop, and shared by
# every layer and by the baseline.
labels_val = [row['label'] for row in tokenized_validation_dataset]


# Start of the whole validation sweep (all layers together).
start_time_rwkv_val = time.time()
# CUDA peak-memory statistics only exist on a GPU; on the CPU fallback the
# torch.cuda calls would raise, so memory is recorded as 0 MB there.
track_gpu_memory = device == "cuda"

for layer_index in layers_to_test:
    print(f"--- Evaluating RWKV Layer: {layer_index} ---")
    # Fresh accumulators for every layer.
    rwkv_similarities_val_layer = []
    rwkv_inference_times_val_layer = []
    gpu_memory_rwkv_val_layer = []

    start_time_rwkv_val_layer = time.time()
    # Iterate the raw rows directly: each row is fetched once and the unused
    # tokenized copy is no longer looked up.
    for example in validation_dataset:
        sentence1 = example['sentence1']
        sentence2 = example['sentence2']

        # RWKV - specific layer: embed both sentences and compare them.
        if track_gpu_memory:
            torch.cuda.reset_peak_memory_stats(0)
        start_inference_rwkv = time.time()
        rwkv_embedding1_layer = get_rwkv_embedding(sentence1, model, tokenizer, device, layer_index=layer_index)
        rwkv_embedding2_layer = get_rwkv_embedding(sentence2, model, tokenizer, device, layer_index=layer_index)
        rwkv_similarity_layer = calculate_similarity(rwkv_embedding1_layer, rwkv_embedding2_layer)
        end_inference_rwkv = time.time()

        rwkv_similarities_val_layer.append(rwkv_similarity_layer)
        rwkv_inference_times_val_layer.append(end_inference_rwkv - start_inference_rwkv)
        # Peak allocation for this pair, in MB.
        gpu_memory_rwkv_val_layer.append(
            torch.cuda.max_memory_allocated(0) / 1024**2 if track_gpu_memory else 0.0
        )

    end_time_rwkv_val_layer = time.time()
    # Measure from this layer's own start; measuring from the sweep start
    # would make every later layer include the time of the earlier ones.
    total_rwkv_time_val_layer = end_time_rwkv_val_layer - start_time_rwkv_val_layer

    # Spearman correlation between this layer's similarities and the labels.
    spearman_layer_val, _ = spearmanr(rwkv_similarities_val_layer, labels_val)
    rwkv_spearman_val_layers[layer_index] = spearman_layer_val
    rwkv_inference_times_val_layers[layer_index] = np.mean(rwkv_inference_times_val_layer)
    gpu_memory_rwkv_val_layers[layer_index] = np.mean(gpu_memory_rwkv_val_layer)

    print(f"RWKV (Layer {layer_index}) 验证集推理时间: {total_rwkv_time_val_layer:.4f} 秒")
    print(f"Spearman 相关系数 (RWKV - Layer {layer_index}, 验证集): {spearman_layer_val:.4f}")


# Baseline (averaged GloVe vectors) on the validation set.
# CUDA peak-memory statistics only exist on a GPU; on the CPU fallback the
# torch.cuda calls would raise, so memory is recorded as 0 MB there.
track_gpu_memory = device == "cuda"

start_time_baseline_val = time.time()
# Iterate the raw rows directly; the tokenized copy was never used here.
for example in validation_dataset:
    sentence1 = example['sentence1']
    sentence2 = example['sentence2']

    # Reset the peak counter so the reading below belongs to this pair and
    # not to earlier RWKV inference.
    if track_gpu_memory:
        torch.cuda.reset_peak_memory_stats(0)
    # Time each pair on its own; measuring from the loop start would make the
    # recorded per-pair times grow cumulatively.
    start_inference_baseline = time.time()
    baseline_embedding1 = get_baseline_embedding(sentence1, glove_embeddings)
    baseline_embedding2 = get_baseline_embedding(sentence2, glove_embeddings)
    baseline_similarity = calculate_similarity(baseline_embedding1, baseline_embedding2)
    end_inference_baseline = time.time()

    baseline_similarities_val.append(baseline_similarity)
    baseline_inference_times_val.append(end_inference_baseline - start_inference_baseline)
    # Peak allocation for this pair, in MB.
    gpu_memory_baseline_val.append(
        torch.cuda.max_memory_allocated(0) / 1024**2 if track_gpu_memory else 0.0
    )
end_time_baseline_val = time.time()
total_baseline_time_val = end_time_baseline_val - start_time_baseline_val


print(f"Baseline 验证集推理时间: {total_baseline_time_val:.4f} 秒")


# -------------------- 8. Result analysis and display --------------------
print("----- 8. 结果分析与展示 -----")


def _report_layer_metrics(split_name, spearman_by_layer, time_by_layer, memory_by_layer):
    """Print one summary line per evaluated RWKV layer for one dataset split."""
    for layer_index in layers_to_test:
        print(
            f"Spearman 相关系数 (RWKV - Layer {layer_index}, {split_name}): "
            f"{spearman_by_layer[layer_index]:.4f}, "
            f"Avg Inference Time: {time_by_layer[layer_index]:.4f}s, "
            f"Avg GPU Memory: {memory_by_layer[layer_index]:.2f}MB"
        )


print("\n----- 实验结果 (定量评估 - Spearman 相关系数) - Layer Exploration -----")
print("--- Training Set ---")
_report_layer_metrics(
    "训练集",
    rwkv_spearman_train_layers,
    rwkv_inference_times_train_layers,
    gpu_memory_rwkv_train_layers,
)

print("--- Validation Set ---")
_report_layer_metrics(
    "验证集",
    rwkv_spearman_val_layers,
    rwkv_inference_times_val_layers,
    gpu_memory_rwkv_val_layers,
)


# Baseline Spearman correlation against the same labels as the RWKV layers.
print("\n----- 实验结果 (定量评估 - Spearman 相关系数) - Baseline -----")
spearman_baseline_train, _ = spearmanr(baseline_similarities_train, labels_train)
spearman_baseline_val, _ = spearmanr(baseline_similarities_val, labels_val)
print(f"Spearman 相关系数 (Baseline, 训练集): {spearman_baseline_train:.4f}")
print(f"Spearman 相关系数 (Baseline, 验证集): {spearman_baseline_val:.4f}")


# Mean of the recorded baseline inference times.
print("\n----- 实验结果 (推理时间) -----")
mean_baseline_time_train = np.mean(baseline_inference_times_train)
mean_baseline_time_val = np.mean(baseline_inference_times_val)
print(f"Baseline 训练集平均推理时间 (每句): {mean_baseline_time_train:.4f} 秒")
print(f"Baseline 验证集平均推理时间 (每句): {mean_baseline_time_val:.4f} 秒")


# Mean of the recorded baseline GPU memory readings.
print("\n----- 实验结果 (GPU 内存占用 - 峰值) -----")
mean_baseline_memory_train = np.mean(gpu_memory_baseline_train)
mean_baseline_memory_val = np.mean(gpu_memory_baseline_val)
print(f"Baseline 训练集峰值 GPU 内存占用 (每句): {mean_baseline_memory_train:.2f} MB")
print(f"Baseline 验证集峰值 GPU 内存占用 (每句): {mean_baseline_memory_val:.2f} MB")


print("\n----- 实验完成 -----")
print("请分析实验结果，包括定量评估 (Spearman 相关系数) - Layer Exploration,  Baseline 的 Spearman 相关系数, 推理时间和 GPU 内存占用。")
print("可以根据结果，进一步分析不同 RWKV Layer 的性能差异，以及 RWKV 与 Baseline 的对比。")
print("根据实验结果，撰写论文的研究结果部分。")

----- 1. 环境准备 -----
Using device: cuda
GPU name: Tesla T4
GPU memory available: 14.74 GB
----- 2. 加载预训练 RWKV 模型和 Tokenizer -----
RWKV 模型和 Tokenizer 使用 Transformers 库加载成功！
----- 3. 数据集准备 -----
数据集 glue/mrpc 加载完成！
训练集大小: 1000, 验证集大小: 408
数据集 Tokenization 完成！
----- 4. RWKV 句子嵌入函数 (支持不同层) -----
RWKV 句子嵌入函数定义完成 (支持不同层)！
----- 5. 基线句子嵌入函数 (平均 GloVe 词向量) -----
尝试加载 GloVe 词向量: glove.6B.50d.txt
GloVe 词向量加载完成，词汇量: 400000
基线句子嵌入函数定义完成！
----- 6. 语义相似度计算函数 -----
余弦相似度计算函数定义完成！
----- 7. 实验执行与评估 (训练集) - Layer Exploration -----
--- Evaluating RWKV Layer: 1 ---
RWKV (Layer 1) 训练集推理时间: 337.4105 秒
Spearman 相关系数 (RWKV - Layer 1, 训练集): 0.2879
--- Evaluating RWKV Layer: 3 ---
RWKV (Layer 3) 训练集推理时间: 335.5789 秒
Spearman 相关系数 (RWKV - Layer 3, 训练集): 0.2766
--- Evaluating RWKV Layer: 5 ---
RWKV (Layer 5) 训练集推理时间: 334.9683 秒
Spearman 相关系数 (RWKV - Layer 5, 训练集): 0.2671
--- Evaluating RWKV Layer: 7 ---
RWKV (Layer 7) 训练集推理时间: 330.9278 秒
Spearman 相关系数 (RWKV - Layer 7, 训练集): 0.2491
--- Evaluating RWKV Layer: 9 ---
R