In [None]:
# Generate data/paragraph.pkl: tokenized, English-filtered, de-duplicated paragraphs

In [None]:
import re
import pickle
import os

# Compiled once at import time; the filter below runs once per token of the corpus.
_ASCII_LETTERS_RE = re.compile(r'[a-zA-Z]+')


def is_english_regex(s):
    """Return True when *s* consists solely of ASCII letters (a-z, A-Z).

    Args:
        s: The token to classify.

    Returns:
        bool: True only if every character of *s* is an ASCII letter and
        *s* is non-empty; False otherwise.

    ``fullmatch`` is used instead of ``match('^...$')`` because ``$`` also
    matches just before a trailing newline, which made a token such as
    ``"abc\\n"`` count as English.
    """
    return _ASCII_LETTERS_RE.fullmatch(s) is not None

# Running tallies for the final report, plus the accumulated token lists.
total_lines_processed = 0
empty_after_filter_count = 0
all_processed_paragraphs = []

# The input is read from seven shards: filter-22-23_0.csv ... filter-22-23_6.csv.
for i in range(7):
    file_path = f'data/origin/filter-22-23_{i}.csv'
    print(f"正在处理文件: {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as shard:
            for raw_line in shard:
                # str.split() with no arguments ignores surrounding
                # whitespace, so a blank line produces an empty token list.
                tokens = raw_line.split()
                if not tokens:
                    continue

                total_lines_processed += 1

                # Keep every token that is not a pure ASCII-letter word.
                kept_tokens = [tok for tok in tokens if not is_english_regex(tok)]
                if kept_tokens:
                    all_processed_paragraphs.append(kept_tokens)
                else:
                    empty_after_filter_count += 1
    except FileNotFoundError:
        # A missing shard is reported and skipped; the other shards still load.
        print(f"警告：文件 {file_path} 未找到，已跳过。")

print("\n--- 所有文件处理完成，开始去重和统计 ---")

count_before_dedup = len(all_processed_paragraphs)

# De-duplicate while keeping the first occurrence of each paragraph in its
# original position. A plain set would also remove duplicates, but its
# iteration order depends on string hashing, which is randomized per
# interpreter run; the order written here becomes the row order of anything
# computed from paragraph.pkl, so it has to be reproducible.
# Lists are unhashable, hence the round trip through tuples.
final_unique_paragraphs = [
    list(p) for p in dict.fromkeys(tuple(p) for p in all_processed_paragraphs)
]

count_after_dedup = len(final_unique_paragraphs)

print("\n--- 统计结果 ---")
print(f"处理的原始总行数 (非空): {total_lines_processed}")
print(f"因过滤英文后变为空行而被移除的数量: {empty_after_filter_count}")
print(f"去重前的有效段落总数: {count_before_dedup}")
print(f"移除的重复段落数: {count_before_dedup - count_after_dedup}")
print(f"去重后的最终独立段落数: {count_after_dedup}")



output_dir = 'data/'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(output_dir, exist_ok=True)
output_pickle_path = os.path.join(output_dir, 'paragraph.pkl')

print(f"\n正在将 {count_after_dedup} 个独立段落保存为pickle文件: {output_pickle_path}")
with open(output_pickle_path, 'wb') as f:
    pickle.dump(final_unique_paragraphs, f)

print("保存完成。")

In [None]:
# Extract a document vector for every paragraph in paragraph.pkl

In [None]:
import os
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import time

# Pickle file read by extract_document_vectors(); each entry is joined with
# "".join(...) before encoding.
INPUT_PICKLE_PATH = 'data/paragraph.pkl'

# Destination of the embedding matrix written with np.save().
OUTPUT_VECTORS_PATH = 'data/doc_vectors.npy'

# Model identifier handed to SentenceTransformer(...).
MODEL_ID = 'BAAI/bge-large-zh-v1.5'

def extract_document_vectors():
    """Encode every document in INPUT_PICKLE_PATH and save the vectors.

    Steps: make sure the output directory exists, load the pickled token
    lists, join each list into one string without separators, encode all
    strings with the SentenceTransformer model MODEL_ID, and write the
    resulting array to OUTPUT_VECTORS_PATH with ``np.save``.

    Raises:
        FileNotFoundError: If INPUT_PICKLE_PATH does not exist.
    """
    target_dir = os.path.dirname(OUTPUT_VECTORS_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    if not os.path.exists(INPUT_PICKLE_PATH):
        raise FileNotFoundError(f"输入文件未找到，请检查路径: {INPUT_PICKLE_PATH}")

    print(f"--- 正在从 {INPUT_PICKLE_PATH} 加载预分词的文本... ---")
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles produced by this project.
    with open(INPUT_PICKLE_PATH, 'rb') as pickle_file:
        processed_texts = pickle.load(pickle_file)
    print(f"--- 成功加载 {len(processed_texts)} 篇文档。---")

    print("\n--- 正在将分词列表拼接成无空格的字符串... ---")
    # Glue the tokens of each document back together without any separator.
    pseudo_raw_documents = list(map("".join, processed_texts))
    print(f"--- 成功准备 {len(pseudo_raw_documents)} 个字符串用于编码。---")

    if pseudo_raw_documents:
        print(f"处理后示例文本: '{pseudo_raw_documents[0][:70]}...'")

    print(f"\n--- 正在加载模型: {MODEL_ID} ---")
    print("--- 首次运行会自动从Hugging Face Hub下载模型（约1.3GB），请耐心等待... ---")
    encoder = SentenceTransformer(MODEL_ID)
    print("--- 模型加载成功。---")

    print("\n--- 开始提取文档向量... ---")
    print(f"--- 这将处理 {len(pseudo_raw_documents)} 篇文档，可能需要较长时间... ---")
    encode_options = {"show_progress_bar": True, "batch_size": 32}
    start_time = time.time()
    document_vectors = encoder.encode(pseudo_raw_documents, **encode_options)
    end_time = time.time()
    print(f"\n--- 向量提取完成！耗时: {end_time - start_time:.2f} 秒 ---")

    print("\n--- 正在将向量保存到文件... ---")
    np.save(OUTPUT_VECTORS_PATH, document_vectors)
    print(f"成功将向量保存到: {OUTPUT_VECTORS_PATH}")
    print(f"向量的形状为: {document_vectors.shape}")
    print(f"这代表 {document_vectors.shape[0]} 篇文档，每篇由一个 {document_vectors.shape[1]} 维的向量表示。")


# Run the extraction only when this code is executed as the main program
# (not when it is imported).
if __name__ == '__main__':
    extract_document_vectors()

In [None]:
# CRL dimensionality reduction techniques

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Array loaded with np.load() in the main guard and passed to train_model().
INPUT_VECTORS_PATH = "data/doc_vectors.npy"   
# File that train_model() writes the trained state_dict to.
SAVE_MODEL_PATH    = "data/4-1/CRL/model/contrastive_model.pt"   

# Optimisation settings used by train_model(): number of passes over the
# data, DataLoader batch size, Adam learning rate, and the temperature
# passed to nt_xent_loss().
EPOCHS     = 20
BATCH_SIZE = 512
LR         = 1e-3
TEMPERATURE = 0.5

# Layer widths passed to ContrastiveModel(input_dim, hidden_dim, proj_dim).
# NOTE(review): INPUT_DIM presumably has to equal the width of the vectors
# in INPUT_VECTORS_PATH - confirm against the embedding model.
INPUT_DIM  = 1024   
HIDDEN_DIM = 512
PROJ_DIM   = 128


def augment(v, noise_scale=0.01, drop_prob=0.05):
    """Return a randomly perturbed copy of *v*.

    Two perturbations are applied in order:

    1. Element dropout: an element is kept where a uniform sample from
       ``torch.rand_like`` exceeds ``drop_prob`` and set to zero otherwise.
       Kept elements are not rescaled.
    2. Additive noise: standard-normal samples scaled by ``noise_scale``.

    Args:
        v: Tensor to perturb.
        noise_scale: Multiplier applied to the standard-normal noise.
        drop_prob: Threshold compared against the uniform samples.

    Returns:
        The perturbed tensor; *v* itself is not modified.
    """
    keep_mask = torch.rand_like(v).gt(drop_prob).float()
    sparse = v.mul(keep_mask)
    # Noise is sampled with the shape/dtype of the masked tensor.
    jitter = torch.randn_like(sparse).mul(noise_scale)
    return sparse.add(jitter)


class ContrastiveDataset(Dataset):
    """Dataset yielding two independently augmented views of each vector."""

    def __init__(self, vectors):
        """Store *vectors* as a float32 tensor (copied by ``torch.tensor``)."""
        self.vectors = torch.tensor(vectors, dtype=torch.float32)

    def __len__(self):
        """Return the number of stored vectors."""
        return len(self.vectors)

    def __getitem__(self, idx):
        """Return a pair of augmented views of the vector at *idx*.

        ``augment`` is called twice on the same vector, so the two views
        carry different random perturbations.
        """
        anchor = self.vectors[idx]
        first_view = augment(anchor)
        second_view = augment(anchor)
        return first_view, second_view


class ContrastiveModel(nn.Module):
    """Two-layer MLP whose outputs are L2-normalised along the last axis.

    Architecture: Linear(input_dim, hidden_dim) -> LayerNorm -> ReLU ->
    Linear(hidden_dim, proj_dim). The layers live in ``self.encoder`` (an
    ``nn.Sequential``), so parameter names are ``encoder.0.*``,
    ``encoder.1.*`` and ``encoder.3.*``.
    """

    def __init__(self, input_dim, hidden_dim, proj_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, proj_dim),
        ]
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Project *x* and scale each output vector to unit L2 norm."""
        return F.normalize(self.encoder(x), dim=-1)

def nt_xent_loss(z1, z2, temperature=0.5):
    """Compute the NT-Xent contrastive loss for two batches of paired views.

    Row ``i`` of *z1* and row ``i`` of *z2* form a positive pair; every
    other row of the concatenated batch acts as a negative.

    Args:
        z1: Tensor of shape (N, D), first view of each sample.
        z2: Tensor of shape (N, D), second view of each sample.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: cross-entropy averaged over all 2N rows.
    """
    n = z1.size(0)

    # Normalise once and use a single matmul for all pairwise cosine
    # similarities. Broadcasting cosine_similarity over
    # z.unsqueeze(1) / z.unsqueeze(0) would build a (2N, 2N, D)
    # intermediate instead of a (2N, 2N) result.
    z = F.normalize(torch.cat([z1, z2], dim=0), dim=-1)
    sim = (z @ z.t()) / temperature

    # A row must never pick itself. -inf removes the diagonal from the
    # softmax in every floating dtype (a large finite constant such as
    # -9e15 is not representable in float16).
    self_mask = torch.eye(2 * n, dtype=torch.bool, device=z.device)
    sim = sim.masked_fill(self_mask, float("-inf"))

    # The positive for row i (first half) is row i + N, and vice versa.
    idx = torch.arange(n, device=z.device)
    labels = torch.cat([idx + n, idx], dim=0)

    return F.cross_entropy(sim, labels)


def train_model(vectors):
    """Train a ContrastiveModel on *vectors* and save its state_dict.

    Each batch holds two augmented views per sample (see
    ContrastiveDataset); the model is optimised with Adam on the NT-Xent
    loss, with a cosine learning-rate schedule stepped once per batch.
    The trained weights are written to SAVE_MODEL_PATH.

    Args:
        vectors: Array-like of input vectors accepted by ContrastiveDataset.

    Raises:
        ValueError: If there are fewer than BATCH_SIZE samples, so that
            ``drop_last=True`` leaves no batch to train on.
    """
    import os  # local import: this cell does not import os at the top

    dataset = ContrastiveDataset(vectors)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    if len(loader) == 0:
        # Without this guard the run would only fail at the first epoch
        # summary with a ZeroDivisionError.
        raise ValueError(
            f"样本数 ({len(dataset)}) 少于 BATCH_SIZE ({BATCH_SIZE})，"
            "drop_last=True 导致没有可训练的批次。"
        )

    # Create the checkpoint directory up front so that a bad path fails
    # before training rather than after the last epoch.
    save_dir = os.path.dirname(SAVE_MODEL_PATH)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Fall back to CPU when no GPU is available instead of crashing in .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = ContrastiveModel(INPUT_DIM, HIDDEN_DIM, PROJ_DIM).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    # The scheduler is stepped per batch, so its period is the total number
    # of optimisation steps.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=len(loader)*EPOCHS
    )

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for x1, x2 in tqdm(loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            x1, x2 = x1.to(device), x2.to(device)
            z1, z2 = model(x1), model(x2)
            loss = nt_xent_loss(z1, z2, TEMPERATURE)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}: loss={total_loss/len(loader):.4f}")

    torch.save(model.state_dict(), SAVE_MODEL_PATH)
    print(f"模型已保存到 {SAVE_MODEL_PATH}")

# Entry point: load the vectors from INPUT_VECTORS_PATH and train on them.
if __name__ == "__main__":
    # numpy is imported here because this cell's import block does not include it.
    import numpy as np
    vectors = np.load(INPUT_VECTORS_PATH)
    train_model(vectors)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np
import os

# Vectors to reduce, the trained state_dict to load, and where the reduced
# vectors are written by run_inference().
INPUT_VECTORS_PATH = "data/doc_vectors.npy"
SAVED_MODEL_PATH   = "data/4-1/CRL/model/contrastive_model.pt"
OUTPUT_REDUCED_VECTORS_PATH = "data/4-1/CRL/reduced_vectors_128d.npy"

# Layer widths passed to ContrastiveModel; load_state_dict() requires them
# to match the shapes stored in SAVED_MODEL_PATH.
INPUT_DIM  = 1024
HIDDEN_DIM = 512
PROJ_DIM   = 128 
# Number of vectors per DataLoader batch in run_inference().
BATCH_SIZE = 1024 

class ContrastiveModel(nn.Module):
    """MLP encoder that returns unit-length projections.

    The layer stack (Linear -> LayerNorm -> ReLU -> Linear) is held in
    ``self.encoder`` so that parameter names are ``encoder.<index>.*``.
    """

    def __init__(self, input_dim, hidden_dim, proj_dim):
        super().__init__()
        stack = (
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, proj_dim),
        )
        self.encoder = nn.Sequential(*stack)

    def forward(self, x):
        """Encode *x* and L2-normalise the result along the last dimension."""
        projected = self.encoder(x)
        return F.normalize(projected, dim=-1)

def run_inference():
    """Project the vectors in INPUT_VECTORS_PATH with the trained model.

    Loads the state_dict from SAVED_MODEL_PATH into a ContrastiveModel,
    runs every input vector through it in batches of BATCH_SIZE without
    gradient tracking, and saves the concatenated outputs to
    OUTPUT_REDUCED_VECTORS_PATH. If either input file is missing, an error
    line is printed and the function returns without doing anything else.
    """
    # Bail out early when an input is missing.
    if not os.path.exists(INPUT_VECTORS_PATH):
        print(f"错误: 输入向量文件未找到 -> {INPUT_VECTORS_PATH}")
        return
    if not os.path.exists(SAVED_MODEL_PATH):
        print(f"错误: 训练好的模型文件未找到 -> {SAVED_MODEL_PATH}")
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"正在使用设备: {device}")

    print(f"正在从 {SAVED_MODEL_PATH} 加载模型...")
    model = ContrastiveModel(INPUT_DIM, HIDDEN_DIM, PROJ_DIM).to(device)
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # produced by this project.
    trained_weights = torch.load(SAVED_MODEL_PATH, map_location=device)
    model.load_state_dict(trained_weights)
    model.eval()
    print("模型加载成功，已切换到评估模式。")

    print(f"正在从 {INPUT_VECTORS_PATH} 加载原始向量...")
    original_vectors = np.load(INPUT_VECTORS_PATH)

    # shuffle=False keeps the output rows aligned with the input rows.
    batches = DataLoader(
        TensorDataset(torch.tensor(original_vectors, dtype=torch.float32)),
        batch_size=BATCH_SIZE,
        shuffle=False,
    )
    print(f"成功加载 {len(original_vectors)} 个向量，准备开始降维...")

    with torch.no_grad():
        reduced_chunks = [
            model(batch.to(device)).cpu().numpy()
            for (batch,) in tqdm(batches, desc="[降维中]")
        ]

    print("降维完成，正在整理并保存结果...")
    final_vectors = np.concatenate(reduced_chunks, axis=0)

    target_dir = os.path.dirname(OUTPUT_REDUCED_VECTORS_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    np.save(OUTPUT_REDUCED_VECTORS_PATH, final_vectors)

    print("\n操作成功！")
    print(f"原始向量形状: {original_vectors.shape}")
    print(f"降维后向量形状: {final_vectors.shape}")
    print(f"降维后的向量已保存到: {OUTPUT_REDUCED_VECTORS_PATH}")

# Run the dimensionality reduction only when executed as the main program.
if __name__ == "__main__":
    run_inference()

In [None]:
# PCA & UMAP: reduce the document vectors to 128 dimensions

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import umap
import os
import time

# Source vectors to reduce.
INPUT_VECTORS_PATH = "data/doc_vectors.npy"

# Output locations: one raw and one L2-normalised file per method, each in
# a PCA/ or UMAP/ sub-folder of OUTPUT_DIR.
OUTPUT_DIR = "data/4-1/"
OUTPUT_PCA_PATH = os.path.join(OUTPUT_DIR, "PCA/reduced_vectors_pca_128d.npy")
OUTPUT_PCA_NORM_PATH = os.path.join(OUTPUT_DIR, "PCA/reduced_vectors_pca_128d_norm.npy")
OUTPUT_UMAP_PATH = os.path.join(OUTPUT_DIR, "UMAP/reduced_vectors_umap_128d.npy")
OUTPUT_UMAP_NORM_PATH = os.path.join(OUTPUT_DIR, "UMAP/reduced_vectors_umap_128d_norm.npy")

# Reduction settings shared by reduce_with_pca() and reduce_with_umap().
TARGET_DIM = 128
UMAP_N_NEIGHBORS = 15
UMAP_MIN_DIST = 0.1
RANDOM_STATE = 42

os.makedirs(OUTPUT_DIR, exist_ok=True)
# np.save does not create missing directories, and every output path sits
# one level below OUTPUT_DIR, so the sub-folders are created here as well.
for _output_path in (
    OUTPUT_PCA_PATH,
    OUTPUT_PCA_NORM_PATH,
    OUTPUT_UMAP_PATH,
    OUTPUT_UMAP_NORM_PATH,
):
    os.makedirs(os.path.dirname(_output_path), exist_ok=True)

def normalize_l2_numpy(vectors: np.ndarray) -> np.ndarray:
    """Scale each row of *vectors* to unit L2 norm.

    Rows whose norm is exactly zero are divided by 1e-8 instead, which
    leaves them as zeros and avoids a division by zero.

    Args:
        vectors: 2-D array with one vector per row.

    Returns:
        A new array of the same shape; the input is not modified.
    """
    row_lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    safe_lengths = np.where(row_lengths == 0, 1e-8, row_lengths)
    return np.divide(vectors, safe_lengths)

def reduce_with_pca(vectors, target_dim):
    """Fit PCA on *vectors* and return them projected to *target_dim* dims.

    Prints the summed explained-variance ratio of the kept components and
    the wall-clock time spent in ``fit_transform``.

    Args:
        vectors: Input array handed to ``PCA.fit_transform``.
        target_dim: Number of principal components to keep.

    Returns:
        The array returned by ``PCA.fit_transform``.
    """
    print("\n--- 开始执行 PCA 降维 ---")

    projector = PCA(n_components=target_dim, random_state=RANDOM_STATE)
    start_time = time.time()
    projected = projector.fit_transform(vectors)
    end_time = time.time()

    explained_variance = projector.explained_variance_ratio_.sum()
    print(f"PCA 保留的总方差比例: {explained_variance:.4f}")
    print(f"PCA 降维完成，耗时: {end_time - start_time:.2f} 秒")

    return projected


def reduce_with_umap(vectors, target_dim):
    """Fit UMAP (cosine metric) on *vectors* and return the embedding.

    Uses the module-level UMAP_N_NEIGHBORS, UMAP_MIN_DIST and RANDOM_STATE
    settings, with verbose output enabled, and prints the elapsed time.

    Args:
        vectors: Input array handed to ``UMAP.fit_transform``.
        target_dim: Number of output dimensions.

    Returns:
        The array returned by ``UMAP.fit_transform``.
    """
    print("\n--- 开始执行 UMAP 降维 (使用 cosine 度量) ---")
    start_time = time.time()

    umap_settings = {
        "n_components": target_dim,
        "n_neighbors": UMAP_N_NEIGHBORS,
        "min_dist": UMAP_MIN_DIST,
        "metric": 'cosine',
        "random_state": RANDOM_STATE,
        "verbose": True,
    }
    embedding = umap.UMAP(**umap_settings).fit_transform(vectors)

    end_time = time.time()
    print(f"UMAP 降维完成，耗时: {end_time - start_time:.2f} 秒")

    return embedding

# Entry point: reduce the document vectors with PCA and UMAP, saving a raw
# and an L2-normalised copy of each result.
if __name__ == "__main__":
    if os.path.exists(INPUT_VECTORS_PATH):
        print(f"正在从 {INPUT_VECTORS_PATH} 加载原始向量...")
        original_vectors = np.load(INPUT_VECTORS_PATH).astype(np.float32)
        print(f"成功加载 {original_vectors.shape[0]} 个向量，原始维度为 {original_vectors.shape[1]}")

        # --- PCA: raw output, then the row-normalised copy ---
        pca_reduced_vectors = reduce_with_pca(original_vectors, TARGET_DIM)
        np.save(OUTPUT_PCA_PATH, pca_reduced_vectors)
        print(f"PCA (未归一化) 向量已保存到: {OUTPUT_PCA_PATH}")
        print(f"   -> 形状: {pca_reduced_vectors.shape}")

        print("   -> 正在对PCA输出进行L2归一化...")
        pca_reduced_vectors_norm = normalize_l2_numpy(pca_reduced_vectors)
        np.save(OUTPUT_PCA_NORM_PATH, pca_reduced_vectors_norm)
        print(f"PCA (L2归一化) 向量已保存到: {OUTPUT_PCA_NORM_PATH}")
        print(f"   -> 形状: {pca_reduced_vectors_norm.shape}")

        # --- UMAP: raw output, then the row-normalised copy ---
        umap_reduced_vectors = reduce_with_umap(original_vectors, TARGET_DIM)
        np.save(OUTPUT_UMAP_PATH, umap_reduced_vectors)
        print(f"\nUMAP (未归一化) 向量已保存到: {OUTPUT_UMAP_PATH}")
        print(f"   -> 形状: {umap_reduced_vectors.shape}")

        print("   -> 正在对UMAP输出进行L2归一化...")
        umap_reduced_vectors_norm = normalize_l2_numpy(umap_reduced_vectors)
        np.save(OUTPUT_UMAP_NORM_PATH, umap_reduced_vectors_norm)
        print(f"UMAP (L2归一化) 向量已保存到: {OUTPUT_UMAP_NORM_PATH}")
        print(f"   -> 形状: {umap_reduced_vectors_norm.shape}")

        print("\n所有降维任务已完成！")
    else:
        print(f"错误: 输入文件未找到 -> {INPUT_VECTORS_PATH}")

In [None]:
# PCA 64

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import os
import time

# Source vectors and the folder that receives the reduced output files.
INPUT_VECTORS_PATH = "data/doc_vectors.npy"
OUTPUT_DIR = "data/4-1/64/"

# Target dimensionalities to produce (one output file per entry) and the
# seed passed to PCA.
TARGET_DIMS_LIST = [64]
RANDOM_STATE = 42

# Output files are written directly into OUTPUT_DIR, so it must exist.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def normalize_l2_numpy(vectors: np.ndarray) -> np.ndarray:
    """Return *vectors* with every row divided by its L2 norm.

    A row with a norm of exactly zero is divided by 1e-8 instead, so it
    stays all-zero rather than producing NaN/inf.

    Args:
        vectors: 2-D array, one vector per row.

    Returns:
        A new array of the same shape.
    """
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    divisors = np.where(row_norms == 0, 1e-8, row_norms)
    return np.divide(vectors, divisors)

def reduce_with_pca(vectors, target_dim):
    """Project *vectors* onto their first *target_dim* principal components.

    Prints the target dimension, the summed explained-variance ratio and
    the wall-clock time spent in ``fit_transform``.

    Args:
        vectors: Input array handed to ``PCA.fit_transform``.
        target_dim: Number of principal components to keep.

    Returns:
        The array returned by ``PCA.fit_transform``.
    """
    print(f"\n--- 开始执行 PCA 降维至 {target_dim} 维 ---")

    projector = PCA(n_components=target_dim, random_state=RANDOM_STATE)
    start_time = time.time()
    projected = projector.fit_transform(vectors)
    end_time = time.time()

    explained_variance = projector.explained_variance_ratio_.sum()

    print(f"目标维度: {target_dim}")
    print(f"保留的总方差比例: {explained_variance:.4f}")
    print(f"PCA 降维完成，耗时: {end_time - start_time:.2f} 秒")

    return projected

# Entry point: for every dimension in TARGET_DIMS_LIST, run PCA, normalise
# the rows and save the result into OUTPUT_DIR.
if __name__ == "__main__":
    if os.path.exists(INPUT_VECTORS_PATH):
        print(f"正在从 {INPUT_VECTORS_PATH} 加载原始向量...")
        original_vectors = np.load(INPUT_VECTORS_PATH).astype(np.float32)
        print(f"成功加载 {original_vectors.shape[0]} 个向量，原始维度为 {original_vectors.shape[1]}")

        for dim in TARGET_DIMS_LIST:
            pca_reduced_vectors = reduce_with_pca(original_vectors, dim)

            print("   -> 正在对PCA输出进行L2归一化...")
            pca_reduced_vectors_norm = normalize_l2_numpy(pca_reduced_vectors)

            # The file name encodes the target dimension.
            output_filename = f"reduced_vectors_pca_{dim}d_norm.npy"
            output_path = os.path.join(OUTPUT_DIR, output_filename)
            np.save(output_path, pca_reduced_vectors_norm)

            print(f"PCA (L2归一化) 向量已保存到: {output_path}")
            print(f"   -> 形状: {pca_reduced_vectors_norm.shape}")

        print("\n所有维度的降维任务已全部完成！")
    else:
        print(f"错误: 输入文件未找到 -> {INPUT_VECTORS_PATH}")

In [None]:
# PCA 256

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import os
import time

# Source vectors to reduce.
INPUT_VECTORS_PATH = "data/doc_vectors.npy"

# Folder that receives the reduced output files.
OUTPUT_DIR = "data/4-1/256/"

# Target dimensionalities to produce (one output file per entry) and the
# seed passed to PCA.
TARGET_DIMS_LIST = [256]
RANDOM_STATE = 42

# Output files are written directly into OUTPUT_DIR, so it must exist.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def normalize_l2_numpy(vectors: np.ndarray) -> np.ndarray:
    """Normalise each row of *vectors* to unit Euclidean length.

    Zero-norm rows use 1e-8 as the divisor, so they remain zero vectors
    instead of triggering a division by zero.

    Args:
        vectors: 2-D array, one vector per row.

    Returns:
        A new array of the same shape.
    """
    magnitudes = np.linalg.norm(vectors, axis=1, keepdims=True)
    denominators = np.where(magnitudes == 0, 1e-8, magnitudes)
    return np.divide(vectors, denominators)

def reduce_with_pca(vectors, target_dim):
    """Reduce *vectors* to *target_dim* dimensions with PCA.

    Prints the target dimension, the summed explained-variance ratio of
    the kept components and the wall-clock time of ``fit_transform``.

    Args:
        vectors: Input array handed to ``PCA.fit_transform``.
        target_dim: Number of principal components to keep.

    Returns:
        The array returned by ``PCA.fit_transform``.
    """
    print(f"\n--- 开始执行 PCA 降维至 {target_dim} 维 ---")

    estimator = PCA(n_components=target_dim, random_state=RANDOM_STATE)
    start_time = time.time()
    low_dim_vectors = estimator.fit_transform(vectors)
    end_time = time.time()

    explained_variance = estimator.explained_variance_ratio_.sum()

    print(f"目标维度: {target_dim}")
    print(f"保留的总方差比例: {explained_variance:.4f}")
    print(f"PCA 降维完成，耗时: {end_time - start_time:.2f} 秒")

    return low_dim_vectors

# Entry point: for every dimension in TARGET_DIMS_LIST, run PCA, normalise
# the rows and save the result into OUTPUT_DIR.
if __name__ == "__main__":
    if os.path.exists(INPUT_VECTORS_PATH):
        print(f"正在从 {INPUT_VECTORS_PATH} 加载原始向量...")
        original_vectors = np.load(INPUT_VECTORS_PATH).astype(np.float32)
        print(f"成功加载 {original_vectors.shape[0]} 个向量，原始维度为 {original_vectors.shape[1]}")

        for dim in TARGET_DIMS_LIST:
            pca_reduced_vectors = reduce_with_pca(original_vectors, dim)

            print("   -> 正在对PCA输出进行L2归一化...")
            pca_reduced_vectors_norm = normalize_l2_numpy(pca_reduced_vectors)

            # The file name encodes the target dimension.
            output_filename = f"reduced_vectors_pca_{dim}d_norm.npy"
            output_path = os.path.join(OUTPUT_DIR, output_filename)
            np.save(output_path, pca_reduced_vectors_norm)

            print(f"PCA (L2归一化) 向量已保存到: {output_path}")
            print(f"   -> 形状: {pca_reduced_vectors_norm.shape}")

        print("\n所有维度的降维任务已全部完成！")
    else:
        print(f"错误: 输入文件未找到 -> {INPUT_VECTORS_PATH}")