In [1]:
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
import faiss
import os
import torch

In [7]:
# Cell 2: 定义文件路径
# 请根据实际文件路径修改
item_file_path = "../Yelp/unique_items.jsonl"  # 原始 JSONL 文件路径
output_embedding_file = "../Yelp/embeddings.npy"  # 嵌入保存路径
output_metadata_file = "../Yelp/metadata.json"   # 元数据保存路径
output_faiss_file = "../Yelp/faiss_index.bin"    # FAISS 索引保存路径

In [8]:
# Cell 3: 定义 json2text 转换函数
def json2text(dataset, item):
    """将 JSON 对象转换为自然语言文本"""
    if dataset == "amazon":
        title = f"Title: {item.get('ItemName', '')}"
        categories = f"Categories: {', '.join(item.get('Categories', []))}"
        description = f"Description: {item.get('Description', '')}"
        text = "; ".join([title, categories, description])
    elif dataset == "yelp":
        title = f"BusinessName: {item.get('BusinessName', '')}"
        categories = f"Categories: {', '.join(item.get('Categories', []))}"
        description = f"Description: {item.get('Description', '')}"
        text = "; ".join([title, categories, description])
    return text

In [9]:

# Cell 4: 加载 JSONL 文件并转换为文本
def load_and_convert_to_text(file_path):
    """加载 JSONL 文件并将每条记录转换为文本"""
    items = []
    texts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Loading and converting items"):
            item = json.loads(line.strip())
            items.append(item)             # 保存原始 JSON 数据
            text = json2text("yelp", item)         # 转换为自然语言文本
            texts.append(text)
    return items, texts

# 加载并转换数据
items, texts = load_and_convert_to_text(item_file_path)
    

Loading and converting items: 2471it [00:00, 49834.47it/s]


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载本地模型并将其移动到 GPU（如果有）
model = SentenceTransformer('../../crs/tools/all-MiniLM-L6-v2')
model = model.to(device)  # 将模型移动到 GPU（如果可用）
    

In [11]:
# Cell 6: 生成嵌入
def generate_embeddings(texts, model):
    """生成嵌入向量，并添加 tqdm 进度条"""
    embeddings = []
    for text in tqdm(texts, desc="Generating embeddings"):
        embedding = model.encode([text])
        embeddings.append(embedding[0])  # 每次取出生成的嵌入
    return np.array(embeddings)

# 生成嵌入向量
embeddings = generate_embeddings(texts, model)

# 检查嵌入结果
print(f"Generated embeddings shape: {embeddings.shape}")

Generating embeddings: 100%|██████████| 2471/2471 [00:44<00:00, 55.43it/s] 

Generated embeddings shape: (2471, 384)





In [12]:
# Cell 7: 保存嵌入、元数据和构建 FAISS 索引
def save_all(embeddings, items, embedding_file, metadata_file, faiss_file):
    """保存嵌入、元数据，并构建和保存 FAISS 索引"""
    # 保存嵌入为 npy 文件
    np.save(embedding_file, embeddings)
    print(f"Embeddings saved to: {os.path.abspath(embedding_file)}")

    # 保存元数据为 JSON 文件
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(texts, f, ensure_ascii=False, indent=4)
    print(f"Metadata saved to: {os.path.abspath(metadata_file)}")

    # 构建 FAISS 索引并保存
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # 使用 L2 距离
    index.add(embeddings)                # 添加嵌入向量
    faiss.write_index(index, faiss_file)
    print(f"FAISS index saved to: {os.path.abspath(faiss_file)}")

# 保存嵌入、元数据，并构建索引
save_all(embeddings, items, output_embedding_file, output_metadata_file, output_faiss_file)

Embeddings saved to: /data/fxy/ecpo/raw_data/Yelp/embeddings.npy
Metadata saved to: /data/fxy/ecpo/raw_data/Yelp/metadata.json
FAISS index saved to: /data/fxy/ecpo/raw_data/Yelp/faiss_index.bin


In [13]:
# Cell 8: 验证保存结果
# 验证嵌入文件
loaded_embeddings = np.load(output_embedding_file)
print(f"Number of embeddings: {loaded_embeddings.shape[0]}")

# 验证元数据文件
with open(output_metadata_file, 'r', encoding='utf-8') as f:
    loaded_metadata = json.load(f)
print(f"Number of metadata entries: {len(loaded_metadata)}")

# 验证 FAISS 索引
index = faiss.read_index(output_faiss_file)
print(f"FAISS index loaded with {index.ntotal} entries")

Number of embeddings: 2471
Number of metadata entries: 2471
FAISS index loaded with 2471 entries


In [14]:
def load_index_and_metadata(index_file, metadata_file):
    # 加载索引
    index = faiss.read_index(index_file)
    # 加载元数据
    with open(metadata_file, 'r', encoding='utf-8') as f:
        items = json.load(f)
    return index, items

def query_index(query_text, index, items, model, top_k=5):
    # 对查询文本生成嵌入
    query_embedding = model.encode([query_text])
    # 查询索引
    distances, indices = index.search(np.array(query_embedding, dtype=np.float32), top_k)
    # 返回最近邻结果
    results = [{"Item": items[idx], "Distance": distances[0][i]} for i, idx in enumerate(indices[0])]
    return results


