# 主要用来测试FAISS的速度

In [1]:
from llama_index.core import (
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import Document

In [21]:
import json
data_path = "/root/ZeroKaraNoRAG/llamaindex_rag/data/train_dataset_single.json"

# Collected Documents, one per record of the JSON file.
split_docs = []

# Read the whole dataset into memory.
with open(data_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Each record contributes a single Document whose text is the first
# conversation turn's input (question) and output (answer), separated by a newline.
for record in json_data:
    first_turn = record['conversation'][0]
    question = first_turn['input']
    # Append "的做法" when the question does not already contain "做".
    if "做" not in question:
        question = question + "的做法"
    answer = first_turn['output']
    split_docs.append(Document(text="\n".join((question, answer))))

In [29]:
from llama_index.core.node_parser import SentenceSplitter
# Split every Document in split_docs into nodes, using a SentenceSplitter
# constructed with its default arguments.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(split_docs)

In [30]:
# Display how many nodes the splitter produced.
len(nodes)

1036812

准备嵌入模型

In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Set the global default embedding model to a HuggingFace embedding loaded from
# the relative path "models/bce-embedding-base_v1" (resolved against the
# kernel's working directory).
Settings.embed_model = HuggingFaceEmbedding(model_name="models/bce-embedding-base_v1")

https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/

https://github.com/run-llama/llama_index/issues/9416

https://docs.llamaindex.ai/en/stable/examples/vector_stores/FaissIndexDemo/

In [None]:
import faiss
# Build the FAISS index from the "HNSW64" factory string over 768-dimensional
# vectors (the embedding dimension of the bce model used here).
#
# The metric is inner product rather than L2: FaissVectorStore passes the raw
# FAISS scores on as node similarities, and the query-engine cell filters with
# SimilarityPostprocessor(similarity_cutoff=0.3), which drops every node whose
# score is *below* the cutoff. With METRIC_L2 the scores are distances
# (smaller = closer), so that filter would discard the nearest neighbours and
# keep the far ones. With inner product, a larger score means a closer match.
# NOTE(review): this assumes the embeddings are unit-normalised (believed to be
# the HuggingFaceEmbedding default), so inner product equals cosine similarity
# -- confirm. An index already persisted with METRIC_L2 must be rebuilt.
faiss_index = faiss.index_factory(768, "HNSW64", faiss.METRIC_INNER_PRODUCT)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embed all nodes and insert them into the FAISS-backed index, in batches of
# 10240 nodes, with a progress bar.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    show_progress=True,
    insert_batch_size=10240,
)

In [32]:
# Persist the index's storage context to ./storage -- the directory the
# load-from-disk cell reads. The directory is passed explicitly so the save and
# load paths stay visibly in sync instead of relying on the library default.
index.storage_context.persist(persist_dir="./storage")

In [63]:
# Load the index back from disk: restore the FAISS vector store from
# ./storage, build a storage context around it that reads the remaining
# persisted data from the same directory, then rebuild the index object.
# This rebinds vector_store, storage_context and index.
vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
index = load_index_from_storage(storage_context=storage_context)

准备本地大模型

In [64]:
from llama_index.core import PromptTemplate
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

def completion_to_prompt(completion):
    """Wrap a bare completion request in the chat template.

    The result is an empty system turn, the completion as the user turn, and
    an opened assistant turn for the model to continue. The layout must
    strictly match the model's chat template.
    """
    system_turn = "<|im_start|>system\n<|im_end|>\n"
    user_turn = "<|im_start|>user\n{}<|im_end|>\n".format(completion)
    assistant_open = "<|im_start|>assistant\n"
    return system_turn + user_turn + assistant_open

def messages_to_prompt(messages):
    """Render a sequence of chat messages as a single prompt string.

    Each message whose ``role`` equals "system", "user" or "assistant" becomes
    one ``<|im_start|>{role}\\n{content}<|im_end|>\\n`` segment; messages with
    any other role are skipped. If the rendered text does not begin with a
    system segment, an empty one is prepended. The prompt always ends with an
    opened assistant turn.
    """
    recognised_roles = ("system", "user", "assistant")
    segments = []
    for message in messages:
        # Match by equality against the literal role names, so the tag text
        # always comes from the literal and never from formatting message.role.
        for role in recognised_roles:
            if message.role == role:
                segments.append(f"<|im_start|>{role}\n{message.content}<|im_end|>\n")
                break

    rendered = "".join(segments)

    # Guarantee a leading system turn, even if it is empty.
    if not rendered.startswith("<|im_start|>system\n"):
        rendered = "<|im_start|>system\n<|im_end|>\n" + rendered

    return rendered + "<|im_start|>assistant\n"




# Install a local HuggingFace chat model as the global default LLM. The model
# and the tokenizer are loaded from the same directory, and the two prompt
# helpers defined in this cell supply the chat-template formatting.
Settings.llm = HuggingFaceLLM(
    # Where to load from.
    model_name="/root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-7b",
    tokenizer_name="/root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-7b",
    device_map="auto",
    # Original author's note: these can only be set this way; they are passed
    # back to the original HuggingFace interface.
    model_kwargs={
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
        "do_sample": True,
    },
    tokenizer_kwargs={"trust_remote_code": True},
    # Generation limits and sampling settings.
    context_window=2048,
    max_new_tokens=512,
    generate_kwargs=dict(temperature=0.7, top_k=10, top_p=0.75),
    # Prompt formatting hooks.
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [65]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor


# Retriever over the loaded index: fetch the 25 top-scoring nodes per query.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=25,
)
# Response synthesizer built with the library defaults.
response_synthesizer = get_response_synthesizer()

# Query engine = retriever -> node post-processing -> response synthesis.
# NOTE(review): the 0.3 cutoff is compared against whatever score the vector
# store reports for each node; confirm that for this index's metric a higher
# score means a closer match, otherwise the filter keeps the wrong nodes.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.3)],
)


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs


In [66]:
# Run a single question through the query engine and print the response.
response = query_engine.query("如何制作牛排比较好呢")
print(response)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs




In [None]:
# Ask the same question again, this time with an extra instruction in the
# query text asking for emoji in the answer, and print the response.
response = query_engine.query("如何制作牛排比较好呢，请在回答的同时加入一些emoji来增加感情色彩")
print(response)