In [1]:
import getpass
import os

from datasets import Dataset
from langchain.text_splitter import TokenTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai import OpenAIEmbeddings
from tqdm import tqdm, trange

## 0. 数据集样本
### 0.0 加载数据集

In [2]:
# Location of the patent dataset on disk. The parent directory can be overridden
# with the PATENT_DATA_DIR environment variable so the notebook is not tied to
# one machine; when the variable is unset the original path is used unchanged.
patent_dataset_path = os.path.join(
    os.environ.get('PATENT_DATA_DIR', '/Users/xiaoen/Documents/科研/论文/GraphRAG/Code/DataDeal'),
    'PatentDataset',
)
patent_data = Dataset.load_from_disk(patent_dataset_path)
patent_data

Dataset({
    features: ['Publication Number', 'Title', 'Abstract', 'Claims', 'Publication Date', 'CPC', 'Novelty', 'Purpose', 'Advantage', 'Patentee', 'Inventor', 'UPC', 'Cited Number', 'Citing Number', 'Family Number', 'Strategic Importance', 'Field Importance', 'Comprehensive Importance'],
    num_rows: 55120
})

In [3]:
# Location of the CPC information dataset on disk. The parent directory can be
# overridden with the PATENT_DATA_DIR environment variable; when the variable is
# unset the original path is used unchanged.
cpc_dataset_path = os.path.join(
    os.environ.get('PATENT_DATA_DIR', '/Users/xiaoen/Documents/科研/论文/GraphRAG/Code/DataDeal'),
    'CPC_info',
)
cpc_data = Dataset.load_from_disk(cpc_dataset_path)
cpc_data

Dataset({
    features: ['level', 'symbol', 'classification', 'parent', 'children'],
    num_rows: 262609
})

In [4]:
# Map every CPC symbol to its classification text.
# Rows are iterated directly so each one is read from the dataset once, instead
# of being fetched twice per iteration through `cpc_data[i]`.
cpc_dict = {row["symbol"]: row["classification"] for row in tqdm(cpc_data)}

100%|██████████| 262609/262609 [00:12<00:00, 21143.78it/s]


### 0.1 Title-Patent数据集

In [5]:
# Map publication number -> patent title.
# Rows are iterated directly so each one is read from the dataset once, instead
# of being fetched twice per iteration through `patent_data[i]`.
title_patent_data = {row["Publication Number"]: row['Title'] for row in tqdm(patent_data)}

100%|██████████| 55120/55120 [00:09<00:00, 5865.83it/s]


### 0.2 Abstract-Patent数据集

In [6]:
# Map publication number -> patent abstract.
# Rows are iterated directly so each one is read from the dataset once, instead
# of being fetched twice per iteration through `patent_data[i]`.
abstract_patent_data = {row["Publication Number"]: row['Abstract'] for row in tqdm(patent_data)}

100%|██████████| 55120/55120 [00:08<00:00, 6181.47it/s]


### 0.3 CPC-Interpretation数据集

In [7]:
# Gather the CPC entries of every patent into one flat list (duplicates included).
all_cpc = []
for idx in trange(len(patent_data)):
    patent_cpcs = patent_data[idx]['CPC']
    all_cpc += patent_cpcs

100%|██████████| 55120/55120 [00:04<00:00, 12547.03it/s]


In [8]:
# De-duplicate while keeping first-seen order. Going through a `set` would also
# de-duplicate, but the iteration order of a set of strings can differ between
# interpreter runs, which would reorder everything built from this list.
all_cpc = list(dict.fromkeys(all_cpc))

# Keep only the symbols that have an entry in `cpc_dict`. Symbols without one are
# collected and reported instead of being silently dropped behind a bare `except`.
cpc_interpretation = {cpc: cpc_dict[cpc] for cpc in all_cpc if cpc in cpc_dict}
missing_cpc = [cpc for cpc in all_cpc if cpc not in cpc_dict]
if missing_cpc:
    print(f"{len(missing_cpc)} of {len(all_cpc)} CPC symbols have no entry in cpc_dict, e.g. {missing_cpc[:5]}")

## 1. 构建向量存储
### 1.0 准备环境

In [9]:
# Never store a real key in the notebook. A key that is already present in the
# environment is kept as-is; only when it is missing is it asked for
# interactively (getpass does not echo the typed value).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

In [10]:
# Embedding client used by the functions and cells below: the
# 'text-embedding-3-large' model, requested with dimensions=1024.
embedding_model = OpenAIEmbeddings(
    model='text-embedding-3-large',
    dimensions=1024,
)

### 1.1 生成向量存储

In [11]:
def get_text_embedding(data_dict, batch_size):
    """Embed every value of ``data_dict`` with the module-level ``embedding_model``.

    Args:
        data_dict: Mapping of source identifier -> text to embed.
        batch_size: Number of texts passed to ``embed_documents`` per call;
            must be at least 1.

    Returns:
        A ``(pairs, meta_datas)`` tuple. ``pairs`` is a list of
        ``(text, vector)`` tuples and ``meta_datas`` is a list of
        ``{'source': key}`` dicts, both in the iteration order of ``data_dict``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If the number of returned vectors differs from the
            number of texts.
    """
    # A negative step would make the batching loop run zero times and return
    # nothing without any error, so reject invalid sizes up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    sources = list(data_dict.keys())
    texts = list(data_dict.values())

    meta_datas = [{'source': source} for source in sources]
    emb_vectors = []
    for b in trange(0, len(texts), batch_size):
        emb_vectors.extend(embedding_model.embed_documents(texts[b:b + batch_size]))

    # zip() stops at the shorter input, which would silently misalign the pairs
    # with meta_datas if any vectors were missing.
    if len(emb_vectors) != len(texts):
        raise RuntimeError(
            f"expected {len(texts)} embedding vectors, got {len(emb_vectors)}"
        )

    # Return a list rather than a bare zip object, so the pairs can be
    # inspected or iterated more than once by the caller.
    return list(zip(texts, emb_vectors)), meta_datas

In [12]:
def save_vectorstore(data_dict, vectorstore_name, batch_size=32):
    """Embed ``data_dict``, build a FAISS vector store from it and save it locally.

    Args:
        data_dict: Mapping of source identifier -> text. Each key is stored as
            the ``'source'`` metadata of its text.
        vectorstore_name: Folder path handed to ``save_local``.
        batch_size: Number of texts embedded per request (default 32).

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        ValueError: If ``data_dict`` is empty.
    """
    # Fail early with a clear message: no index can be built from zero texts,
    # and nothing has been embedded or written to disk at this point.
    if not data_dict:
        raise ValueError("data_dict is empty: there is nothing to embed or index")

    data, meta_datas = get_text_embedding(data_dict, batch_size)
    vectorstore = FAISS.from_embeddings(data, embedding_model, metadatas=meta_datas)
    vectorstore.save_local(vectorstore_name)
    return vectorstore

In [13]:
# Embed the CPC interpretations and persist them under 'cpc_interpretation'.
vs = save_vectorstore(
    data_dict=cpc_interpretation,
    vectorstore_name='cpc_interpretation',
    batch_size=512,
)

100%|██████████| 54/54 [01:56<00:00,  2.15s/it]


In [14]:
# Embed the patent titles and persist them under 'title_patent'.
vs_title = save_vectorstore(
    data_dict=title_patent_data,
    vectorstore_name='title_patent',
    batch_size=512,
)

100%|██████████| 108/108 [03:12<00:00,  1.78s/it]


In [15]:
# Embed the patent abstracts and persist them under 'abstract_patent'.
vs_abstract = save_vectorstore(
    data_dict=abstract_patent_data,
    vectorstore_name='abstract_patent',
    batch_size=512,
)

100%|██████████| 108/108 [08:58<00:00,  4.98s/it]


In [16]:
# Reload the abstract store from the 'abstract_patent' folder saved earlier in
# this notebook. Note that this rebinds `vs`.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from that folder; keep it only for folders this notebook wrote
# itself, never for files from an untrusted source.
vs = FAISS.load_local(
    'abstract_patent',
    embedding_model,
    allow_dangerous_deserialization=True,
)

In [17]:
# Sanity check: fetch the two stored abstracts closest to a free-text query.
# Each result is a (Document, score) pair.
# NOTE(review): presumably the score is a distance (lower = closer) under the
# default FAISS settings; confirm before thresholding on it.
vs.similarity_search_with_score(
    "Use image recognition to control vehicle travel trajectory",
    k=2,
)

[(Document(metadata={'source': 'US20210300350A1'}, page_content='A vehicle control device includes a recognizer configured to recognize an object near a vehicle, a generator configured to generate one or more target trajectories, along which the vehicle travels, on the basis of the object, and a driving controller configured to automatically control driving of the vehicle on the basis of the target trajectories. The generator calculates a travelable area, which is an area where the vehicle is able to travel, on the basis of a state of the object, and excludes a target trajectory outside the calculated travelable area from the one or more generated target trajectories, and the driving controller automatically controls the driving of the vehicle on the basis of the target trajectory that remains without being excluded by the generator.'),
  0.7066597),
 (Document(metadata={'source': 'US20200117916A1'}, page_content='According to an embodiment, a system receives a captured image perceivin