# DVL - Dragon Version Log V1

这是一个基于全球整车发版日志的搜索处理实验，目标是在少量数据上达到较高的搜索准确率。
v1 使用的数据集是未经摘要的原始发版日志数据集（20240228 版本）。

In [None]:
# Install the Python dependencies listed in requirements.txt into the kernel environment.
!pip install -r requirements.txt

# 加载环境配置

In [None]:
%env ENV_FOR_DYNACONF=local
import os 
os.chdir('/Users/wangjialong/Documents/code/saic_project/global-vehicle-dragon/data-processing')

# 重新加载模块
from importlib import reload
from config.settings import Settings
import db.db_manager
import config.settings
import os
reload(config.settings)
reload(db.db_manager)
print(f"Debug: {bool(Settings.DEBUG)}")

# 查询源数据目录，加载到DB中

In [None]:
# Directory that holds the raw release-log markdown files to ingest.
file_directory = r'/Users/wangjialong/Documents/AI/SAIC_DCSGlobalVehicle/DVL/prd_dvl_md_20240228'
files = os.listdir(file_directory)

# Keep only the entries that are regular files (sub-directories are not counted).
regular_files = [
    entry for entry in files
    if os.path.isfile(os.path.join(file_directory, entry))
]
file_count = len(regular_files)
print(f"目录 '{file_directory}' 下有 {file_count} 个文件。")

In [None]:
from db.model.p_dragon import DVL
import hashlib
from etl.dragon_etl import save_dvl_model
from db.db_manager import DBConn

# Directory entries that are not release logs and must not be ingested.
skipped_names = {'.DS_Store', 'index.md'}

# Read every release-log file, fingerprint its content with SHA-256 and
# persist it as a DVL row. A failure on one file is reported and does not
# stop the remaining files from being processed.
for index, item in enumerate(files, start=1):
    if item in skipped_names:
        continue

    full_path = os.path.join(file_directory, item)
    if not os.path.isfile(full_path):
        # Skip sub-directories before opening a database connection for them.
        continue

    db_conn = DBConn()
    print(f"处理文件 {index} |'{item}'")
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding and matches the bytes hashed below.
        with open(full_path, 'r', encoding='utf-8') as file:
            content = file.read()
        new_model = DVL(
            name=item,
            hash=hashlib.sha256(content.encode('utf-8')).hexdigest(),
            content=content,
        )
        save_dvl_model(conn=db_conn, model=new_model)
    except Exception as e:
        # Deliberate best-effort: report the failing file and carry on.
        print(f"Failed! {index} | {full_path} | {e}")
    finally:
        # Always release the connection, whatever happened above.
        db_conn.close()


In [None]:
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine


from functools import lru_cache


@lru_cache(maxsize=None)
def _load_bert(model_name='bert-base-chinese'):
    """Load the tokenizer and BERT model for *model_name* once and cache them.

    Loading the pretrained weights is expensive, so the pair is kept for the
    lifetime of the kernel instead of being rebuilt for every sentence.

    :return: ``(tokenizer, model)`` tuple
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    return tokenizer, model


def sentence_to_vector(sentence):
    """
    Convert a piece of text into an embedding with ``bert-base-chinese``.

    The text is tokenized and padded to at least 128 tokens, run through the
    model without gradient tracking, and the last hidden state is averaged
    over dimension 1.

    :param sentence: text to embed
    :return: tensor holding the averaged last hidden state; callers read the
        single embedding with ``[0]``
    """
    tokenizer, model = _load_bert()
    # NOTE(review): no truncation is requested, so inputs longer than 128
    # tokens are passed to the model at full length, and the mean below also
    # averages the padded positions. Both are kept as-is so that new vectors
    # stay comparable with vectors that are already stored.
    encoded = tokenizer(sentence, return_tensors='pt', padding='max_length', max_length=128)
    with torch.no_grad():
        output = model(**encoded)
    return output.last_hidden_state.mean(dim=1)

# 对数据库内的数据进行向量化

使用 bert-base-chinese 模型，对数据库中的 content 字段进行分块并生成向量，写入 Milvus 向量数据库。


In [None]:
from pymilvus import (
    MilvusClient,
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)
from sqlalchemy import func, create_engine
from towhee import pipe, ops
from db.milvus_manager import MilvusConn
from langchain.text_splitter import MarkdownTextSplitter
import pandas as pd
from db.db_manager import DBConn
from db.model.p_dragon import DVL

# Parameters
database_name = 'dvl_v1'
# Maximum length (in characters, as measured by len()) of a text that is
# embedded in one piece; longer contents are split into chunks of this size.
max_token = 512
# Number of database rows fetched per page.
partition_num = 100
# Embedding model name. NOTE(review): not referenced by the code in this cell.
# Alternative candidate that was considered: 'shibing624/text2vec-base-chinese'
my_model_name = 'bert-base-chinese'

connections.connect(host=Settings.MILVUS_URL, port=Settings.MILVUS_PORT)
client = MilvusClient(host=Settings.MILVUS_URL, port=Settings.MILVUS_PORT)

# Show the current state of the target collection, if it already exists.
collection_exists = utility.has_collection(collection_name=database_name)
print(collection_exists)
if collection_exists:
    print(f"describe: {client.describe_collection(collection_name = database_name)}")
    print(f"num of entities: {client.num_entities(collection_name = database_name)}")

milvus_conn = MilvusConn(database_name)

# Count the source rows to work out how many pages have to be read.
db_conn = DBConn()
try:
    total_num = db_conn.session.query(func.count(DVL.id)).scalar()
finally:
    db_conn.close()
# Ceiling division: a trailing partial page still needs one iteration.
iteration_times = (total_num + partition_num - 1) // partition_num
print(f"Records Count: {total_num} | iterations time: {iteration_times}")

# Both objects are loop-invariant, so build them once.
markdown_splitter = MarkdownTextSplitter(chunk_size=max_token, chunk_overlap=0)
engine = create_engine(Settings.DB_URL)
try:
    for i in range(iteration_times):
        start_num = i * partition_num
        # ORDER BY makes LIMIT/OFFSET paging deterministic; without it the
        # database may return overlapping or missing rows between pages.
        select_query = (
            "SELECT id, name, content as content FROM p_dragon.dragon_version_log "
            f"ORDER BY id LIMIT {partition_num} OFFSET {start_num}"
        )
        with engine.connect() as connection, connection.begin():
            df = pd.read_sql_query(sql=select_query, con=connection.connection)

        # Rows of the CURRENT page only. Inserting an ever-growing frame
        # would write the earlier pages into Milvus again on every iteration.
        batch_rows = []
        for index, row in enumerate(df.itertuples(index=False), start=1):
            if row.id is None:
                break
            if row.content is None:
                # Nothing to embed for this record.
                continue

            if len(row.content) > max_token:
                print(f"{row.id} 大于最大token长度，切割块")
                docs = markdown_splitter.create_documents([row.content])
                texts = [doc.page_content for doc in docs]
            else:
                print(f"处理数据 分片{i}|{index}")
                texts = [row.content]

            # One output row per chunk, all carrying the source file name.
            for text in texts:
                vector = sentence_to_vector(text)
                batch_rows.append([row.name, vector[0].numpy()])

        result_pd = pd.DataFrame(batch_rows, columns=['name', 'embedding'])
        if not result_pd.empty:
            milvus_conn.insert(result_pd)
finally:
    # Release the connection pool once, after every page has been handled.
    engine.dispose()

milvus_conn.flush()
milvus_conn.stats()
print("all done")

In [14]:
# Smoke test: embed a simple prompt and check whether the expected
# release log is recalled from the vector collection.
prompt_word = 'UK By Rule Allocation Scene的配车报告是哪个版本上线的？'
embedding = sentence_to_vector(prompt_word)

# Parameters
database_name = 'dvl_v1'
milvus_conn = MilvusConn(database_name)

# Search configuration handed to the collection search call.
search_params = dict(
    metric_type="L2",
    offset=0,
    ignore_growing=False,
    params=dict(nprobe=10),
)

# The collection is loaded before it is searched.
target_collection = milvus_conn.collection
target_collection.load()

# Single query vector; ask for the five nearest entries and their names.
query_vectors = [embedding[0].numpy()]
results = target_collection.search(
    data=query_vectors,
    anns_field='embedding',
    param=search_params,
    limit=5,
    expr=None,
    output_fields=['name'],
    consistency_level='Strong',
)

print(f"查询结果：{results}")

2024-02-28 15:17:49,565 - 140704323120000 - milvus_client.py-milvus_client:553 - DEBUG: Created new connection using: 0cffc1f8c5f14ff5880fe05daab79ccc


查询结果：['["id: 448029453523382943, distance: 50.983673095703125, entity: {\'name\': \'1049852.md\'}", "id: 448029453523383167, distance: 71.80381774902344, entity: {\'name\': \'S8.7.20240103.PRD_13371786.md\'}", "id: 448029453523383000, distance: 71.80381774902344, entity: {\'name\': \'S9.3.20240202.PRD_13374147.md\'}", "id: 448029453523383113, distance: 71.80381774902344, entity: {\'name\': \'S7.10.20231110.PRD_9306660.md\'}", "id: 448029453523383158, distance: 75.08514404296875, entity: {\'name\': \'S6.13.20230915.PRD_1051659.md\'}"]']
