In [81]:
import os 
os.listdir("../../myfile/")

['volumes',
 '.ipynb_checkpoints',
 'README.md',
 'FAQ_embeddings_20230906.npy',
 'FAQ_embeddings.npy',
 '.git',
 'docker-compose.yml']

E0908 01:52:09.078355758    2699 ssl_transport_security.cc:1420]       Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.


In [13]:
import hashlib
import math
from pathlib import Path

import numpy as np
from dotenv import load_dotenv
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
    utility,
)

# from config import CONFIG

# from data_file_path import VECTOR_DATA, 
#FAQ_ALL_DATA = "../../myfile/FAQ_embeddings.npy"
FAQ_ALL_DATA = "../../myfile/FAQ_embeddings_20230906.npy"

In [14]:
load_dotenv()

# MILVUS_CONFIG = CONFIG["Milvus"]
MILVUS_HOST = "3.68.74.215"
MILVUS_PORT = "19530"
COLLECTION_NAME = "MA_FAQ_CACHE"
# COLLECTION_NAME = "FAQ_feat"

In [15]:
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

In [29]:
def generate_unique_id(string: str) -> int:
    hashed_string = hashlib.sha256(string.encode()).hexdigest()
    int_id = int(hashed_string, 16) % (2**63 - 1)
    return int_id

In [30]:
def create_schema():
    """TODO: create schema for milvus, the filed is defined
    Returns: TODO

    """

    # PK
    ids = FieldSchema(
        name="id",
        dtype=DataType.INT64,
        is_primary=True,
        # auto_id = True
    )
    # question
    question = FieldSchema(name="question", dtype=DataType.VARCHAR, max_length=200)
    # vector
    features = FieldSchema(name="features", dtype=DataType.FLOAT_VECTOR, dim=1536)
    meta = FieldSchema(name="meta", dtype=DataType.VARCHAR, max_length=2000)
    schema = CollectionSchema(
        fields=[
            ids,
            question,
            meta,
            features,
        ],
        description="FAQ for MA",
    )
    return schema

In [31]:
def load_data_from_npy(npy_file, batch_size=5000):
    """load the data from numpy
    id: 1st column
    meta: 2nd column
    vector: 3rd to end columns (1536)
    Returns: TODO

    """
    data = np.load(npy_file, allow_pickle=True)
    
    data_size = data.shape[1]
    split_num = int(data_size/batch_size) + 1
    
    res = []
    
    for data_batch in np.array_split(data, split_num, axis=1): 
        # data = data[:, :10500]
        print(data_batch.shape)

        # Get the 1st column values into a variable
        question = data_batch[0]

        ids = [generate_unique_id(q) for q in question]

        # Get the 2nd column values into another variable
        meta = data_batch[1]
        # __import__("ipdb").set_trace()

        # Get all other columns (from the 3rd one onwards) into another variable
        vec = data_batch[2]
        
        # res.append([ids, question, meta, vec])
        res.append([ids,question, meta, vec])
    return res

In [32]:
def load_data_from_npy_to_db(npy_file: Path, is_drop_collection=True):
    # create collection if not exist

    if is_drop_collection:
        utility.drop_collection(COLLECTION_NAME)
    if not utility.has_collection(COLLECTION_NAME):
        schema = create_schema()
        collection = Collection(
            name=COLLECTION_NAME,
            schema=schema,
            using="default",
            shards_num=4,
            consistency_level="Strong",
        )
    # check collection number
    collection = Collection(COLLECTION_NAME)
    print(
        "Before Insert data, there is {0} records in collection {1}".format(
            collection.num_entities, COLLECTION_NAME
        )
    )
    
    # load data
    data = load_data_from_npy(npy_file)
    for i, data_batch in enumerate(data):
        
        mr = collection.insert(data_batch)
        print(mr)
        print(
            "After Insert data batch {0} , there is {1} records in collection {2}".format(
                i, collection.num_entities, COLLECTION_NAME
            )
        )
        
        # create index
        index_params = {
            "metric_type": "L2",
            # "metric_type": "IP",
            # "index_type": "IVF_FLAT",
            "index_type": "FLAT",
            "params": {"nlist": 1024},
        }
        collection.create_index(field_name="features", index_params=index_params)

        print("Index Created")
        collection.flush()
        print("collection flushed")
        
    print(" Total: there is {0} records in collection {1}".format(collection.num_entities, COLLECTION_NAME))

In [36]:
# if __name__ == "__main__":
load_data_from_npy_to_db(FAQ_ALL_DATA, True)

Before Insert data, there is 0 records in collection MA_FAQ_CACHE
(3, 4497)
(3, 4496)
(3, 4496)
(3, 4496)
(insert count: 4497, delete count: 0, upsert count: 0, timestamp: 444114540015648771, success count: 4497, err count: 0)
After Insert data batch 0 , there is 0 records in collection MA_FAQ_CACHE
Index Created
collection flushed
(insert count: 4496, delete count: 0, upsert count: 0, timestamp: 444114541392166913, success count: 4496, err count: 0)
After Insert data batch 1 , there is 4497 records in collection MA_FAQ_CACHE
Index Created
collection flushed
(insert count: 4496, delete count: 0, upsert count: 0, timestamp: 444114542912339969, success count: 4496, err count: 0)
After Insert data batch 2 , there is 8993 records in collection MA_FAQ_CACHE
Index Created
collection flushed
(insert count: 4496, delete count: 0, upsert count: 0, timestamp: 444114544301703169, success count: 4496, err count: 0)
After Insert data batch 3 , there is 13489 records in collection MA_FAQ_CACHE
Index

In [16]:
collection = Collection(COLLECTION_NAME)
collection.load()
print(
    "There is {0} records in collection {1}".format(
        collection.num_entities, COLLECTION_NAME
    )
)
print(collection.indexes)

There is 17985 records in collection MA_FAQ_CACHE
[<pymilvus.orm.index.Index object at 0x7fe748207fd0>]


In [38]:
print(collection.schema)                # Return the schema.CollectionSchema of the collection.
print(collection.description)           # Return the description of the collection.

{
  auto_id: False
  description: FAQ for MA
  fields: [{
    name: id
    description: 
    type: 5
    is_primary: True
    auto_id: False
  }, {
    name: question
    description: 
    type: 21
    params: {'max_length': 200}
  }, {
    name: meta
    description: 
    type: 21
    params: {'max_length': 2000}
  }, {
    name: features
    description: 
    type: 101
    params: {'dim': 1536}
  }]
}
FAQ for MA


In [39]:
print(collection.name     )             # Return the name of the collection.
print(collection.is_empty )             # Return the boolean value that indicates if the collection is empty.

MA_FAQ_CACHE
False


In [40]:
print(collection.num_entities )         # Return the number of entities in the collection.
print(collection.primary_field )        # Return the schema.FieldSchema of the primary key field.

17985
{
    name: id
    description: 
    type: 5
    is_primary: True
    auto_id: False
  }


In [41]:
print(collection.partitions)            # Return the list[Partition] object.
print(collection.indexes )              # Return the list[Index] object.

[{"name": "_default", "collection_name": "MA_FAQ_CACHE", "description": ""}]
[<pymilvus.orm.index.Index object at 0x7fc397506c50>]


In [42]:
collection.release()

## 测试

In [1]:
import fileinput
import os
import openai
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Milvus

In [2]:
COLLECTION_NAME = "MA_FAQ_CACHE"
MILVUS_HOST = "3.68.74.215"
MILVUS_PORT = "19530"

In [45]:
load_dotenv()

False

In [3]:
from langchain.embeddings.openai import OpenAIEmbeddings

endpoint_url = 'https://sandbox-openai-east-us.openai.azure.com/'
openai.api_type = "azure"
openai.api_base = endpoint_url
openai.api_version = "2023-03-15-preview"
openai.api_key = "911fe1b382ae49c08e99b98428ab8cee"
# The deployment name you chose when you deployed the ChatGPT or GPT-4 model.
ENGINE_NAME_GPT_35 = "gpt-35-turbo-version-0301"
ENGINE_NAME_GPT_4 = "gpt-4"
ENGINE_NAME_GPT_4_32k = "gpt-4-32k"
os.environ['OPENAI_API_KEY'] = "911fe1b382ae49c08e99b98428ab8cee"

EMBEDDINGS = OpenAIEmbeddings(
                    openai_api_base= endpoint_url,
                    openai_api_type='azure',
                    deployment='text-embedding-ada-002',
                    openai_api_key=openai.api_key,
                )

In [26]:
vector_store = Milvus(
    embedding_function=EMBEDDINGS,
    collection_name="FAQ_feat",
    # collection_name = "FAQ_feat",
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
    # primary_field = 'id',
)

ValueError: list.remove(x): x not in list

In [None]:
def _response_format(resp):
    """
    resp is the single response (doc,similarity)
    """
    F_OUTPUT_FORMAT = """
        问题:{q}
        答案:{a}
        相似度:{s:.2f}
    """.format
    return F_OUTPUT_FORMAT(
        q=resp[0].page_content.strip(),
        a=resp[0].metadata["meta"].strip(),
        s=resp[1],
    )

E0908 07:32:55.536906551    2699 ssl_transport_security.cc:1420]       Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.
E0908 07:35:00.353561355    2699 ssl_transport_security.cc:1420]       Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.
E0908 07:36:47.643699389    2699 ssl_transport_security.cc:1420]       Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.
E0908 07:38:51.531033634    2699 ssl_transport_security.cc:1420]       Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.
E0908 07:40:52.236142500    2699 ssl_transport_security.cc:1420]       Handshake failed with fatal error SSL_ERROR_SSL: error:100000f7:SSL routines:OPENSSL_internal:WRONG_VERSION_NUMBER.


In [None]:
def responses_format(resp):
    return "".join([_response_format(r) for r in resp])

In [None]:
def filter_response_with_threshold(
    responses, threshold: float = V_SIMILARITY_THRESHOLD
):
    """based on the threhold filter out the unsimilary document

    Args:
            responses (TODO): [(document,similary),(document,similary)]
            threshold (float): similarity_threshold

    Returns: TODO

    """
    similarity_idx = 1
    return [r for r in responses if r[similarity_idx] <= threshold]

In [None]:
def demo_with_input(n=1):
    """for demo purpose"""
    for line in fileinput.input():
        res = ""
        docs = vector_store.similarity_search_with_score(line.rstrip(), k=n)
        docs = filter_response_with_threshold(docs)
        if len(docs) == 0:
            print(NOT_FOUND_MESSAGE)
        print(responses_format(docs))


def query(query_text: str, n=3) -> str:
    """find the most similar response in db, based on query_text"""
    docs = vector_store.similarity_search_with_score(query_text.rstrip(), k=n) #tqy: Python rstrip() 删除 string 字符串末尾的指定字符，默认为空白符，包括空格、换行符、回车符、制表符
    docs = filter_response_with_threshold(docs)
    if len(docs) == 0:
        return NOT_FOUND_MESSAGE
    return responses_format(docs)

In [None]:
def test_func():
    query_text = "采集计划怎么做?"
    query_text2 = "完成DDI商业对接后，默认会提取多少个月的数据？ "
    print(query(query_text2))

In [None]:
if __name__ == "__main__":
    # demo_with_input()
    test_func()

## Milvus search

In [82]:
from langchain.embeddings.openai import OpenAIEmbeddings

endpoint_url = 'https://sandbox-openai-east-us.openai.azure.com/'
openai.api_type = "azure"
openai.api_base = endpoint_url
openai.api_version = "2023-03-15-preview"
openai.api_key = "911fe1b382ae49c08e99b98428ab8cee"
# The deployment name you chose when you deployed the ChatGPT or GPT-4 model.
ENGINE_NAME_GPT_35 = "gpt-35-turbo-version-0301"
ENGINE_NAME_GPT_4 = "gpt-4"
ENGINE_NAME_GPT_4_32k = "gpt-4-32k"
os.environ['OPENAI_API_KEY'] = "911fe1b382ae49c08e99b98428ab8cee"

embeddings = OpenAIEmbeddings(
                    openai_api_base= endpoint_url,
                    openai_api_type='azure',
                    deployment='text-embedding-ada-002',
                    openai_api_key=openai.api_key,
                )

In [83]:
collection = Collection("MA_FAQ_CACHE")
collection.load()

In [84]:
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}

In [117]:
text = ["退货处理原则"]
search = embeddings.embed_documents(text)

In [114]:
len(search)

1

In [118]:
results = collection.search(
    data=search , 
    anns_field="features", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=3,
    # expr="random > -12",
    output_fields=["meta"]
    # expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    # output_fields=['title'],
    # consistency_level="Strong"
)

In [79]:
results

<pymilvus.orm.search.SearchResult at 0x7fe723b67510>

In [93]:
print(len(results[0]))

3


In [96]:
results[0].distances

[0.5123574733734131, 0.5171717405319214, 0.5207128524780273]

In [119]:
hit = results[0][0]
hit

id: 3488033127907708768, distance: 0.0, entity: {'meta': '{"output_category": "FAQ", "text_response": {"standard_question": "退货处理原则", "response": "（1）不接受无正当理由或责任不应由拜耳承担的退换货要求\\n（2）只接受由一级经销商退回并且能够在SAP系统中查询到对应销售记录批次的产品。退回的药品必须与销售记录内容相符，批号一致，数量小于等于该批号销售数量"}}'}

In [None]:
for result in res[0]:
    meta = {x: result.entity.get(x) for x in output_fields}
    doc = Document(page_content=meta.pop(self._text_field), metadata=meta)
    pair = (doc, result.score)
    ret.append(pair)

In [58]:
collection.release()

In [None]:
dee

In [1]:
import os 
os.listdir("../../myfile/")

['volumes',
 '.ipynb_checkpoints',
 'question_embeddings_all.npy',
 'README.md',
 'FAQ_embeddings_20230906.npy',
 'FAQ_embeddings.npy',
 '.git',
 'docker-compose.yml']

In [4]:
all_question_embedding = "../../myfile/question_embeddings_all.npy"

In [None]:
data = np.load(all_question_embedding, allow_pickle=True)

In [None]:
import numpy as np

# 定义块大小
block_size = 1000

In [5]:
# 获取数组形状
shape = np.load(all_question_embedding, allow_pickle=True, mmap_mode='r' )

ValueError: Array can't be memory-mapped: Python objects in dtype.

In [6]:
import numpy as np

# 定义读取的行数和起始位置
rows = 10
start = 30

In [None]:
# 逐行读取
with open(all_question_embedding, 'rb') as f:
    # 跳过头部
    f.seek(128 + start * np.dtype('float32').itemsize * np.load(all_question_embedding, allow_pickle=True).shape[1])
    # 逐行读取
    for i in range(rows):
        data = np.fromfile(f, dtype=np.float32, count=np.load(all_question_embedding, allow_pickle=True).shape[1])

In [None]:
# 分割数组
slices = [slice(i, main(i + block_size, dim)) for i, dim in zip(range(0, shape[0], block_size), shape)]

# 逐个块读取
for sl in slices:
    data = np.load('large_array.npy', mmap_mode='r', allow_pickle=True, encoding=None, order='K', shape=tuple([sl]) + shape[1:])