In [1]:
from pymilvus import MilvusClient, DataType

# 1. Set up a Milvus client
client = MilvusClient(
    uri="http://localhost:19530"
)
collection_name = "property_listing"

In [2]:
# define schema
schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=True,
)
schema.add_field(field_name="id", datatype=DataType.VARCHAR,
                is_primary=True, auto_id=True, max_length=100)
schema.add_field(field_name="doc_id", datatype=DataType.VARCHAR,max_length=250)
schema.add_field(field_name="document", datatype=DataType.VARCHAR, max_length=200)
schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=768)
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="embedding",
    index_type="FLAT",
    metric_type="IP",

)

In [3]:
client.drop_collection(
    collection_name=collection_name
)

In [4]:
# 2. Create a collection in quick setup mode
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
    dynamic_dim=True
)

In [5]:
import pandas as pd
df = pd.read_csv('../5.finetuning_SBERT/dataset_D_Q.csv', dtype=object)
# save the document col to a list
descriptions = df['document'].tolist()
ids = df['id'].tolist()
# ids = [str(n) for n in range(len(descriptions))]
len(descriptions)

21019

In [6]:
# # Loading Embedding model
from pymilvus import model
sentence_transformer_ef = model.dense.SentenceTransformerEmbeddingFunction(
    model_name='../5.finetuning_SBERT/sbert_test_mnr2', # Specify the model name
    device='cuda:0' # Specify the device to use, e.g., 'cpu' or 'cuda:0'
)
docs_embeddings = sentence_transformer_ef.encode_documents(descriptions)

print("Embeddings generated successfully")
print("Dim:", sentence_transformer_ef.dim, docs_embeddings[0].shape)

  from .autonotebook import tqdm as notebook_tqdm


Embeddings generated successfully
Dim: 768 (768,)


In [7]:
data = []
for i in range(len(descriptions)):
    row = {}
    row['doc_id'] = ids[i]
    row['document'] = descriptions[i]
    row['embedding'] = docs_embeddings[i].tolist()
    data.append(row)
len(data)

21019

In [8]:
client.insert(collection_name=collection_name, data=data)

{'insert_count': 21019,
 'ids': ['451022289330897624', '451022289330897625', '451022289330897626', '451022289330897627', '451022289330897628', '451022289330897629', '451022289330897630', '451022289330897631', '451022289330897632', '451022289330897633', '451022289330897634', '451022289330897635', '451022289330897636', '451022289330897637', '451022289330897638', '451022289330897639', '451022289330897640', '451022289330897641', '451022289330897642', '451022289330897643', '451022289330897644', '451022289330897645', '451022289330897646', '451022289330897647', '451022289330897648', '451022289330897649', '451022289330897650', '451022289330897651', '451022289330897652', '451022289330897653', '451022289330897654', '451022289330897655', '451022289330897656', '451022289330897657', '451022289330897658', '451022289330897659', '451022289330897660', '451022289330897661', '451022289330897662', '451022289330897663', '451022289330897664', '451022289330897665', '451022289330897666', '451022289330897667',