In [8]:
# 문장 임베딩 후, milvus vector DB 저장

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from sentence_transformers import SentenceTransformer
import pandas as pd

# Open the "default" connection alias to the Milvus server on localhost.
connections.connect("default", host="localhost", port="19530")

# Sentence-embedding model; `embedder` is reused later to embed search queries.
model_name = "jhgan/ko-sroberta-multitask"
embedder = SentenceTransformer(model_name)

file_path = "path-to-your-csv-file"
df = pd.read_csv(file_path, encoding="cp949")

# Coerce ids to nullable integers and drop rows whose id is not numeric.
# `.copy()` makes the filtered frame independent so the column assignment below
# does not trigger SettingWithCopyWarning.
df["danji_id"] = pd.to_numeric(df["danji_id"], errors="coerce").astype("Int64")
df = df.dropna(subset=["danji_id"]).copy()
# NOTE(review): danji_id is used as the Milvus primary key below, but nothing here
# guarantees it is unique in the CSV -- confirm whether duplicates should be merged
# or dropped before insertion.

# Build one text per row from the five description columns. Missing fields are
# replaced with "" first: with plain `+`, a single NaN field would turn the whole
# concatenated description into NaN, which is not a string the encoder can embed.
desc_columns = ["concateDesc", "trafficDesc", "aroundDesc", "careDesc", "residentDesc"]
desc_parts = df[desc_columns].fillna("").astype(str)
df["desc"] = desc_parts[desc_columns[0]].str.cat(desc_parts[desc_columns[1:]], sep=" ")

corpus = df["desc"].tolist()
danji_ids = df["danji_id"].tolist()

# NOTE(review): the index below uses the inner-product ("IP") metric on these
# embeddings as-is; if cosine similarity is intended, the vectors would need to be
# normalised -- confirm the intended metric.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=False)

fields = [
    FieldSchema(name="danji_id", dtype=DataType.INT64, is_primary=True),
    # Vector dimension is taken from the first computed embedding.
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=len(corpus_embeddings[0]))
]

schema = CollectionSchema(fields, description="Danji Embeddings Collection")

# Recreate the collection from scratch so re-running the cell does not append
# to data left over from a previous run.
collection_name = "danji_embeddings"
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

collection = Collection(collection_name, schema)

# Insert in fixed-size batches; data is column-ordered to match `fields`
# (ids first, then vectors).
batch_size = 1000
for i in range(0, len(corpus_embeddings), batch_size):
    batch_danji_ids = danji_ids[i:i + batch_size]
    # One ndarray.tolist() call converts the whole slice to nested Python lists.
    batch_embeddings = corpus_embeddings[i:i + batch_size].tolist()
    data_to_insert = [batch_danji_ids, batch_embeddings]
    collection.insert(data_to_insert)
    print(f"Inserted batch {i // batch_size + 1}")

# Flush so all inserted rows are sealed before the index is built.
collection.flush()

index_params = {
    "metric_type": "IP",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}
collection.create_index(field_name="embedding", index_params=index_params)

# Load the collection so it can be searched.
collection.load()
print("Embeddings have been inserted into Milvus and indexed.")


Inserted batch 1
Inserted batch 2
Inserted batch 3
Inserted batch 4
Inserted batch 5
Inserted batch 6
Inserted batch 7
Inserted batch 8
Inserted batch 9
Inserted batch 10
Inserted batch 11
Inserted batch 12
Inserted batch 13
Inserted batch 14
Inserted batch 15
Inserted batch 16
Inserted batch 17
Inserted batch 18
Inserted batch 19
Inserted batch 20
Inserted batch 21
Inserted batch 22
Inserted batch 23
Inserted batch 24
Inserted batch 25
Inserted batch 26
Inserted batch 27
Inserted batch 28
Inserted batch 29
Inserted batch 30
Inserted batch 31
Inserted batch 32
Inserted batch 33
Inserted batch 34
Inserted batch 35
Embeddings have been inserted into Milvus and indexed.


In [22]:
# 검색 예제

def find_similar_danjis(query, top_k=5):
    """Search the Milvus collection for the danji_ids most similar to a text query.

    Uses the notebook-level `embedder` (to embed the query) and `collection`
    (to run the vector search); both must already exist when this is called.
    Prints the query, a header line and the resulting id list.

    Args:
        query: Text to embed and search with.
        top_k: Maximum number of hits requested from the search.

    Returns:
        list: danji_ids in the order the search returned them, with repeated
        ids removed (so the list may be shorter than ``top_k``).
    """
    query_embedding = embedder.encode(query, convert_to_tensor=False).tolist()

    search_params = {"metric_type": "IP"}
    results = collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        output_fields=["danji_id"]
    )

    print(f"\nQuery: {query}")
    print("\nTop {} most similar danji_ids:".format(top_k))

    # Only one query vector was sent, so results[0] holds all of its hits.
    # dict.fromkeys de-duplicates while keeping first-seen order in O(n); the
    # previous `not in list` check was quadratic, which matters for large top_k.
    danji_id_list = list(
        dict.fromkeys(hit.entity.get("danji_id") for hit in results[0])
    )

    print(danji_id_list)
    return danji_id_list

# Example search; the Korean query means "an apartment good for walking a dog".
query = "강아지 산책시키기 좋은 아파트"
# Upper bound on the number of hits requested from the search.
max_hits = 16384
result = find_similar_danjis(query, top_k=max_hits)



Query: 강아지 산책시키기 좋은 아파트

Top 16384 most similar danji_ids:
[663, 23162, 38011, 1698, 12411, 5677, 21707, 1938, 22138, 11271, 22843, 22169, 4768, 50212, 14532, 11228, 25242, 19874, 11561, 7565, 37272, 22005, 11046, 8720, 33247, 1713, 33304, 12018, 18025, 1505, 19019, 33279, 12065, 19782, 827, 10143, 7006, 6335, 795, 4386, 19271, 36264, 698, 5952, 10134, 17840, 19867, 1163, 21939, 37617, 19673, 702, 14538, 11922, 6314, 22994, 22151, 22723, 16985, 19027, 3092, 16245, 33499, 20786, 5969, 15586, 18970, 23690, 16449, 9107, 20292, 6750, 21931, 18250, 2778, 17000, 22663, 3983, 15985, 1131, 17488, 3240, 697, 19967, 37357, 20263, 27617, 19542, 8313, 9995, 19411, 2777, 694, 1531, 33347, 6069, 1112, 17295, 2566, 7740, 1899, 5998, 5968, 33708, 19829, 6218, 1371, 21840, 18077, 3520, 9840, 15498, 1018, 2863, 14529, 34545, 594, 25257, 21709, 34131, 90, 12200, 1350, 15981, 6712, 9037, 18540, 1633, 140, 2958, 36793, 23427, 8566, 6006, 437, 18971, 7781, 24032, 2845, 37363, 24402, 7138, 1391, 18080, 6543