# Store data to Milvus in watsonx.data

To run a similarity search on text, the data stored in Presto has to be converted to vectors and stored in Milvus.

### Initialize configuration

In [None]:
import sys
# Make the shared helper module in ../../utils importable from this notebook.
sys.path.append("../../utils")
import wxd_utils

# Load the settings (host, ports, credentials, certificate path) that the rest
# of the notebook reads from `conf`.
conf = wxd_utils.load_conf()

# Show the loaded settings for verification, but mask credentials: `conf`
# carries the "password" used for the Milvus connection, and cell output is
# saved together with the notebook.
# NOTE(review): assumes `conf` is a dict-like mapping with string keys - confirm
# against wxd_utils.load_conf().
_SENSITIVE_MARKERS = ("password", "secret", "token", "apikey", "api_key")
print({
    key: "********" if any(marker in str(key).lower() for marker in _SENSITIVE_MARKERS) else value
    for key, value in conf.items()
})

### Connect watsonx.data

In [None]:
# Open the watsonx.data connection described by `conf`; the resulting engine is
# passed to pandas later in this notebook to query the Presto table.
wxd_engine = wxd_utils.connect_wxd(conf)

#### Create Milvus Collection & Index

Creating a Milvus collection involves first connecting to the Milvus server, then creating a collection with a defined schema and index. 

In [None]:
from pymilvus import(
    Milvus,
    IndexType,
    Status,
    connections,
    FieldSchema,
    DataType,
    Collection,
    CollectionSchema,
)

# Register the "default" pymilvus connection, using the endpoint, credentials
# and certificate taken from the loaded configuration.
connections.connect(
    alias="default",
    # Endpoint
    host=conf["host"],
    port=conf["milvus_port"],
    # Credentials
    user=conf["user"],
    password=conf["password"],
    # TLS: secure channel using the certificate file referenced by conf["lh_cert"];
    # the server name is set to the same host we connect to.
    secure=True,
    server_pem_path=conf["lh_cert"],
    server_name=conf["host"],
)

In [None]:
# Collection schema: an auto-generated primary key, the passage text, the
# article title and the embedding vector.
fields = [
    FieldSchema(
        name="id",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=True,  # primary key values are generated, not supplied on insert
    ),
    FieldSchema(name="article_text", dtype=DataType.VARCHAR, max_length=2500),
    FieldSchema(name="article_title", dtype=DataType.VARCHAR, max_length=200),
    # 384 matches the embedding size of the sentence-transformer model used
    # later in this notebook.
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384),
]
schema = CollectionSchema(
    fields=fields,
    description="wikipedia article collection schema",
)

# Bind the schema to the "wiki_articles" collection.
wiki_collection = Collection(name="wiki_articles", schema=schema)

# Index the embedding field: IVF_FLAT with 2048 clusters, L2 distance.
index_params = dict(
    metric_type="L2",
    index_type="IVF_FLAT",
    params=dict(nlist=2048),
)

# Kept as the last expression so the returned status is shown as cell output.
wiki_collection.create_index(field_name="vector", index_params=index_params)

An output of `Status(code=0, message=)` means the index was created successfully.

### List the collections in our Milvus instance to confirm that 'wiki_articles' has been created


In [None]:
from pymilvus import utility
# Left as the final expression so the notebook displays the returned list of
# collection names; 'wiki_articles' should appear in it.
utility.list_collections()

### Generate Vectors and insert them into Milvus

Here we read data from the Presto table using the connection we created earlier. We pull text chunks and titles from the database. We then vectorize using the `sentence-transformers/all-MiniLM-L6-v2` sentence transformer model. Learn more about Hugging Face sentence transformers here: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

It is important that we assemble the article text, article titles and vector embeddings into a `data` object. This object will be used to load the data into Milvus.

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from pymilvus import Collection, connections
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Read the Wikipedia table from watsonx.data through the engine created earlier.
articles_df = pd.read_sql_query(
    sql="select * from iceberg_data.simple_rag.wikipedia",
    con=wxd_engine,
)

# Pull the two text columns out as plain Python lists.
text_and_title = articles_df[['text', 'title']]
passages = text_and_title['text'].tolist()
passage_titles = text_and_title['title'].tolist()

# Embed every passage with the MiniLM sentence-transformer (384-dimensional vectors).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
passage_embeddings = model.encode(passages)

# Column-wise payload in schema field order: article_text, article_title, vector.
# The auto-generated "id" primary key is not supplied.
data = [passages, passage_titles, passage_embeddings]

basic_collection = Collection("wiki_articles")
out = basic_collection.insert(data)
# Flush so the inserted entities are persisted.
basic_collection.flush()

### Check to ensure entities have been loaded into 'wiki_articles' collection

In [None]:
# Re-open a handle to the collection by name so this cell can be run on its own
# once the Milvus connection exists.
basic_collection = Collection("wiki_articles")
# Final expression of the cell: the notebook displays the entity count.
basic_collection.num_entities

### If needed, drop the entire 'wiki_articles' collection (change the condition to `True` to do so)

In [None]:
# Safety switch: the body never runs while the condition is the literal False.
# Change it to True and run the cell to drop the collection.
if False:
    basic_collection = Collection("wiki_articles")
    # drop() removes the whole collection, not only the entities stored in it.
    basic_collection.drop()