# Embed Text Data & Load Vectors into Milvus

First, import some common libraries and define the data reading functions.

In [3]:
# Import common libraries.
import time
import pandas as pd
import numpy as np

# Output words instead of scores.
def sentiment_score_to_name(score: float):
    """Translate a numeric sentiment score into a human-readable label.

    Args:
        score (float): sentiment score; anything above zero counts as positive.

    Returns:
        str: "Positive" when score > 0, "Negative" when score <= 0.
        A score that satisfies neither comparison (e.g. NaN) falls through
        both checks and the function returns None.
    """
    if score <= 0:
        return "Negative"
    if score > 0:
        return "Positive"

# Split data into train, valid, test. 
def partition_dataset(df_input, smoke_test=False):
    """Shuffle a dataframe and cut it into train / validation / test splits.

    The cut points are fixed row positions (35,000 and 40,000), chosen for an
    input of about 50K rows. Each split is also written to a CSV file in the
    current working directory: "train.csv", "val.csv" and "test.csv".

    Args:
        df_input (pandas.DataFrame): input data frame. After its index is
            moved into a column it must have exactly four columns, because
            they are renamed to movie_index, text, label_int, label.
        smoke_test (boolean): accepted for interface compatibility.
            NOTE(review): this flag is never read, so it has no effect.

    Returns:
        df_shuffled, df_train, df_val, df_test (pandas.DataFrame): the full
        shuffled frame followed by the train, valid, test splits.
    """
    # Shuffle deterministically; the old index becomes the first column and
    # serves as the corpus index.
    df_shuffled = df_input.sample(frac=1, random_state=1).reset_index()
    df_shuffled.columns = ['movie_index', 'text', 'label_int', 'label']

    # Row positions where train ends and validation ends.
    train_end, val_end = 35_000, 40_000
    splits = {
        "train.csv": df_shuffled.iloc[:train_end],
        "val.csv": df_shuffled.iloc[train_end:val_end],
        "test.csv": df_shuffled.iloc[val_end:],
    }

    # Persist every split locally, one file per split.
    for filename, frame in splits.items():
        frame.to_csv(filename, index=False, encoding="utf-8")

    df_train, df_val, df_test = splits.values()
    return df_shuffled, df_train, df_val, df_test

## Start up a local Milvus server.

Code in this notebook uses [Milvus lite](https://milvus.io/docs/milvus_lite.md), which runs a local server.  ⛔️ Milvus lite is only meant for demos and local testing.
- pip install milvus pymilvus

💡 **For production purposes**, use a local Milvus docker, Milvus clusters, or fully-managed Milvus on Zilliz Cloud.
- [Local Milvus docker](https://milvus.io/docs/install_standalone-docker.md) requires local docker installed and running.
- [Milvus clusters](https://milvus.io/docs/install_cluster-milvusoperator.md) requires a K8s cluster up and running.
- [Zilliz Cloud free trial](https://cloud.zilliz.com/login) choose a "free" option when you provision.


In [4]:
from milvus import default_server, debug_server
from pymilvus import (
    connections, utility, FieldSchema, 
    DataType, CollectionSchema, Collection)

# Cleanup previous data and stop server in case it is still running.
# Stop is called before cleanup, and both run before start(), so each run of
# this cell begins from a fresh server.
default_server.stop()
default_server.cleanup()

# Start a new milvus-lite local server.
# start_time / end_time bracket only the start() call, so the printed value
# is the wall-clock startup duration in seconds.
start_time = time.time()
default_server.start()

end_time = time.time()
print(f"startup time: {end_time - start_time}")
# startup time: 5.6739208698272705

# Add wait to avoid error message from trying to connect.
# NOTE(review): this is a fixed 15-second pause, not a readiness check; the
# output recorded with this cell shows a startup TimeoutError.
time.sleep(15)

# Now you could connect with localhost and the given port.
# Port is defined by default_server.listen_port.
# NOTE(review): show_startup_banner is passed to connect() here — confirm it
# is an option of the connection rather than of the server object.
connections.connect(host='127.0.0.1', 
                  port=default_server.listen_port,
                  show_startup_banner=True)

# Check if the server is ready.
# Printing the server version doubles as a connectivity check.
print(utility.get_server_version())


TimeoutError: Milvus not startd in 180.0 seconds

**Create a Milvus collection**

You can think of a collection in Milvus like a "table" in SQL databases.  The **collection** will contain the following:
- Schema
- Index to raw data segments
- Search index for efficient vector search

Some supported [data types](https://milvus.io/docs/schema.md) for Milvus schemas are:
- INT64 - primary key
- VARCHAR - raw texts
- FLOAT_VECTOR - embeddings = list of `numpy.ndarray` of `numpy.float32` numbers

To finish specifying the schema, you'll need to get the vector `EMBEDDING_LENGTH` and `MAX_SEQ_LENGTH` parameters from your embedding model.

Transactional consistency is possible; however, according to the [CAP theorem](https://en.wikipedia.org/wiki/CAP_theorem), some latency must be sacrificed.  
- Searching movie reviews is not mission-critical, so [`eventually`](https://milvus.io/docs/consistency.md) consistent is fine here.

In [None]:
# Dimensionality of the vectors stored in the "embeddings" field.
EMBEDDING_LENGTH = 768

# Upper bound used for the VARCHAR field that holds each text chunk.
# 65535 is the milvus internal limit; the embedding model itself has a 512 limit.
MAX_SEQ_LENGTH = 65535

# Name under which the collection is registered in Milvus.
COLLECTION_NAME = "movies"

# Six fields: an auto-generated primary key, the source row index (stored as
# text), the chunk text, its embedding vector, and the sentiment label in both
# numeric and text form.
fields = [
    FieldSchema(
        name="pk",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=True,
    ),
    FieldSchema(
        name="movie_index",
        dtype=DataType.VARCHAR,
        max_length=8,
    ),
    FieldSchema(
        name="chunk",
        dtype=DataType.VARCHAR,
        max_length=MAX_SEQ_LENGTH,
    ),
    FieldSchema(
        name="embeddings",
        dtype=DataType.FLOAT_VECTOR,
        dim=EMBEDDING_LENGTH,
    ),
    FieldSchema(
        name="label_int",
        dtype=DataType.INT64,
    ),
    FieldSchema(
        name="label",
        dtype=DataType.VARCHAR,
        max_length=8,
    ),
]

# Build the schema, then create the collection handle with eventual consistency.
collection_description = "Search imdb movie reviews"
schema = CollectionSchema(fields, collection_description)
mc = Collection(COLLECTION_NAME, schema, consistency_level="Eventually")
print(f"Created collection: {COLLECTION_NAME}")

**Add a Search Index**

The search index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits.  Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different search indexes available on Milvus.  For example:
- FLAT - No index structure (deterministic exhaustive search)
- IVF_FLAT - Inverted-file (cluster-based) index (approximate search)
- HNSW - Graph index (stochastic approximate search)
- ANNOY - Tree index (stochastic approximate search)

Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered "close" in vector space.  In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen.  Its possible distance metrics are one of:
- L2 - L2-norm
- IP - Dot-product
- COSINE - Angular distance

In [None]:
# Define the type of search index to add to the collection.

# Alternative kept for reference only (commented out, never executed):
# # Note: on Zilliz Cloud - Proprietary index is fastest!
# index_params = {
#     # Always set this to AUTOINDEX or just omit it.
#     "index_type": "AUTOINDEX", 
#     # Default to IP. This is the only parameter you need to think about.
#     "metric_type": "COSINE",
# }

# Drop the index, in case it already exists.
# This keeps the cell re-runnable before create_index is called again below.
mc.drop_index()

# Create the search index for local Milvus server.
# Showing how to change the index parameters, instead of using defaults.
# The index is built on the "embeddings" vector field with the COSINE metric.
index_params = {
    "index_type": "HNSW", 
    "metric_type": "COSINE", 
    "params": {'M': 16,               # int. 4~64, max links (neighbors) kept per graph node — see the hnswlib ALGO_PARAMS notes
               "efConstruction": 32}  # int. 8~512, size of the candidate list explored while building the graph
               }
# The trailing semicolon suppresses the notebook's display of the return value.
mc.create_index("embeddings", index_params);

## Read CSV data into a pandas dataframe

In [None]:
# Location of the raw movie-review CSV on local disk.
filepath = "data/movie_data.csv"

# Load the reviews and keep only the first occurrence of any duplicated row.
df = pd.read_csv(filepath).drop_duplicates(keep='first')

# Give the two columns standard names.
df.columns = ['text', 'label_int']

# Derive the text label 'Positive' / 'Negative' from the numeric sentiment label.
df["label"] = df["label_int"].map(sentiment_score_to_name)

# Shuffle and split into train/valid/test; `df` becomes the full shuffled frame.
df, df_train, df_val, df_test = partition_dataset(df, smoke_test=False)
print(f"original df shape: {df.shape}")
print(f"df_train shape: {df_train.shape}, df_val shape: {df_val.shape}, df_test shape: {df_test.shape}")

# Sanity check: the three splits together account for every row.
n_split_rows = sum(part.shape[0] for part in (df_train, df_val, df_test))
assert n_split_rows == df.shape[0]

# Peek at the first review and at the head of the frame.
first_review = df.text[0]
print(f"Example text length: {len(first_review)}")
print(f"Example text: {first_review}")
display(df.head(2))


In [None]:
# Compare how many training rows carry each sentiment label; roughly equal
# counts mean the two classes are balanced.
is_positive = df_train["label"] == "Positive"
is_negative = df_train["label"] == "Negative"
class1 = df_train[is_positive].copy()
class2 = df_train[is_negative].copy()
print(f"Count samples positive: {len(class1)}")
print(f"Count samples negative: {len(class2)}")

## Load the Embedding Model checkpoint and use it to create vector embeddings
**Embedding model:**  We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) hosted on HuggingFace to encode the movie review text.  We will save the embeddings to a pandas dataframe and then into the milvus database.

**Chunking:** Before embedding, it is necessary to decide your chunk and chunk overlap sizes.  In this hello_world demo, I will use `MAX_SEQ_LENGTH` for chunk size.

Note:  To keep your tokens private, best practice is to use an env variable.   <br>
In Jupyter, you need a `.env` file (in the same directory as the notebooks) containing lines like this:
- VARIABLE_NAME=value

In [None]:
# Import torch.
import torch
from torch.nn import functional as F

# Initialize torch settings
# Request deterministic cuDNN behavior and seed torch's random number
# generator so repeated runs are reproducible.
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 415
torch.manual_seed(RANDOM_SEED)

# Pick the compute device. GPU index 3 is preferred (the original choice), but
# a CUDA host with fewer than four GPUs does not have a "cuda:3"; in that case
# fall back to the first GPU. Without CUDA, run on the CPU.
PREFERRED_GPU_INDEX = 3
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    DEVICE = torch.device(f"cuda:{gpu_index}")
else:
    DEVICE = torch.device("cpu")
print(f"device: {DEVICE}")

import os

from dotenv import find_dotenv, load_dotenv
from huggingface_hub import login

# Locate a .env file and load the variables it defines into the environment.
_ = load_dotenv(find_dotenv())

# Authenticate with the HuggingFace hub using the token read from the
# environment, so the secret never appears in the notebook itself.
hub_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
login(token=hub_token)


In [None]:
from sentence_transformers import SentenceTransformer

# Fetch the embedding model from the HuggingFace model hub onto DEVICE.
model_name = "BAAI/bge-base-en-v1.5"
retriever = SentenceTransformer(model_name, device=DEVICE)
print(type(retriever))
print(retriever)

# Record the model's own limits for later cells.
# NOTE: this rebinds EMBEDDING_LENGTH and MAX_SEQ_LENGTH to the values
# reported by the model.
EMBEDDING_LENGTH = retriever.get_sentence_embedding_dimension()
MAX_SEQ_LENGTH = retriever.get_max_seq_length()
# Two spellings of the same constant exist; both are set to the same value.
HF_EOS_TOKEN_LENGTH = HF_TOKEN_EOS_LENGTH = 1

# Show the parameters that were picked up.
print(f"model_name: {model_name}")
print(f"EMBEDDING_LENGTH: {EMBEDDING_LENGTH}")
print(f"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk size is the model's sequence limit minus room for one end-of-sequence token.
# NOTE(review): length_function=len counts characters, while MAX_SEQ_LENGTH
# comes from the model's max sequence length — presumably a conservative
# choice; confirm.
chunk_size = MAX_SEQ_LENGTH - HF_TOKEN_EOS_LENGTH

# Overlap is 10% of the chunk size. It is converted to a plain int because it
# is a character count; np.round would yield a float such as 51.0.
chunk_overlap = int(round(chunk_size * 0.10))
print(f"chunk_size: {chunk_size}, overlap: {chunk_overlap}")

# Splitter that cuts text into pieces of at most chunk_size characters,
# repeating chunk_overlap characters between neighboring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)

def chunk_text(text):
    """Split one text into chunks with the module-level `text_splitter`.

    Args:
        text (str): the text to split.

    Returns:
        list: the chunks produced by the splitter, with empty ones removed.
    """
    # filter(None, ...) keeps only truthy items, i.e. non-empty chunks.
    return list(filter(None, text_splitter.split_text(text)))

**Only inserting 100 rows for demonstration purposes.**

This means the query results likely will not be very good.

In [None]:
# Prepare df for insertion into Milvus index.
# TODO - move all this to a utility function.

# Work on a copy of the first 100 rows only.
batch = df.head(100).copy()

# 1. Store movie_index as text; the collection schema declares it as VARCHAR.
batch["movie_index"] = batch["movie_index"].map(str)

# 2. Split every review into chunks, then give each chunk its own row.
batch['chunk'] = batch['text'].map(chunk_text)
batch = batch.explode('chunk', ignore_index=True)

# 3. Embed each chunk and scale every vector to unit L2 norm.
raw_embeddings = retriever.encode(batch['chunk'])
review_embeddings = F.normalize(torch.tensor(raw_embeddings), p=2, dim=1)
# Quick check that every row now has norm 1 (within tolerance).
norms = np.linalg.norm(review_embeddings, axis=1)
assert np.allclose(norms, 1.0, atol=1e-5)

# 4. Store the embeddings as a list of `numpy.ndarray`, each holding `numpy.float32` numbers.
converted_values = [np.float32(vector) for vector in review_embeddings]
batch['embeddings'] = converted_values

# 5. Reorder columns for convenience: index first, labels at the end.
new_order = ["movie_index", "text", "chunk", "embeddings", "label_int", "label"]
batch = batch[new_order]

# Inspect data; the checks below look at the first row only.
display(batch.head(2))
first_chunk = batch.chunk[0]
first_embedding = batch.embeddings[0]
assert len(first_chunk) <= MAX_SEQ_LENGTH-1
assert len(first_embedding) == EMBEDDING_LENGTH
print(f"type embeddings: {type(batch.embeddings)} of {type(batch.embeddings[0])}")
print(f"of numbers: {type(batch.embeddings[0][0])}")

In [None]:
# Inspect chunking of the data.
# Show one review in full, then the chunks it was split into. movie_index is
# an integer in `df` but a string in `batch`, hence 931 versus "931".
print("original")
print(df.loc[(df.movie_index==931), "text"])
print("chunked")
checkit = list(batch.loc[(batch.movie_index=="931"), "chunk"])
for chunk in checkit:
    print(chunk)

# Chunking looks good, drop the original text column.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError once "text" is gone.
batch = batch.drop(columns=["text"], errors="ignore")
batch.head(2)

## Insert data into Milvus

We can insert data directly from a pandas dataframe into Milvus.

In [None]:
# Insert data into the Milvus collection.
# `batch` is the prepared pandas DataFrame and is passed to insert() as-is.
print("Start inserting entities")
insert_result = mc.insert(batch)

# After final entity is inserted, call flush to stop growing segments left in memory.
mc.flush() 

# Inspect results.
# Prints the value returned by insert() and the collection's partitions.
print(insert_result)
print(mc.partitions) # list[Partition] objects

## Run a Semantic Search

Now we can search all the movie review embeddings to find the `TOP_K` movie reviews with the closest embeddings to a user's question.
- In this example, we'll search for a movie recommendation for a medical doctor.

Note:  The same model should always be used for consistency for all the embeddings.

In [None]:
# Before conducting a search or a query, you need to load the data into memory.
# The collection handle `mc` is loaded here so the search cells below can run.
mc.load()
print("Loaded milvus collection into memory.")

**Ask a question about your data**

Your custom data is mapped into a vector embedding space and those vector embeddings are saved into a vector database.  

Next, you can ask a question about your custom data!

In [None]:
# Define a sample question about your data.
# This natural-language question is embedded and searched in the cells below.
query = "I'm a medical doctor, what movie should I watch?"

# Inspect the length of the query.
# len() on a string counts characters, not tokens.
QUERY_LENGTH = len(query)
print(f"query length: {QUERY_LENGTH}")

**Embed the query using the same embedding model you used earlier**

In order for vector search to work, the query itself should be embedded with the same model used to create the collection you want to search.

In [None]:
# Encode the question with the same embedding model used for the collection.
query_embeddings = torch.tensor(retriever.encode([query]))
# Scale the vector to unit L2 norm.
query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
# Quick check that the norm is 1 (within tolerance).
norms = np.linalg.norm(query_embeddings, axis=1)
assert np.allclose(norms, 1.0, atol=1e-5)

# Convert to a list holding one float32 numpy array per query.
query_embeddings = [np.float32(vector) for vector in query_embeddings]

# Inspect the container type, the number of queries, and the element types.
first_vector = query_embeddings[0]
print(type(query_embeddings), len(query_embeddings), type(first_vector))
print(type(first_vector[0]))

## Execute a vector search

https://milvus.io/docs/search.md

In [None]:
# Execute a search.

# Return top k results with HNSW index.
TOP_K = 3

# Search-time parameters in the layout described in the Milvus search docs:
# index-specific knobs go under "params". For HNSW the search knob is `ef`.
# `M` is an index-build parameter, so it is not repeated here.
# metric_type is set to COSINE, the metric the index on "embeddings" was
# created with.
search_params = {
    "metric_type": "COSINE",
    "params": {"ef": 32},
}

# Time the search call itself.
start_time = time.time()
results = mc.search(
    data=query_embeddings,
    anns_field="embeddings",
    param=search_params,
    output_fields=["movie_index", "chunk", "label"],
    limit=TOP_K,
    consistency_level="Eventually"
    )

elapsed_time = time.time() - start_time
print(elapsed_time)

# results holds one entry per query vector; results[0] are the hits for the
# single query submitted here.
print(f"type: {type(results)}, count: {len(results[0])}")


## Assemble and inspect the query result

The query result is in the variable `results[0]` of type `'pymilvus.orm.search.SearchResult'`.  

In [None]:
# Distances from the query vector to every returned hit.
distances = results[0].distances

# Walk the hits once, pulling the chunk text, movie index and label of each.
hit_fields = [
    (
        hit.entity.get("chunk"),
        hit.entity.get("movie_index"),
        hit.entity.get("label"),
    )
    for hit in results[0]
]
texts = [fields_of_hit[0] for fields_of_hit in hit_fields]
movie_indexes = [fields_of_hit[1] for fields_of_hit in hit_fields]
labels = [fields_of_hit[2] for fields_of_hit in hit_fields]

# One tuple per hit: (distance, movie_index, text, label).
print_info = list(zip(distances, movie_indexes, texts, labels))

In [None]:
# Print the results.
# k: distance, movie_index, label, review text (first 100 characters)
for rank, (hit_distance, hit_movie_index, hit_text, hit_label) in enumerate(print_info):
    print(f"{rank}: {np.round(hit_distance,3)}, {hit_movie_index}, {hit_label}, {hit_text[:100]}")


In [None]:
# Shut down and cleanup the milvus server.
# Same stop-then-cleanup sequence the startup cell runs before starting the
# server, so the next run begins from a clean state.
default_server.stop()
default_server.cleanup()

In [None]:
%load_ext watermark
%watermark -a 'Christy Bergman' -v -p torch,transformers,milvus,pymilvus,langchain --conda