# IMDB Vector Search using Milvus Client

First, import some common libraries and define the data reading functions.

In [1]:
# For colab install these libraries in this order:
# !pip install milvus pymilvus langchain torch transformers sentence-transformers python-dotenv

# Import common libraries.
import time
import pandas as pd
import numpy as np

# Output words instead of scores.
def sentiment_score_to_name(score: float):
    """Translate a numeric sentiment score into a word label.

    Args:
        score (float): sentiment score; strictly positive means positive sentiment.

    Returns:
        str or None: "Positive" when score > 0, "Negative" when score <= 0,
        and None when neither comparison holds (for example NaN).
    """
    if score > 0:
        return "Positive"
    # Zero and negative scores are both labelled negative.
    return "Negative" if score <= 0 else None

# Split data into train, valid, test. 
def partition_dataset(df_input, smoke_test=False):
    """Shuffle a dataframe and cut it into train/validation/test splits.

    The cut points are fixed row offsets (35,000 and 40,000), chosen for an
    input of roughly 50K rows. Each split is also written to a CSV file
    ("train.csv", "val.csv", "test.csv") in the current working directory.

    Args:
        df_input (pandas.DataFrame): input data frame with exactly three
            columns, which are renamed to 'text', 'label_int', 'label'.
        smoke_test (boolean): accepted for interface compatibility; it is
            not used by the current implementation.

    Returns:
        df_shuffled, df_train, df_val, df_test (pandas.DataFrame): the full
        shuffled frame followed by its train, valid, test splits.
    """
    # Fixed row offsets separating train | val | test.
    train_end, val_end = 35_000, 40_000

    # Deterministic shuffle; reset_index() keeps the original row number as
    # the first column, which becomes the corpus index 'movie_index'.
    shuffled = df_input.sample(frac=1, random_state=1).reset_index()
    shuffled.columns = ['movie_index', 'text', 'label_int', 'label']

    train_part = shuffled.iloc[:train_end]
    val_part = shuffled.iloc[train_end:val_end]
    test_part = shuffled.iloc[val_end:]

    # Persist each split locally in its own file.
    outputs = (
        ("train.csv", train_part),
        ("val.csv", val_part),
        ("test.csv", test_part),
    )
    for csv_name, part in outputs:
        part.to_csv(csv_name, index=False, encoding="utf-8")

    return shuffled, train_part, val_part, test_part

# Take as input a user query and conduct semantic vector search using the query.
def mc_search_imdb(query, search_params, top_k, milvus_client=False):
    """Embed a query and run a semantic vector search against Milvus.

    Relies on notebook-level names defined in other cells: `retriever`
    (the embedding model), `mc` (the object whose `.search` is called),
    `COLLECTION_NAME`, and the `torch` import.

    Args:
        query: text passed to `retriever.encode`. The dim=1 normalization
            below needs a 2-D result, so the caller in this notebook passes
            a list of strings.
        search_params: index search parameters forwarded to `mc.search`.
        top_k: maximum number of hits requested (`limit`).
        milvus_client (bool): if True, use the MilvusClient call signature
            and dict-style hits; otherwise use the Milvus server call
            signature and attribute-style hits.

    Returns:
        list of tuples (distance, movie_index, chunk text, label) built
        from the hits of the first query only (`results[0]`).
    """
    # Embed with the same model used to build the collection, then scale
    # every row to unit length.
    embedded = torch.tensor(retriever.encode(query))
    embedded = torch.nn.functional.normalize(embedded, p=2, dim=1)
    # Sanity check: each row norm should be ~1.
    row_norms = np.linalg.norm(embedded, axis=1)
    assert np.allclose(row_norms, 1.0, atol=1e-5)
    # One np.float32 array per query row.
    query_vectors = [np.float32(row) for row in embedded]

    wanted_fields = ["movie_index", "chunk", "label"]

    if milvus_client:
        # MilvusClient search API call is slightly different.
        results = mc.search(
            COLLECTION_NAME,
            data=query_vectors,
            search_params=search_params,
            output_fields=wanted_fields,
            limit=top_k,
            consistency_level="Eventually",
        )
        # MilvusClient returns a list of lists of dicts.
        formatted_results = []
        for hit in results[0]:
            distance = hit['distance']
            chunk = hit['entity']['chunk']
            movie_index = hit['entity']['movie_index']
            label = hit['entity']['label']
            formatted_results.append((distance, movie_index, chunk, label))
        return formatted_results

    # Milvus server search API call.
    results = mc.search(
        data=query_vectors,
        anns_field="vector",
        param=search_params,
        output_fields=wanted_fields,
        limit=top_k,
        consistency_level="Eventually",
    )
    # Server results expose distances as an attribute and fields via entity.get().
    distances = results[0].distances
    texts, movie_indexes, labels = [], [], []
    for hit in results[0]:
        texts.append(hit.entity.get("chunk"))
        movie_indexes.append(hit.entity.get("movie_index"))
        labels.append(hit.entity.get("label"))

    # Zip the parallel lists into one list of tuples.
    return list(zip(distances, movie_indexes, texts, labels))

## Start up a local Milvus server.

Code in this notebook uses [Milvus client](https://milvus.io/docs/using_milvusclient.md) with [Milvus lite](https://milvus.io/docs/milvus_lite.md), which runs a local server.  ⛔️ Milvus lite is only meant for demos and local testing.
- pip install milvus pymilvus

💡 **For production purposes**, use a local Milvus docker, Milvus clusters, or fully-managed Milvus on Zilliz Cloud.
- [Local Milvus docker](https://milvus.io/docs/install_standalone-docker.md) requires local docker installed and running.
- [Milvus clusters](https://milvus.io/docs/install_cluster-milvusoperator.md) requires a K8s cluster up and running.
- [Zilliz Cloud free trial](https://cloud.zilliz.com/login) choose a "free" option when you provision.


In [4]:
from milvus import default_server
from pymilvus import (
    connections, utility, 
    MilvusClient,
)

# Cleanup previous data and stop server in case it is still running.
# Doing this first lets the cell be re-run without restarting the kernel.
default_server.stop()
default_server.cleanup()

# Start a new milvus-lite local server and time how long start() takes.
start_time = time.time()
default_server.start()

end_time = time.time()
print(f"Milvus server startup time: {end_time - start_time} sec")
# startup time: 5.6739208698272705

# Add wait to avoid error message from trying to connect.
# NOTE(review): fixed 15-second pause; presumably the server needs extra time
# after start() returns before it accepts connections - confirm.
time.sleep(15)

# Now you could connect with localhost and the given port.
# Port is defined by default_server.listen_port.
connections.connect(host='127.0.0.1', 
                  port=default_server.listen_port,
                  show_startup_banner=True)

# Check if the server is ready.
# Printing the server version doubles as a connectivity check.
print(utility.get_server_version())

Milvus server startup time: 8.441025018692017 sec
v2.2-testing-20230824-68-ga34a9d606-lite


## Load the Embedding Model checkpoint and use it to create vector embeddings
**Embedding model:**  We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) hosted on HuggingFace to encode the movie review text.  We will save the embeddings to a pandas dataframe and then into the milvus database.

💡 Note:  To keep your tokens private, the best practice is to store them in environment variables.   <br>
In Jupyter, you need a `.env` file (in the same directory as the notebook) containing lines like this:
- VARIABLE_NAME=value

In [5]:
# Import torch.
import torch
from torch.nn import functional as F
from sentence_transformers import SentenceTransformer

# Initialize torch settings.
# Request deterministic cuDNN behavior and seed torch's RNG for repeatable runs.
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 415
torch.manual_seed(RANDOM_SEED)

# Pick the compute device. GPU index 3 is the notebook's original preference;
# fall back to GPU 0 when fewer than four GPUs are visible, so CUDA machines
# with 1-3 GPUs do not fail on an invalid device ordinal. Use CPU otherwise.
PREFERRED_GPU_INDEX = 3
if torch.cuda.is_available():
    _gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    DEVICE = torch.device(f'cuda:{_gpu_index}')
else:
    DEVICE = torch.device('cpu')
print(f"device: {DEVICE}")

import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
from huggingface_hub import login

# Login to huggingface_hub
# The token is read from the environment (populated from .env by load_dotenv above).
# os.getenv returns None when HUGGINGFACEHUB_API_TOKEN is not set.
# NOTE(review): confirm how login() behaves with token=None in your environment.
hub_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
login(token=hub_token)

# Load the model from huggingface model hub.
# `retriever` is the embedding model used by every later encode() call in this notebook.
model_name = "BAAI/bge-base-en-v1.5"
retriever = SentenceTransformer(model_name, device=DEVICE)
print(type(retriever))
print(retriever)

# Get the model parameters and save for later.
# MAX_SEQ_LENGTH and HF_EOS_TOKEN_LENGTH feed the default chunk_size;
# EMBEDDING_LENGTH is the vector dimension used when creating the collection.
MAX_SEQ_LENGTH = retriever.get_max_seq_length() 
HF_EOS_TOKEN_LENGTH = 1
EMBEDDING_LENGTH = retriever.get_sentence_embedding_dimension()

# Inspect model parameters.
print(f"model_name: {model_name}")
print(f"EMBEDDING_LENGTH: {EMBEDDING_LENGTH}")
print(f"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}")

device: cpu
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/christybergman/.cache/huggingface/token
Login successful
<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
model_name: BAAI/bge-base-en-v1.5
EMBEDDING_LENGTH: 768
MAX_SEQ_LENGTH: 512


## Create a Milvus collection

You can think of a collection in Milvus like a "table" in SQL databases.  The **collection** will contain the 
- **Schema** (or no-schema Milvus Client).  
💡 You'll need the vector `EMBEDDING_LENGTH` parameter from your embedding model.
- **Vector index** for efficient vector search
- **Vector distance metric** for measuring nearest neighbor vectors
- **Consistency level**
In Milvus, transactional consistency is possible; however, according to the [CAP theorem](https://en.wikipedia.org/wiki/CAP_theorem), some latency must be sacrificed. 💡 Searching movie reviews is not mission-critical, so [`eventually`](https://milvus.io/docs/consistency.md) consistent is fine here.

## Add a Vector Index

The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits.  Most vector indexes use different sets of parameters depending on whether the database is:
- **inserting vectors** (creation mode) - vs - 
- **searching vectors** (search mode) 

Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus.  For example:
- FLAT - deterministic exhaustive search
- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)
- HNSW - Graph index (stochastic approximate search)

Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered "close" in vector space.  In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen.  Its possible distance metrics are one of:
- L2 - L2-norm
- IP - Dot-product
- COSINE - Angular distance

In [6]:
# Use embedding length from the embedding model.
print(f"Embedding length: {EMBEDDING_LENGTH}")

# Set the Milvus collection name.
COLLECTION_NAME = "movies"

# Define vector index algorithm params.
# INDEX_PARAMS['efConstruction'] is re-used later as the search-time "ef" value.
INDEX_PARAMS = dict({
    'M': 16,               # int. 4~64, num_neighbors, higher values takes more memory.
    "efConstruction": 32}  # int. 8~512, num_candidate_nearest_neighbors
    )

# Create the search index for local Milvus server.
# Show how to change the index parameters, instead of using defaults.
index_params = {
    "index_type": "HNSW", 
    "metric_type": "COSINE", 
    "params": INDEX_PARAMS
    }

# Use no-schema Milvus client (uses flexible json key:value format).
# https://milvus.io/docs/using_milvusclient.md
# NOTE(review): the uri carries no port, while connections.connect() above used
# default_server.listen_port - confirm both point at the same server.
mc = MilvusClient(uri="http://localhost")
# Drop any collection left over from a previous run so the cell can be re-run.
mc.drop_collection(COLLECTION_NAME)
# NOTE(review): confirm that this pymilvus version honors `params=index_params`
# (index type / metric) and `overwrite=True` in MilvusClient.create_collection.
mc.create_collection(COLLECTION_NAME, EMBEDDING_LENGTH, 
                                consistency_level="Eventually", 
                                auto_id=True,
                                overwrite=True,
                                params=index_params,)

# print("List collections:", mc.list_collections())
print(mc.describe_collection(COLLECTION_NAME))
print(f"Created collection: {COLLECTION_NAME}")

Embedding length: 768
{'collection_name': 'movies', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 768}}], 'aliases': [], 'collection_id': 445126122117267570, 'consistency_level': 3, 'properties': [], 'num_partitions': 1, 'enable_dynamic_field': True}
Created collection: movies


## Read CSV data into a pandas dataframe

The data used in this notebook is the [IMDB large movie review dataset](https://ai.stanford.edu/~amaas/data/sentiment/) from the Stanford AI Lab. It is a conveniently processed dataset of 50,000 reviews (50:50 sampled ratio of Positive/Negative reviews). This data has columns: movie_index, raw review text, and movie rating.

In [None]:
# Download data.
# https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# citation:  ACL 2011, @InProceedings{maas-EtAl:2011:ACL-HLT2011,
#   author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
#   title     = {Learning Word Vectors for Sentiment Analysis},
#   booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
#   month     = {June},
#   year      = {2011},
#   address   = {Portland, Oregon, USA},
#   publisher = {Association for Computational Linguistics},
#   pages     = {142--150},
#   url       = {http://www.aclweb.org/anthology/P11-1015}
# }

# Cleanup:  move data file to data/ folder.

In [7]:
# Read locally stored data and drop exact duplicate rows (first occurrence kept).
filepath = "data/movie_data.csv"
df = pd.read_csv(filepath).drop_duplicates(keep='first')

# Rename the two columns to the names used in the rest of the notebook.
df.columns = ['text', 'label_int']

# Map numbers to text 'Positive' and 'Negative' for sentiment labels.
df["label"] = df["label_int"].apply(sentiment_score_to_name)

# Split data into train/valid/test.
# `df` is rebound to the shuffled frame returned by partition_dataset.
df, df_train, df_val, df_test = partition_dataset(df, smoke_test=False)
print(f"original df shape: {df.shape}")
print(f"df_train shape: {df_train.shape}, df_val shape: {df_val.shape}, df_test shape: {df_test.shape}")
# The three splits must account for every row.
assert df_train.shape[0] + df_val.shape[0] + df_test.shape[0] == df.shape[0]

# Inspect data.
print(f"Example text length: {len(df.text[0])}")
print(f"Example text: {df.text[0]}")
display(df.head(2))


original df shape: (49582, 4)
df_train shape: (35000, 4), df_val shape: (5000, 4), df_test shape: (9582, 4)
Example text length: 677
Example text: Fot the most part, this movie feels like a "made-for-TV" effort. The direction is ham-fisted, the acting (with the exception of Fred Gwynne) is overwrought and soapy. Denise Crosby, particularly, delivers her lines like she's cold reading them off a cue card. Only one thing makes this film worth watching, and that is once Gage comes back from the "Semetary." There is something disturbing about watching a small child murder someone, and this movie might be more than some can handle just for that reason. It is absolutely bone-chilling. This film only does one thing right, but it knocks that one thing right out of the park. Worth seeing just for the last 10 minutes or so.


Unnamed: 0,movie_index,text,label_int,label
0,26813,"Fot the most part, this movie feels like a ""ma...",0,Negative
1,26581,Are you kidding me? The music was SO LOUD in t...,0,Negative


In [8]:
# Check that the two sentiment classes are roughly balanced in the training split.
class1 = df_train[df_train["label"] == "Positive"].copy()
class2 = df_train[df_train["label"] == "Negative"].copy()
print(f"Count samples positive: {class1.shape[0]}")
print(f"Count samples negative: {class2.shape[0]}")

Count samples positive: 17606
Count samples negative: 17394


## Chunking

Before embedding, it is necessary to decide your chunk strategy, chunk size, and chunk overlap.  In this demo, I will use:
- **Strategy** = Keep movie reviews as single chunks unless they are too long.
- **Chunk size** = The embedding model's parameter `MAX_SEQ_LENGTH`
- **Overlap** = Rule-of-thumb 10-15%
- **Function** = Langchain's convenient `RecursiveCharacterTextSplitter` to split up long reviews recursively.


In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Default chunk_size and overlap are calculated from embedding model parameters.
chunk_size = MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH

def chunk_text(text, chunk_size):
    """Split a text into overlapping chunks of at most `chunk_size` units.

    Lengths are measured with `len`, i.e. in characters, so `chunk_size`
    here is a character count.

    Args:
        text (str): the text to split.
        chunk_size (int): maximum chunk length passed to the splitter.

    Returns:
        list of str: the non-empty chunks produced by the splitter.
    """
    # Default chunk overlap is 10% of chunk_size. Keep it a plain int: the
    # splitter's size parameters are integers, and np.round returned a
    # numpy float here.
    chunk_overlap = int(round(chunk_size * 0.10))

    # Use langchain's convenient recursive chunking method.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    chunks = text_splitter.split_text(text)
    # Drop empty chunks.
    return [chunk for chunk in chunks if chunk]

**Demo batch size = 100 rows for demonstration purposes.**

This means the question results could be better with more data.

In [10]:
# Prepare df for insertion into Milvus index.

# Batch of data from pandas DataFrame.
# Only the first BATCH_SIZE reviews are used, to keep the demo fast.
BATCH_SIZE = 100
batch = df.head(BATCH_SIZE).copy()
print(f"original shape: {batch.shape}")
start_time = time.time()

# 1. Convert movie_index values to strings.
batch["movie_index"] = batch["movie_index"].apply(lambda x: str(x))

# 2. Split long reviews into chunks.
# Naive approach (not used): truncate each review to MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH characters.
# batch["text"] = batch["text"].apply(lambda x: x[:MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH])

# Better approach, use LangChain's utility function that adds chunk overlaps.

###############
## EXERCISE #1: Change NEW_CHUNK_SIZE to 256 below.  How many chunks (vectors) does this create?
## BONUS:       Can you explain why the default chunk_size creates a single chunk per review?
###############
NEW_CHUNK_SIZE = None 
if NEW_CHUNK_SIZE is not None:
    chunk_size = NEW_CHUNK_SIZE
print(f"chunk_size: {chunk_size}")
## END EXERCISE #1
## ANSWER:  542

# Each review becomes a list of chunks (see chunk_text).
batch['chunk'] = batch['text'].apply(chunk_text, chunk_size=chunk_size)
# Explode the 'chunk' column to create new rows for each chunk.
# ignore_index=True renumbers the rows 0..n-1 after the explode.
batch = batch.explode('chunk', ignore_index=True)
print(f"new shape: {batch.shape}")

# 3. Add embeddings as new column in df.
review_embeddings = torch.tensor(retriever.encode(batch['chunk']))
# Normalize embeddings to unit length.
review_embeddings = F.normalize(review_embeddings, p=2, dim=1)
# Quick check if embeddings are normalized.
norms = np.linalg.norm(review_embeddings, axis=1)
assert np.allclose(norms, 1.0, atol=1e-5) == True

# 4. Convert embeddings to list of `numpy.ndarray`, each containing `numpy.float32` numbers.
converted_values = list(map(np.float32, review_embeddings))
batch['vector'] = converted_values

# 5. Reorder columns for convenience, so index first, labels at end.
new_order = ["movie_index", "text", "chunk", "vector", "label_int", "label"]
batch = batch[new_order]

end_time = time.time()
print(f"Chunking + embedding time for {BATCH_SIZE} docs: {end_time - start_time} sec")

# Inspect data.
display(batch.head())
# Spot-check the first row only: chunk length and vector dimension.
assert len(batch.chunk[0]) <= MAX_SEQ_LENGTH-1
assert len(batch.vector[0]) == EMBEDDING_LENGTH
print(f"type embeddings: {type(batch.vector)} of {type(batch.vector[0])}")
print(f"of numbers: {type(batch.vector[0][0])}")

# Chunking looks good, drop the original text column.
# Only the chunk text, not the full review, is inserted into Milvus.
batch.drop(columns=["text"], inplace=True)

original shape: (100, 4)
chunk_size: 511
new shape: (290, 5)
Chunking + embedding time for 100 docs: 9.24232006072998 sec


Unnamed: 0,movie_index,text,chunk,vector,label_int,label
0,26813,"Fot the most part, this movie feels like a ""ma...","Fot the most part, this movie feels like a ""ma...","[-0.022307869, -0.038372956, -0.005369567, -0....",0,Negative
1,26813,"Fot the most part, this movie feels like a ""ma...",more than some can handle just for that reason...,"[0.02946616, -0.024044147, -0.011064137, -0.03...",0,Negative
2,26581,Are you kidding me? The music was SO LOUD in t...,Are you kidding me? The music was SO LOUD in t...,"[-0.016822321, -0.030674767, -0.041740056, 0.0...",0,Negative
3,26581,Are you kidding me? The music was SO LOUD in t...,And what does a Kansas teen know about shoppin...,"[0.035922922, -0.06197654, 0.008055181, -0.025...",0,Negative
4,40633,"First of all, I don't understand why some peop...","First of all, I don't understand why some peop...","[-0.0035528215, -0.042889904, -0.04559665, 0.0...",1,Positive


type embeddings: <class 'pandas.core.series.Series'> of <class 'numpy.ndarray'>
of numbers: <class 'numpy.float32'>


## Insert data into Milvus

We can insert a batch of data directly from a pandas dataframe into Milvus.

🤔 TODO: This would be a good place to demonstrate Milvus' scalability by using Ray together with Milvus to run batches in parallel. I'll do this in a future tutorial.

In [11]:
# Insert a batch of data into the Milvus collection.

# One dictionary per dataframe row, keyed by column name.
dict_list = [row.to_dict() for _, row in batch.iterrows()]

print("Start inserting entities")
start_time = time.time()
insert_result = mc.insert(COLLECTION_NAME, data=dict_list, progress_bar=True)
end_time = time.time()
print(f"Milvus insert time for {batch.shape[0]} vectors: {end_time - start_time} seconds")

# After final entity is inserted, call flush to stop growing segments left in memory.
mc.flush(COLLECTION_NAME)

# Inspect results.
print(insert_result)


Start inserting entities


100%|██████████| 1/1 [00:00<00:00, 31.68it/s]

Milvus insert time for 290 vectors: 0.03363490104675293 seconds





[445126122117267655, 445126122117267656, 445126122117267657, 445126122117267658, 445126122117267659, 445126122117267660, 445126122117267661, 445126122117267662, 445126122117267663, 445126122117267664, 445126122117267665, 445126122117267666, 445126122117267667, 445126122117267668, 445126122117267669, 445126122117267670, 445126122117267671, 445126122117267672, 445126122117267673, 445126122117267674, 445126122117267675, 445126122117267676, 445126122117267677, 445126122117267678, 445126122117267679, 445126122117267680, 445126122117267681, 445126122117267682, 445126122117267683, 445126122117267684, 445126122117267685, 445126122117267686, 445126122117267687, 445126122117267688, 445126122117267689, 445126122117267690, 445126122117267691, 445126122117267692, 445126122117267693, 445126122117267694, 445126122117267695, 445126122117267696, 445126122117267697, 445126122117267698, 445126122117267699, 445126122117267700, 445126122117267701, 445126122117267702, 445126122117267703, 445126122117267704,

## Run a Semantic Search

Now we can search all the movie review embeddings to find the `TOP_K` movie reviews with the closest embeddings to a user's query.
- In this example, we'll search for a movie recommendation for a medical doctor.

💡 The same model should always be used for consistency for all the embeddings.

In [12]:
# # Before conducting a search based on a query, you need to load the data into memory.
# mc.load()
# print("Loaded milvus collection into memory.")

## Ask a question about your data

So far in this demo notebook: 
1. Your custom data has been mapped into a vector embedding space
2. Those vector embeddings have been saved into a vector database

Next, you can ask a question about your custom data!

💡 In LLM lingo:
> **Query** is the generic term for user questions.  
A query is a list of multiple individual questions, up to maybe 1000 different questions!

> **Question** usually refers to a single user question.  
In our example below, the user question is "I'm a medical doctor, what movie should I watch?"

In [13]:
# Define a sample question about your data.
question = "I'm a medical doctor, what movie should I watch?"
# A query is a list of questions; this one holds a single question.
query = [question]

# Inspect the length (in characters) of the only question in the query.
QUERY_LENGTH = len(question)
print(f"query length: {QUERY_LENGTH}")

query length: 48


**Embed the question using the same embedding model you used earlier**

In order for vector search to work, the question itself should be embedded with the same model used to create the collection you want to search.

In [14]:
# Embed the query with the same model used to build the Milvus collection,
# then scale every row to unit length.
query_embeddings = torch.nn.functional.normalize(
    torch.tensor(retriever.encode(query)), p=2, dim=1
)
# Quick check that each embedding now has norm ~1.
norms = np.linalg.norm(query_embeddings, axis=1)
assert np.allclose(norms, 1.0, atol=1e-5)

# Convert the embeddings to a list holding one np.float32 array per question.
query_embeddings = [np.float32(row) for row in query_embeddings]

# Inspect data.
print(type(query_embeddings), len(query_embeddings), type(query_embeddings[0]))
print(type(query_embeddings[0][0]))

<class 'list'> 1 <class 'numpy.ndarray'>
<class 'numpy.float32'>


## Execute a vector search

Search Milvus using [PyMilvus API](https://milvus.io/docs/search.md).

💡 By their nature, vector searches are "semantic" searches.  For example, if you were to search for "leaky faucet": 
> **Traditional Key-word Search** - either or both words "leaky", "faucet" would have to match some text in order to return a web page or link text to the document.

> **Semantic search** - results containing words "drippy" "taps" would be returned as well because these words mean the same thing even though they are different words.

In [15]:
# Execute a search.

# Return top k results with HNSW index.
TOP_K = 3
# Re-use the index param for num_candidate_nearest_neighbors as the search-time "ef".
SEARCH_PARAMS = {"ef": INDEX_PARAMS['efConstruction']}

# Run semantic vector search using your query and the vector database.
start_time = time.time()
results = mc.search(
    COLLECTION_NAME,
    data=query_embeddings,
    search_params=SEARCH_PARAMS,
    output_fields=["movie_index", "chunk", "label"],
    limit=TOP_K,
    consistency_level="Eventually",
)
elapsed_time = time.time() - start_time
print(f"Search time: {elapsed_time} sec")

# Inspect search result: container type and number of hits for the first query.
print(f"type: {type(results)}, count: {len(results[0])}")


Search time: 0.005711078643798828 sec
type: <class 'list'>, count: 3


## Assemble and inspect the search result

The hits for our single question are in `results[0]`: MilvusClient returns a list (one entry per query) of lists of dicts (one dict per hit).

In [16]:
## Results returned from MilvusClient are in the form list of lists of dicts.

# Collect distance, review text, movie_index and label of every hit
# returned for the first (and only) query.
distances, texts, movie_indexes, labels = [], [], [], []
for result in results[0]:
    distances.append(result['distance'])
    entity = result['entity']
    texts.append(entity['chunk'])
    movie_indexes.append(entity['movie_index'])
    labels.append(entity['label'])

# Assemble all the results in a zipped list of
# (distance, movie_index, text, label) tuples.
formatted_results = list(zip(distances, movie_indexes, texts, labels))

In [17]:
# Print the results.
# k: distance, movie_index, label, review text
# Each row is a (distance, movie_index, text, label) tuple; only the first
# 100 characters of the review text are shown.
for i, row in enumerate(formatted_results):
    distance, movie_index, text, label = row
    print(f"{i}: {np.round(distance,3)}, {movie_index}, {label}, {text[:100]}")

#1:  2006, Serum, 
# 0: 0.541, 931, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b
# 1: 0.54, 20682, Positive, is not a horror movie, although it does contain some violent scenes, but is rather a comedy. A satir
# 2: 0.535, 12529, Positive, a good movie with a real good story. The fact that there are so many other big stars who


0: 0.541, 931, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b
1: 0.54, 20682, Positive, is not a horror movie, although it does contain some violent scenes, but is rather a comedy. A satir
2: 0.535, 12529, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha


## Try another question

This time just add the words **only good movies** to the question, see if the answers are any different?  

For semantically different questions, we expect the answers to be different.

To make the code easier to read, this time I'll just use the convenience function I defined at the top of this notebook.

In [18]:
# Take as input a user question and conduct semantic vector search using the question.
question = "I'm a medical doctor, what movie should I watch?"
new_question = "I'm a medical doctor, suggest only good movies to watch?"
# Request TOP_K hits (set in the earlier search cell) rather than a literal 3,
# so both searches return the same number of results and stay comparable.
new_results = mc_search_imdb([new_question], SEARCH_PARAMS, TOP_K, milvus_client=True)

# Print the results.
# k: distance, movie_index, label, review text
# Each row is a (distance, movie_index, text, label) tuple; only the first
# 100 characters of the review text are shown.
for i, row in enumerate(new_results):
    distance, movie_index, text, label = row
    print(f"{i}: {np.round(distance,3)}, {movie_index}, {label}, {text[:100]}")

# As expected, new_question answers are slightly different!
# 0: 0.562, 45719, Positive, the stories but helps Malkovich to provoke some thought.<br /><br />I'd say it is worth seeing and t
# 1: 0.562, 21791, Positive, to add that the dog (who's a pretty darn good actor himself!) comes in a close second.<br /><br />Al
# 2: 0.561, 12529, Positive, a good movie with a real good story. The fact that there are so many other big

0: 0.562, 45719, Positive, the stories but helps Malkovich to provoke some thought.<br /><br />I'd say it is worth seeing and t
1: 0.562, 21791, Positive, to add that the dog (who's a pretty darn good actor himself!) comes in a close second.<br /><br />Al
2: 0.561, 12529, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha


In [19]:
# Shut down and cleanup the milvus server.
# Same stop()/cleanup() pair that the start-up cell runs before starting the server.
default_server.stop()
default_server.cleanup()

In [20]:
# Record the Python/IPython versions, the listed package versions and the
# conda environment name, so readers can see what this notebook was run with.
%load_ext watermark
%watermark -a 'Christy Bergman' -v -p torch,transformers,milvus,pymilvus,langchain --conda

Author: Christy Bergman

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 8.15.0

torch       : 2.0.1
transformers: 4.33.2
milvus      : 2.3.0
pymilvus    : 2.3.0
langchain   : 0.0.301

conda environment: py310

