In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection

pd.options.display.max_colwidth = 100

df = pd.read_csv("./data/train.csv", index_col="id")
sampled_df = df.sample(100)[["question1","question2"]]
concat_df = pd.concat([sampled_df["question1"],sampled_df["question2"]], axis=0)
sentences = concat_df.to_list()
sentences[:5]

  from .autonotebook import tqdm as notebook_tqdm


['Which coaching institute provides best distance learning program for 10th class?',
 'How much will the bank FD rate of interest decrease in India in future?',
 'What is the best coaching institute for GMAT in Delhi NCR region?',
 'Was Obama right to abstain from the UN vote on settlements?',
 'What are the best TV series one should really watch?']

In [2]:
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = embedding_model.encode(sentences)



In [3]:
# Establish a connection to the Milvus server
connections.connect(host="localhost",port=19530)

# Define the schema for the collection
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="sentences", dtype=DataType.VARCHAR, is_primary=False, description="The actual sentences", max_length=1000),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, is_primary=False, description="The sentence embeddings", dim=384)
]

schema = CollectionSchema(fields, "A collection to store sentence embeddings")

In [4]:
# Create the collection in Milvus
kaggle_collection = Collection("kaggle_collection", schema)
entities = [
    sentences,  # The actual sentences
    embeddings,  # The sentence embeddings
]

# Insert our data into the collection
insert_result = kaggle_collection.insert(entities)

# Create an index to make future search queries faster
index = {"index_type": "FLAT", "metric_type": "COSINE"}
kaggle_collection.create_index("embeddings", index)
kaggle_collection.load() # Load the data into memory

In [5]:
question = "What should i learn to be a programmer ?"
question_embedding = embedding_model.encode(question)

# Perform the search
results = kaggle_collection.search([question_embedding], "embeddings", search_params = {"metric_type": "COSINE"}, limit=3, output_fields=["sentences"],param={})

# Print the search results
for result in results:
    for value in result:
        print(f"{value.entity.get('sentences')} | score - {value.distance}")


Which programming language should I learn: Java or JavaScript? | score - 0.5187628865242004
Which programming language should I learn Java or python? | score - 0.485379159450531
How can I learn new things? | score - 0.4258279800415039


In [6]:
# utility.drop_collection("kaggle_collection")