In [None]:
! python -m pip install --upgrade pymilvus langchain openai tiktoken pymongo sentence-transformers


## Milvus server setup

need to install and turn on the milvus server before running this code snippet.

tutorial to install/run/stop milvus server: 
https://milvus.io/docs/install_standalone-docker.md

In [2]:
from os import environ

MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

## Load data via MongoDB (with the ColAI client)

In [188]:
import pymongo
client = pymongo.MongoClient(
    "mongodb+srv://ColAccess:ColAI2023@crawl.xk12nv0.mongodb.net/?retryWrites=true&w=majority"
)
client.list_database_names()

['Crawl', 'admin', 'local']

In [189]:
db = client['Crawl']
db.list_collection_names()

['metadata']

In [190]:
collection = db['metadata']
files = list(collection.find())
files

[{'_id': ObjectId('64bb7c64bc23363224cb6651'),
  'id': 'newspaper+and+magazine+images+segmentation+dataset',
  'url': 'https://archive.ics.uci.edu/dataset/306/newspaper+and+magazine+images+segmentation+dataset',
  'Title': 'Newspaper and magazine images segmentation dataset',
  'Description': ' Dataset is well suited for segmentation tasks. It contains 101 scanned pages from different newspapers and magazines in Russian with ground truth pixel-based masks.',
  'Dataset Characteristics': '-',
  'Subject Area': 'Computer',
  'Associated Tasks': 'Classification',
  'Attribute Type': '-',
  '# Instances': '101',
  '# Attributes': '-'},
 {'_id': ObjectId('64bb7c65bc23363224cb6652'),
  'id': 'rice+cammeo+and+osmancik',
  'url': 'https://archive.ics.uci.edu/dataset/545/rice+cammeo+and+osmancik',
  'Title': 'Rice (Cammeo and Osmancik)',
  'Description': " A total of 3810 rice grain's images were taken for the two species, processed and feature inferences were made. 7 morphological features wer

In [191]:
collection.count_documents({})

624

In [192]:
files[0].keys()

dict_keys(['_id', 'id', 'url', 'Title', 'Description', 'Dataset Characteristics', 'Subject Area', 'Associated Tasks', 'Attribute Type', '# Instances', '# Attributes'])

## Convert files to `Ducument` type

'Document' must be defined in the fashion same as the data type returns by WebBaseLoader from langchain, 

where 'metadata' must include key 'source', 'title', 'description', and 'language' (all in lower case),

still figuring out a way to make it more flexible

In [193]:
from collections import namedtuple

Document = namedtuple("Document", ["page_content", "metadata"])
docs = [
    Document(file['Description'], {"Subject Area": file['Subject Area'], "title": file['Title'], "source": file['url'], "description": file['Description'], "language": "en"})
    for file in files
]

In [194]:
docs[0].page_content

' Dataset is well suited for segmentation tasks. It contains 101 scanned pages from different newspapers and magazines in Russian with ground truth pixel-based masks.'

## Split texts

In [195]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
docs_split = text_splitter.split_documents(docs)

In [196]:
docs_split

[Document(page_content='Dataset is well suited for segmentation tasks. It contains 101 scanned pages from different newspapers and magazines in Russian with ground truth pixel-based masks.', metadata={'Subject Area': 'Computer', 'title': 'Newspaper and magazine images segmentation dataset', 'source': 'https://archive.ics.uci.edu/dataset/306/newspaper+and+magazine+images+segmentation+dataset', 'description': ' Dataset is well suited for segmentation tasks. It contains 101 scanned pages from different newspapers and magazines in Russian with ground truth pixel-based masks.', 'language': 'en'}),
 Document(page_content="A total of 3810 rice grain's images were taken for the two species, processed and feature inferences were made. 7 morphological features were obtained for each grain of rice.", metadata={'Subject Area': 'Computer', 'title': 'Rice (Cammeo and Osmancik)', 'source': 'https://archive.ics.uci.edu/dataset/545/rice+cammeo+and+osmancik', 'description': " A total of 3810 rice grain'

## Create embedding model

In [199]:
# !pip install -q sentence-transformers
# !pip install -q langchain

from langchain.embeddings import HuggingFaceEmbeddings
import torch

model_name = "sentence-transformers/all-MiniLM-L6-v2" # all-MiniLM-L6-v2: structure of SBert
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embed_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

## Create vector store

In [200]:
# note the `embedding` argument is not the embedded value but the embedding model

from langchain.vectorstores import Milvus

vector_store = Milvus.from_documents(
    docs_split,
    embedding=embed_model,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}
)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [201]:
vector_store

<langchain.vectorstores.milvus.Milvus at 0x7f99fe397040>

## Similarity Searches

In [202]:
# Try text-to-text similarity searches 

query1 = "Can I find a dataset of newspapers and magazines in Russian?"
docs1 = vector_store.similarity_search(query1)

print(docs1)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[Document(page_content='Dataset is well suited for segmentation tasks. It contains 101 scanned pages from different newspapers and magazines in Russian with ground truth pixel-based masks.', metadata={'source': 'https://archive.ics.uci.edu/dataset/306/newspaper+and+magazine+images+segmentation+dataset', 'title': 'Newspaper and magazine images segmentation dataset', 'description': ' Dataset is well suited for segmentation tasks. It contains 101 scanned pages from different newspapers and magazines in Russian with ground truth pixel-based masks.', 'language': 'en'}), Document(page_content='Dataset is well suited for segmentation tasks. It contains 101 scanned pages from different newspapers and magazines in Russian with ground truth pixel-based masks.', metadata={'source': 'https://archive.ics.uci.edu/dataset/306/newspaper+and+magazine+images+segmentation+dataset', 'title': 'Newspaper and magazine images segmentation dataset', 'description': ' Dataset is well suited for segmentation task

In [203]:
# Try text-to-text similarity searches 

query2 = "Can I find a dataset of old films?"
docs2 = vector_store.similarity_search(query2)

print(docs2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[Document(page_content='This data set contains a list of over 10000 films including many older, odd, and cult films. There is information on actors, casts, directors, producers, studios, etc.', metadata={'source': 'https://archive.ics.uci.edu/dataset/132/movie', 'title': 'Movie', 'description': ' This data set contains a list of over 10000 films including many older, odd, and cult films. There is information on actors, casts, directors, producers, studios, etc.', 'language': 'en'}), Document(page_content='This data set contains a list of over 10000 films including many older, odd, and cult films. There is information on actors, casts, directors, producers, studios, etc.', metadata={'source': 'https://archive.ics.uci.edu/dataset/132/movie', 'title': 'Movie', 'description': ' This data set contains a list of over 10000 films including many older, odd, and cult films. There is information on actors, casts, directors, producers, studios, etc.', 'language': 'en'}), Document(page_content='Th

## ---------------------------- End of file ----------------------------

## Appendix: WebBaseLoader in the tutorial example

tutorial: https://milvus.io/docs/integrate_with_langchain.md

In [139]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader([
    "https://milvus.io/docs/overview.md",
])

docs_example = loader.load()

In [93]:
type(docs_example)

list

In [53]:
docs_example

[Document(page_content="Introduction Milvus documentationDocsTutorialsToolsBlogCommunityStars0Join SlackTry Managed Milvus FREESearchHomev2.2.x\u200bAbout MilvusWhat is MilvusMilvus AdoptersMilvus RoadmapMilvus LimitsReleasesEnhancement ProposalsBootcampGet StartedUser GuideAdministration GuideIntegrationsBenchmarksToolsReferenceExample ApplicationsFAQsAPI referenceIntroduction\nThis page aims to give you an overview of Milvus by answering several questions. After reading this page, you will learn what Milvus is and how it works, as well as the key concepts, why use Milvus, supported indexes and metrics, example applications, the architecture, and relevant tools.\nWhat is Milvus vector database?\nMilvus was created in 2019 with a singular goal: store, index, and manage massive embedding vectors generated by deep neural networks and other machine learning (ML) models.\nAs a database specifically designed to handle queries over input vectors, it is capable of indexing vectors on a trilli

In [142]:
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
docs_example = text_splitter.split_documents(docs_example)

Created a chunk of size 1743, which is longer than the specified 1024
Created a chunk of size 1278, which is longer than the specified 1024


In [143]:
docs_example

[Document(page_content='Introduction Milvus documentationDocsTutorialsToolsBlogCommunityStars0Join SlackTry Managed Milvus FREESearchHomev2.2.x\u200bAbout MilvusWhat is MilvusMilvus AdoptersMilvus RoadmapMilvus LimitsReleasesEnhancement ProposalsBootcampGet StartedUser GuideAdministration GuideIntegrationsBenchmarksToolsReferenceExample ApplicationsFAQsAPI referenceIntroduction\nThis page aims to give you an overview of Milvus by answering several questions. After reading this page, you will learn what Milvus is and how it works, as well as the key concepts, why use Milvus, supported indexes and metrics, example applications, the architecture, and relevant tools.\nWhat is Milvus vector database?\nMilvus was created in 2019 with a singular goal: store, index, and manage massive embedding vectors generated by deep neural networks and other machine learning (ML) models.\nAs a database specifically designed to handle queries over input vectors, it is capable of indexing vectors on a trilli

In [128]:
# Embedding

# embeddings = OpenAIEmbeddings(model="ada")

!pip install -q sentence-transformers
!pip install -q langchain

from langchain.embeddings import HuggingFaceEmbeddings
import torch

model_name = "sentence-transformers/all-MiniLM-L6-v2" # all-MiniLM-L6-v2: structure of SBert
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embed_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [167]:
vector_store = Milvus.from_documents(
    docs_example,
    embedding=embed_model,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [163]:
# Try text-to-text similarity searches 

query = "What is milvus?"
docs = vector_store.similarity_search(query)

print(docs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[Document(page_content='Milvus workflow.', metadata={'source': 'https://milvus.io/docs/overview.md', 'title': 'Introduction Milvus documentation', 'description': 'Milvus is an open-source vector database designed specifically for AI application development, embeddings similarity search, and MLOps v2.2.x.', 'language': 'en'}), Document(page_content='Milvus workflow.', metadata={'source': 'https://milvus.io/docs/overview.md', 'title': 'Introduction Milvus documentation', 'description': 'Milvus is an open-source vector database designed specifically for AI application development, embeddings similarity search, and MLOps v2.2.x.', 'language': 'en'}), Document(page_content="Installation Prerequisites\nInstall Milvus Standalone\n\n\nIf you're interested in diving deep into the design details of Milvus:\n\nRead about Milvus architecture\n\n\nEdit this pageReport a bugRequest doc changesOn this pageIntroductionWhat is Milvus vector database?Key conceptsWhy Milvus?What indexes and metrics are s