# Load data

Load chunked context data embedding vectors into an indexed database.  We will use open source Milvus.

Demo in progress...

We need to first download the Milvus documentation to a local directory:

In [1]:
DOCS_PAGE="https://pymilvus.readthedocs.io/en/latest"
LOCAL_DIR="~/Documents/christy_coding_scratch/data/milvus_documentation"
!echo $DOCS_PAGE
!echo $LOCAL_DIR

# !wget -r -A.html $DOCS_PAGE -P $LOCAL_DIR

https://pymilvus.readthedocs.io/en/latest
/Users/christybergman/Documents/christy_coding_scratch/data/milvus_documentation


In [28]:
# Import common libraries.
import time, os
import numpy as np

# Import langchain.
#!pip install langchain html2text unstructured
# import html2text, unstructured
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Load Data
# loader = ReadTheDocsLoader(DOCS_PAGE)
loader = UnstructuredURLLoader(urls=[DOCS_PAGE])
data = loader.load()

print(f"loaded {len(data)} documents")
print(f"type: {type(data)}, len: {len(data)}, type: {type(data[0])}")

loaded 1 documents
type: <class 'list'>, len: 1, type: <class 'langchain.schema.document.Document'>


In [27]:
# Upload data to vectorstore.
# embeddings = OpenAIEmbeddings()
# vectorstore = FAISS.from_documents(documents, embeddings)

## Chunk and embed data

**First, choose an embedding model** <br>
Most tutorials default to the OpenAI embedding model, which costs money.  You don't have to do that.

In the code below, we will use an open source SentenceTransformer embedding model, hosted on HuggingFace.

In [25]:
# Import torch.
import torch

# Initialize torch settings
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 415
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(f"device: {DEVICE}")

from sentence_transformers import SentenceTransformer

# load the retriever model from huggingface model hub
model_name = "all-MiniLM-L12-v2"
retriever = SentenceTransformer(model_name, device=DEVICE)
print(type(retriever))
print(retriever)

# Save params for later.
# TOKENIZER_CONTEXT_WINDOW = retriever.get_max_seq_length()
CHUNK_SIZE = retriever.get_sentence_embedding_dimension()
chunk_overlap = np.round(CHUNK_SIZE * 0.15, 0)
print(f"embedding vector length: {CHUNK_SIZE}, chunk_overlap: {chunk_overlap}")
# 384, 58

device: cpu
<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)
embedding vector length: 384, chunk_overlap: 58.0


In [26]:
# Chunk the data using Langchain's HTML splitter
start_time = time.time()

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=CHUNK_SIZE,
    chunk_overlap=chunk_overlap,
    length_function=len)
documents = text_splitter.split_documents(data)
print(f"type: {type(documents)}, len: {len(data)}, type: {type(data[0])}")

end_time = time.time()
print(f"chunking time: {end_time - start_time}")

type: <class 'list'>, len: 1, type: <class 'langchain.schema.document.Document'>
chunking time: 0.003075122833251953


Log into HuggingFace using your [API token](https://huggingface.co/settings/tokens). 

💡 Best practice:  Read tokens from your environment. <br>
> Some people choose to create an input field, for user to type their token. <br>
> Either way, never hard-code your token into public code! 

In [31]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
from huggingface_hub import login

# Login to huggingface_hub
hub_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
login(token=hub_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/christybergman/.cache/huggingface/token
Login successful


In [None]:
# Embed chunks in batches of 100 chunks at a time.
# TODO - move all this to a utility function.

# Batch of data from pandas DataFrame.
batch = df.head(100).copy()

# 1. Change primary key type to string.
batch["movie_index"] = batch["movie_index"].apply(lambda x: str(x))

# 2. Truncate reviews to 512 characters.
batch["text"] = batch["text"].apply(lambda x: x[:512])

# 3. Add embeddings as new column in df.
review_embeddings = retriever.encode(batch['text']).tolist()
# Quick check if embeddings are normalized.
norms = np.linalg.norm(review_embeddings, axis=1)
assert np.allclose(norms, 1.0, atol=1e-5) == True

# 4. Convert the embeddings to np.float32
converted_values = list(map(np.float32, review_embeddings))
batch['embeddings'] = converted_values

# 5. Reorder columns so pk first, labels at end.
new_order = ["movie_index", "text", "embeddings", "label_int", "label"]
batch = batch[new_order]

display(batch.head(2))
assert len(batch.text[0]) == 512
assert len(batch.embeddings[0]) == TOKENIZER_EMBEDDING_LENGTH
print(batch.dtypes)
print(f"type embeddings: {type(batch.embeddings[0])}, {type(batch.embeddings[0][0])}")

# milvus field random, only supports list
# milvus field embeddings, supports numpy.ndarray and list