In [1]:
import os
import getpass

# Prompt for the key interactively so it is never stored in the notebook itself.
# NOTE(review): no cell visible in this notebook references OpenAI — confirm the key is still needed.
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

# Embeddings with Chroma Vector DB

In [2]:
import tqdm as notebook_tqdm

## 1. Basic Example

In [None]:
# import
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [None]:
# Load ../data/state_of_the_union.txt with TextLoader.
# This cell only loads; splitting into chunks is done in a separate cell.
loader = TextLoader("../data/state_of_the_union.txt")
documents = loader.load()
# Number of Document objects returned by the loader.
len(documents)

In [None]:
# Echo the loaded documents for inspection.
documents

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=0).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
# Number of chunks produced by the splitter.
len(docs)

In [None]:
# Inspect the first chunk.
docs[0]

In [None]:
# create the open-source embedding function (SentenceTransformer model "all-MiniLM-L6-v2")
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# embed the chunks and load them into a Chroma vector store
# NOTE(review): no persist directory is passed, so the store is presumably in-memory only — confirm
db = Chroma.from_documents(docs, embedding_function)

In [None]:
# Query the Chroma store for the chunks most similar to the question.
query = "What did the president say about Ketanji Brown Jackson"

# Keep the hits under their own name. Rebinding `docs` here would overwrite the
# chunk list, so re-running the Chroma.from_documents cell afterwards would
# index the search hits instead of the original chunks.
chroma_hits = db.similarity_search(query)

# Show the best match; guard against an empty result list, which would
# otherwise raise IndexError on [0].
if chroma_hits:
    print(chroma_hits[0].page_content)
else:
    print("No matching chunks found.")

# Embeddings with FAISS Vector DB

## Data Loader
> Load data from CSV

In [3]:
# langchain
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
# Load the product list CSV with CSVLoader.
# NOTE(review): this rebinds `loader` and `documents`, which the Chroma example above also uses.
loader = CSVLoader(file_path='../data/product_list_dev_enriched_v1.csv')
documents = loader.load()

In [5]:
# Break the CSV documents into small overlapping chunks
# (chunk_size=100, chunk_overlap=20, lengths measured with len).
# NOTE(review): the recorded search output further down contains a fragment
# without an ITEM_ID (row 2176), which suggests rows longer than 100 get split
# mid-record — confirm whether a larger chunk_size is wanted.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
texts = text_splitter.split_documents(documents)

In [6]:
# Sanity check: number of chunks together with the first chunk.
len(texts), texts[0]

(16812,
 Document(page_content='ITEM_ID: 2909-0006-DY\nname: CORCHO CORK ROJO HOJA (07.94) 5/16X9X18" (14 PZ)', metadata={'source': '../data/product_list_dev_enriched_v1.csv', 'row': 0}))

## Embedding Helper Functions

In [7]:
import pickle
import faiss
from langchain.vectorstores import FAISS

In [8]:
def store_embeddings(docs, embeddings, store_name, path):
    """Build a FAISS vector store from ``docs`` and pickle it to disk.

    The store is written to ``<path>/faiss_<store_name>.pkl``.

    Args:
        docs: Documents handed to ``FAISS.from_documents``.
        embeddings: Embedding object handed to ``FAISS.from_documents``.
        store_name: Name fragment used in the pickle file name.
        path: Directory that receives the pickle file. It is created
            (including missing parents) when it does not exist yet.

    Returns:
        None.
    """
    target = f"{path}/faiss_{store_name}.pkl"

    # Create the output directory *before* building the store: otherwise a
    # missing folder only surfaces as FileNotFoundError in open(), after the
    # embedding work has already been done and would then be thrown away.
    os.makedirs(os.path.dirname(target), exist_ok=True)

    vector_store = FAISS.from_documents(docs, embeddings)

    with open(target, "wb") as f:
        pickle.dump(vector_store, f)

In [9]:
def load_embeddings(store_name, path):
    """Read back the object pickled at ``<path>/faiss_<store_name>.pkl``.

    Args:
        store_name: Name fragment used in the pickle file name.
        path: Directory containing the pickle file.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        FileNotFoundError: If the pickle file does not exist.

    Warning:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    pickle_file = f"{path}/faiss_{store_name}.pkl"
    with open(pickle_file, "rb") as handle:
        restored = pickle.load(handle)
    return restored

## Instructor Embeddings

In [10]:
# InstructorEmbedding
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [11]:
# https://api.python.langchain.com/en/latest/embeddings/langchain.embeddings.huggingface.HuggingFaceInstructEmbeddings.html
# Instructor embedding model "hkunlp/instructor-large", configured to run on CPU.
# NOTE(review): embed_instruction is presumably the instruction prepended when
# embedding documents — confirm against the linked API reference.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    embed_instruction="Represent the pfeifer product: ",
    model_kwargs={"device": "cpu"}
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [14]:
# Directory that receives the pickled vector store.
ie_vs_path = '../output/v1/vector_store'

# Embed `texts` and pickle the resulting FAISS store as
# faiss_instruct_embeddings.pkl under ie_vs_path.
# NOTE(review): store_embeddings calls FAISS.from_documents itself, and the
# following cell calls it again on the same inputs, so the embeddings are
# computed twice. This cell's execution count (14) is also later than the
# cells after it (12, 13), i.e. the notebook was not run top to bottom.
store_embeddings(
    texts,
    instructor_embeddings,
    'instruct_embeddings',
    ie_vs_path
)

In [12]:
# Build a FAISS vector store over the CSV chunks using the Instructor embeddings.
# NOTE(review): rebinds `db`, which the Chroma example above also uses.
db = FAISS.from_documents(texts, instructor_embeddings)

In [13]:
# Save the FAISS store with save_local; it is reloaded from this same path
# with FAISS.load_local further down.
faiss_vs_path = '../output/v1/vector_store/faiss_vector_store'
db.save_local(faiss_vs_path)

In [15]:
# Retrieve the 5 chunks most similar to the query and print the top one.
# NOTE(review): rebinds `docs`, which the Chroma example above also uses.
docs = db.similarity_search("tinta opaca amarilla", k=5)
print(docs[0].page_content)

ITEM_ID: 1851-0016-HG
name: TINTA PANTONE PLUS BLANCO OPACO 47N5240 (LT 1.50 KG)


In [16]:
# Show all hits returned by the previous search.
docs

[Document(page_content='ITEM_ID: 1851-0016-HG\nname: TINTA PANTONE PLUS BLANCO OPACO 47N5240 (LT 1.50 KG)', metadata={'source': '../data/product_list_dev_enriched_v1.csv', 'row': 432}),
 Document(page_content='ACALTA VEL. 18% TUNGSTENO', metadata={'source': '../data/product_list_dev_enriched_v1.csv', 'row': 2176}),
 Document(page_content='ITEM_ID: 1851-0006-HG\nname: TINTA PANTONE PLUS ROJO RODAMINA (LT 1.00 KG)', metadata={'source': '../data/product_list_dev_enriched_v1.csv', 'row': 590}),
 Document(page_content='ITEM_ID: 1821-0001-HG\nname: TINTA SELECC REFLECTA BIO AMARILLO (LT 2.50 KG)', metadata={'source': '../data/product_list_dev_enriched_v1.csv', 'row': 420}),
 Document(page_content='ITEM_ID: 1851-0001-HG\nname: TINTA PANTONE PLUS AMARILLO (LT 1.00 KG)', metadata={'source': '../data/product_list_dev_enriched_v1.csv', 'row': 586})]

In [17]:
# Reload the store saved with save_local, using the same embedding object.
# NOTE(review): newer LangChain releases reportedly require
# allow_dangerous_deserialization=True here — confirm against the installed version.
vector_db = FAISS.load_local(faiss_vs_path, instructor_embeddings)

In [18]:
# Run the same query against the reloaded store; the recorded top hit matches
# the one from the in-memory store above.
docs = vector_db.similarity_search("tinta opaca amarilla", k=5)
print(docs[0].page_content)

ITEM_ID: 1851-0016-HG
name: TINTA PANTONE PLUS BLANCO OPACO 47N5240 (LT 1.50 KG)
