# Summary

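This notebook runs the embedding process for multiple documents, copies the resulting Chroma embeddings to Google Cloud Storage, and then runs a test query against the local embeddings.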

In [1]:
import os

from embed_documents_3 import EmbedDocuments

## Test the embedding pipeline and GCS upload

In [3]:
# Set paths
# Root of the project's data directory. The default is the original local
# OneDrive location; set the CCC_POLICY_DATA_DIR environment variable to run
# this notebook against another copy of the data without editing the cell.
data_dir = os.environ.get(
    "CCC_POLICY_DATA_DIR",
    "/Users/stephengodfrey/OneDrive - numanticsolutions.com"
    "/Engagements/Projects/ccc_policy_assistant/data",
).rstrip("/")

input_text_path = f"{data_dir}/crawls/"  # trailing slash kept, as in the original value
embeddings_path = f"{data_dir}/embeddings-vai"
collection_name = "crawl_docs1"
gcs_bucket_name = "ccc-chromadb-vai"

embed_loader = EmbedDocuments(input_text_path=input_text_path,
                              embeddings_path=embeddings_path,
                              collection_name=collection_name,
                              gcs_bucket_name=gcs_bucket_name)

# Get input filenames
embed_loader.get_input_filenames()

# The read / chunk / embed steps below are commented out, so this run does not
# rebuild the embeddings; only the upload step at the end is executed.
# NOTE(review): uncomment all three to regenerate the store before uploading.

# Read input files
# embed_loader.read_text_data()

# Chunk text
# embed_loader.chunk_input_text()

# Embed
# embed_loader.embed()

# Copy Embeddings to GCS
embed_loader.copy_embeddings_to_gcs()


/Users/stephengodfrey/OneDrive - numanticsolutions.com/Engagements/Projects/ccc_policy_assistant/data/embeddings-vai
Uploaded /Users/stephengodfrey/OneDrive - numanticsolutions.com/Engagements/Projects/ccc_policy_assistant/data/embeddings-vai/chroma.sqlite3 to gs://ccc-chromadb-vai/chroma.sqlite3
Uploaded /Users/stephengodfrey/OneDrive - numanticsolutions.com/Engagements/Projects/ccc_policy_assistant/data/embeddings-vai/22e671e4-4c48-4404-b154-f1a722653a26/data_level0.bin to gs://ccc-chromadb-vai/22e671e4-4c48-4404-b154-f1a722653a26/data_level0.bin
Uploaded /Users/stephengodfrey/OneDrive - numanticsolutions.com/Engagements/Projects/ccc_policy_assistant/data/embeddings-vai/22e671e4-4c48-4404-b154-f1a722653a26/length.bin to gs://ccc-chromadb-vai/22e671e4-4c48-4404-b154-f1a722653a26/length.bin
Uploaded /Users/stephengodfrey/OneDrive - numanticsolutions.com/Engagements/Projects/ccc_policy_assistant/data/embeddings-vai/22e671e4-4c48-4404-b154-f1a722653a26/link_lists.bin to gs://ccc-chromadb

## Test the local embeddings

In [2]:
import os
import langchain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
import vertexai

import chromadb

In [3]:
# Location of the persisted Chroma store. Defaults to the original local
# OneDrive data directory; set the CCC_POLICY_DATA_DIR environment variable to
# point at another copy of the project's data directory without editing this cell.
data_dir = os.environ.get(
    "CCC_POLICY_DATA_DIR",
    "/Users/stephengodfrey/OneDrive - numanticsolutions.com"
    "/Engagements/Projects/ccc_policy_assistant/data",
).rstrip("/")
embeddings_path = f"{data_dir}/embeddings-vai"

# NOTE(review): the pipeline cell earlier in this notebook passes
# collection_name="crawl_docs1" to EmbedDocuments, but this cell reads
# "langchain" — confirm which collection the embedding step actually writes.
collection_name = "langchain"

# Open the on-disk Chroma store and look up the collection by name.
client = chromadb.PersistentClient(path=embeddings_path)

collection = client.get_collection(name=collection_name)

# Fetch the collection's stored records.
# NOTE(review): called without filters, so this presumably loads every record
# into memory — confirm that is intended for large collections.
documents = collection.get()

In [4]:
# --- Vertex AI setup ---------------------------------------------------------
project_id = "eternal-bongo-435614-b9"
location = "us-central1"
vertexai.init(project=project_id, location=location)

# NOTE(review): the query must be embedded with the same model that produced
# the stored vectors — confirm this matches what embed_documents_3 uses.
embedding_model = "textembedding-gecko@003"

# LLM handle; it is not used by the similarity query in this cell.
llm = VertexAI(
    model="gemini-1.5-pro",
    max_output_tokens=2048,
    temperature=0.2,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# --- Question and prompt -----------------------------------------------------
question = "what college is designated a Center of Excellence in bioprocessing?"
template = """
Question: {question}
Answer:
"""
# Prompt template; like `llm`, it is defined here but not used by the query below.
prompt = PromptTemplate(template=template, input_variables=["question"])

# --- Embed the question and search the collection ----------------------------
embeddings = VertexAIEmbeddings(model_name=embedding_model)

# Single embedding vector for the question text.
embedded_prompt = embeddings.embed_query(question)

n_neighbors = 3
results = collection.query(
    # Pass an explicit list containing one embedding instead of relying on
    # Chroma coercing a flat vector into a one-element batch.
    query_embeddings=[embedded_prompt],
    n_results=n_neighbors,
)



In [9]:
# Show which fields the query result carries.
print(results.keys())

# Other fields that can be inspected the same way:
# results["metadatas"]
# results["documents"]

# Distances of the returned neighbours; as the last expression in the cell,
# this is the value the notebook displays.
results["distances"]


dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'distances', 'included'])


[[0.6776347756385803, 0.6879165172576904, 0.7347260117530823]]