In [None]:
%pip install chromadb

In [None]:
import json
import boto3
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb

Setting Up the Environment

In [None]:
# Placeholders: replace both values with your own AWS region and the name of
# your local AWS credentials profile before running the cells below.
os.environ['AWS_DEFAULT_REGION'] = "your aws region"
os.environ['AWS_PROFILE'] = "your profile"

Embedding Model

In [None]:
class TitanEmbeddings(object):
    """Callable wrapper that requests a text embedding from a Bedrock model.

    An instance holds a boto3 client and a model id; calling the instance with
    a string sends one ``invoke_model`` request and returns the ``embedding``
    field of the JSON response.
    """

    # MIME types sent as the ``accept`` / ``contentType`` request arguments.
    accept = "application/json"
    content_type = "application/json"

    def __init__(self, model_id):
        """
        Create the client used for all embedding requests.
        Args:
            model_id (str): Identifier of the embedding model to invoke.
        """
        # NOTE(review): 'your service' is a placeholder. `invoke_model` is
        # presumably served by the Bedrock runtime client ("bedrock-runtime");
        # confirm and replace before running.
        self.bedrock = boto3.client(service_name='your service')
        self.model_id = model_id

    def __call__(self, text, dimensions=256, normalize=True):
        """
        Returns Embeddings
        Args:
            text (str): text to embed
            dimensions (int): Number of output dimensions. Defaults to 256.
            normalize (bool): Whether to return the normalized embedding or not.
                Defaults to True.
        Return:
            List[float]: Embedding

        """
        # Request payload; the keys are the ones the model endpoint reads.
        body = json.dumps({
            "inputText": text,
            "dimensions": dimensions,
            "normalize": normalize
        })

        response = self.bedrock.invoke_model(
            modelId=self.model_id,
            body=body,
            accept=self.accept,
            contentType=self.content_type,
        )

        # The response body is a stream-like object; read and decode it as JSON.
        response_body = json.loads(response.get('body').read())

        return response_body['embedding']

Preparing the data

In [None]:
# Embedding client used by the later cells to embed chunks and queries.
titan_embeddings_v2 = TitanEmbeddings(model_id="your embedding model")

# Read every .txt file directly under new_articles/ as a text document.
loader = DirectoryLoader(
    "new_articles/",
    loader_cls=TextLoader,
    glob="./*.txt",
)
document = loader.load()

# Cut the documents into overlapping chunks (600 characters, 200 shared
# between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=600,
)
texts = text_splitter.split_documents(document)

# Number of chunks produced.
print(len(texts))

Inserting the embeddings into ChromaDB


In [None]:
client = chromadb.Client()


# Create or get a collection. The name encodes the splitter settings used to
# produce `texts`: chunk_size = 600, chunk_overlap = 200.
collection_name = "embeddings_collection_600_200"
collection = client.get_or_create_collection(name=collection_name)


print("Creating vector Database...\n")

# One embedding request per chunk; the chunk's position gives it a unique id.
for pos, doc in enumerate(texts):
    embedding = titan_embeddings_v2(doc.page_content)

    # Store the chunk text alongside its embedding and metadata so that query
    # results can return the matching text, not only ids and distances.
    collection.add(
        ids=[f"doc_{pos}"],            # Unique ID for this chunk
        embeddings=[embedding],        # List of embeddings
        documents=[doc.page_content],  # List of source texts
        metadatas=[doc.metadata],      # List of metadata dicts
    )

print("Vector Db creation Done...")

Querying the collection

In [None]:
# Read the search query interactively from the user.
query_text = input()
collection_name = "embeddings_collection_600_200"
collection = client.get_collection(name=collection_name)

# Embed the query with the same model used for the stored chunks.
query_embedding = titan_embeddings_v2(query_text)


# Query the collection
results = collection.query(
    query_embeddings=[query_embedding],  # Embedding to query
    n_results=2,                         # Number of similar results to retrieve
)

# Display results
# `results` is a dict whose values hold one inner list per query embedding, so
# the number of matches is the length of the first inner list of ids
# (len(results) would only count the dict's keys).
print(f"Size: {len(results['ids'][0])}")
print(results["ids"])
print(results["distances"])
print(results["documents"])
print(results["metadatas"])
# "uris" may be absent from the result depending on the client version; use
# .get so a missing key prints None instead of raising KeyError.
print(results.get("uris"))