# Using VectorDB

In [11]:
import boto3
from langchain.llms.bedrock import Bedrock
from langchain.embeddings.bedrock import BedrockEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.vectorstores import FAISS

import os
from datetime import datetime

import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader

from botocore.config import Config
# Shared botocore client settings: fixed region, "standard" retry mode with
# max_attempts set to 10.
retry_config = Config(
    retries=dict(mode="standard", max_attempts=10),
    region_name="us-east-1",
)


In [12]:
import boto3

# AWS credentials are read from the environment instead of being hardcoded in
# the notebook. When a variable is unset the value is None and boto3 falls back
# to its default credential lookup.
# No trailing commas: `name = "value",` would bind a 1-tuple, not a string.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
service_name = 'bedrock-runtime'
region_name = 'us-east-1'

# Bedrock Runtime client built directly with the explicit credentials and region.
bedrock = boto3.client(
    "bedrock-runtime",
    region_name=region_name,
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

# Session holding the same explicit credentials; the S3 client is derived from it.
session = boto3.Session(
    region_name=region_name,
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

s3 = session.client(service_name='s3')


In [13]:
# Create the boto3 session from a named profile. Change `profile_name` to match
# the environment you are working in; it is the single place the profile is set.
profile_name = 'test-demo'
session = boto3.session.Session(profile_name=profile_name)

bedrock = session.client(
    service_name="bedrock-runtime",
    region_name="us-east-1"
)

# Create an S3 client using the session
s3 = session.client('s3')


"""
boto3 provides two different clients to invoke Bedrock operations:
1. bedrock : creating and managing Bedrock models.
2. bedrock-runtime : running inference using Bedrock models.
"""
# One client per Bedrock service name, both created from the session with the
# shared retry configuration.
boto3_bedrock = session.client(service_name="bedrock", config=retry_config)
boto3_bedrock_runtime = session.client(service_name="bedrock-runtime", config=retry_config)


'''
We will implement a RAG architecture. The goal is to build a vector store (a knowledge base that reduces hallucinations)
so that the model can refer to the data we have provided.

To achieve this, we first need to source the data (this can be archived PDFs/docs/txt/csv or any other datastore, even SQL tables).
The pipeline is:
 1. Source the datasets.
 2. Apply any required transformations.
 3. Split the text into chunks. [A common NLP step; chunking needs tuning to get better output.]
 4. Create embeddings using an embedding model. [Various embedding models can be used.]
'''

# Embedding model id, and the LangChain embeddings wrapper that calls it through
# the Bedrock runtime client.
EMBEDDINGS_MODEL_ID = 'amazon.titan-embed-text-v1'
brrkEmbeddings = BedrockEmbeddings(
    client=boto3_bedrock_runtime,
    model_id=EMBEDDINGS_MODEL_ID,
)


In [14]:
!pip install pypdf



In [15]:
def create_embeddings(directory_path, chunk_size=1000, chunk_overlap=100):
    """Load the PDFs in a directory, split them into chunks and index them in FAISS.

    Args:
        directory_path: Directory handed to ``PyPDFDirectoryLoader``.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The FAISS vector store built from the chunks with ``brrkEmbeddings``.

    Raises:
        ValueError: If the loader returns no documents or splitting yields no
            chunks (previously this surfaced as ZeroDivisionError/IndexError).
        Exception: If building the FAISS store fails; the original error is
            chained as the cause.
    """

    def _average_length(items):
        # Integer average of page_content lengths; only called on non-empty lists.
        return sum(len(item.page_content) for item in items) // len(items)

    print(f"Loading directory {directory_path}")
    documents = PyPDFDirectoryLoader(directory_path).load()
    if not documents:
        raise ValueError(f"No documents were loaded from {directory_path}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    docs = text_splitter.split_documents(documents)
    if not docs:
        raise ValueError(f"Splitting the documents from {directory_path} produced no chunks")

    avg_char_count_pre = _average_length(documents)
    avg_char_count_post = _average_length(docs)
    print(f'Average length among {len(documents)} documents loaded is {avg_char_count_pre} characters.')
    print(f'After the split we have {len(docs)} documents more than the original {len(documents)}.')
    print(f'Average length among {len(docs)} documents (after split) is {avg_char_count_post} characters.')

    # Embed a single chunk up front so the embedding values and size are shown.
    sample_embedding = np.array(brrkEmbeddings.embed_query(docs[0].page_content))
    print("Sample embedding of a document chunk: ", sample_embedding)
    print("Size of the embedding: ", sample_embedding.shape)

    print("Storing in a vector store")
    try:
        vector_store = FAISS.from_documents(docs, brrkEmbeddings)
    except Exception as exc:
        # Keep the original exception type/message, but chain the real cause.
        raise Exception("Failed to create vector store") from exc
    print("Created vector store")
    return vector_store

In [16]:
# Directory containing the source PDFs. Set the PDF_DATA_DIR environment variable
# to point at another location; the original local path remains the default.
data_path = os.environ.get("PDF_DATA_DIR", "C:\\Users\\RT\\OneDrive\\Desktop\\file")
vectorartifacts = create_embeddings(data_path)

Loading directory C:\Users\RT\OneDrive\Desktop\file
Average length among 16 documents loaded is 1517 characters.
After the split we have 35 documents more than the original 16.
Average length among 35 documents (after split) is 744 characters.
Sample embedding of a document chunk:  [ 0.19335938 -0.01031494  0.01507568 ...  0.23535156 -0.06787109
 -0.37890625]
Size of the embedding:  (1536,)
Storing in a vector store
Created vector store


In [17]:
"""
Vector Store: FAISS available through LangChain
In this notebook we are using in-memory vector-store to store both the embeddings and the documents.
In an enterprise context this could be replaced with a persistent store such as AWS OpenSearch, RDS Postgres with pgVector, ChromaDB, Pinecone or Weaviate.

"""
# NOTE(review): this rebuilds the same vector store as the previous cell, so the
# embedding calls are repeated. It reuses `data_path` rather than duplicating
# the directory literal.
vectorartifacts = create_embeddings(data_path)


Loading directory C:\Users\RT\OneDrive\Desktop\file
Average length among 16 documents loaded is 1517 characters.
After the split we have 35 documents more than the original 16.
Average length among 35 documents (after split) is 744 characters.
Sample embedding of a document chunk:  [ 0.19335938 -0.01031494  0.01507568 ...  0.23535156 -0.06787109
 -0.37890625]
Size of the embedding:  (1536,)
Storing in a vector store
Created vector store


In [18]:
def save_local_vector_store(vector_store, vector_store_path, embeddings_model_id=None):
    """Persist ``vector_store`` into a new timestamped ``.vs`` folder.

    Args:
        vector_store: Object exposing ``save_local(path)``.
        vector_store_path: Parent directory for the timestamped folder. An empty
            value selects the default ``../vector_store`` directory.
        embeddings_model_id: Model id written to the ``embeddings_model_id``
            file inside the folder. Defaults to the module-level
            ``EMBEDDINGS_MODEL_ID``.

    Returns:
        The path of the timestamped folder. Saving is best-effort: on failure
        the error is printed and the path is still returned.
    """
    time_now = datetime.now().strftime("%d%m%Y%H%M%S")
    # Resolve the default parent BEFORE appending the timestamp. The original
    # checked for an empty path after appending, so the default never applied.
    parent_dir = vector_store_path or os.path.join("..", "vector_store")
    vector_store_path = os.path.join(parent_dir, f"{time_now}.vs")
    try:
        model_id = EMBEDDINGS_MODEL_ID if embeddings_model_id is None else embeddings_model_id
        os.makedirs(parent_dir, exist_ok=True)
        vector_store.save_local(vector_store_path)
        with open(os.path.join(vector_store_path, "embeddings_model_id"), 'w') as f:
            f.write(model_id)
    except Exception as exc:
        # Still best-effort, but report why the save failed.
        print(f"Failed to save vector store, continuing without saving... ({exc})")
    return vector_store_path

# Build the target directory with os.path.join so the separators are consistent
# on every platform.
vector_store_path = os.path.join(os.getcwd(), 'vectorstore')
print("Vector store path: {}".format(vector_store_path))

save_local_vector_store_path = save_local_vector_store(vectorartifacts, vector_store_path)
# save_local_vector_store returns the path even when saving failed, so only
# report success when the folder actually exists.
if os.path.isdir(save_local_vector_store_path):
    print(f"Vector store got created in {save_local_vector_store_path}")
else:
    print(f"Vector store was NOT created in {save_local_vector_store_path}")


def load_local_vector_store(vector_store_path):
    """Load a FAISS store from ``vector_store_path``.

    The folder must contain an ``embeddings_model_id`` file. Its content is
    compared with ``EMBEDDINGS_MODEL_ID``; a store recorded with a different
    model id is rejected rather than loaded with ``brrkEmbeddings``.

    Args:
        vector_store_path: Folder holding the saved store.

    Returns:
        The loaded FAISS store, or ``None`` if it could not be loaded.
    """
    try:
        with open(os.path.join(vector_store_path, "embeddings_model_id"), 'r') as f:
            stored_model_id = f.read().strip()
        if stored_model_id != EMBEDDINGS_MODEL_ID:
            raise ValueError(
                f"vector store was built with '{stored_model_id}', "
                f"but the current embeddings model is '{EMBEDDINGS_MODEL_ID}'"
            )
        # NOTE(review): FAISS.load_local deserializes pickled data from disk;
        # only load stores from locations you trust.
        vector_store = FAISS.load_local(vector_store_path, brrkEmbeddings)
    except Exception as exc:
        print(f"Failed to load vector store, continuing creating one... ({exc})")
        return None
    print("Loaded vector store")
    return vector_store
        
# Round-trip check: load the store that was just saved and print the result.
print(load_local_vector_store(vector_store_path=save_local_vector_store_path))

Vector store path: C:\Users\RT/vectorstore
Vector store got created in C:\Users\RT/vectorstore/16112023101133.vs
Loaded vector store
<langchain.vectorstores.faiss.FAISS object at 0x000001C3F16D8210>
