In [None]:
%pip install langchain pypdf pymongo langchain-openai tiktoken unstructured unstructured[local-inference] "unstructured[pdf]"


In [None]:
import os

# Display pip's metadata for the installed langchain package.
%pip show langchain

from platform import python_version
# Print the version string of the interpreter running this kernel.
print(python_version())

In [None]:
import json
import boto3
from botocore.exceptions import ClientError

def get_secret(secret_name="hackathon", region_name="us-east-1"):
    """Fetch a JSON secret from AWS Secrets Manager and return it decoded.

    Args:
        secret_name: Id of the secret to read. Defaults to ``"hackathon"``.
        region_name: AWS region of the Secrets Manager endpoint. Defaults to
            ``"us-east-1"``.

    Returns:
        The object produced by ``json.loads`` on the response's ``SecretString``.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged when
            ``get_secret_value`` fails. For the list of error codes see
            https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    """
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    response = client.get_secret_value(SecretId=secret_name)

    # The decoded payload is intentionally NOT printed: it holds API keys and
    # connection strings, and notebook cell output is saved with the file.
    return json.loads(response['SecretString'])

In [None]:
secret = get_secret()

# Each credential is kept in a notebook variable and mirrored into os.environ.
openai_api_key = secret["OPENAI_API_KEY"]
os.environ['OPENAI_API_KEY'] = openai_api_key

# NOTE(review): both URI variables (and the MONGODB_ATLAS_CLUSTER_URI environment
# variable) are populated from the "MONGODB_ATLAS_CLUSTER_URI_VOID" secret key.
# Confirm that no separate "MONGODB_ATLAS_CLUSTER_URI" key was intended.
MONGODB_ATLAS_CLUSTER_URI_VOID = secret["MONGODB_ATLAS_CLUSTER_URI_VOID"]
MONGODB_ATLAS_CLUSTER_URI = MONGODB_ATLAS_CLUSTER_URI_VOID
os.environ['MONGODB_ATLAS_CLUSTER_URI'] = MONGODB_ATLAS_CLUSTER_URI

langsmith_api_key = secret["LANGSMITH_API_KEY"]
os.environ['LANGSMITH_API_KEY'] = langsmith_api_key



In [None]:
from pymongo import MongoClient

# Identifiers for the target namespace and its Atlas Vector Search index.
DB_NAME = "Void_DB"
COLLECTION_NAME = "Outages"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

# Connect with the URI loaded from the secret and take a handle on the collection.
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI_VOID)
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

print(MONGODB_COLLECTION)

Now we will set up the environment variables for the MongoDB Atlas cluster.


In the example below, `embedding` is the name of the field that contains the embedding vector. Refer to the MongoDB Atlas Vector Search documentation for details on how to define an index. Name the index `{ATLAS_VECTOR_SEARCH_INDEX_NAME}` (here `vector_index`) and create it on the namespace `{DB_NAME}.{COLLECTION_NAME}` (here `Void_DB.Outages`). Finally, enter the following definition in the JSON editor on MongoDB Atlas:

~~~
{
 "fields": [{
   "type": "vector",
   "path": "embedding",
   "numDimensions": 1536,
   "similarity": "cosine"
 }]
}
~~~



# Load Data

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_and_process_docs(directory):
    """Load the documents that ``DirectoryLoader`` finds for ``directory``.

    Args:
        directory: Location handed to ``DirectoryLoader``.

    Returns:
        A new list containing the loaded documents, in the order the loader
        returned them.
    """
    loader = DirectoryLoader(directory)
    # Copy into a fresh list rather than handing back the loader's own object.
    return list(loader.load())

def split_docs(documents, chunk_size=1000, chunk_overlap=200):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def show_chunks(documents, chunk_size=1000, chunk_overlap=200, start=150, count=5):
    """Split ``documents`` and print a small window of the resulting chunks.

    Args:
        documents: Documents to split before printing.
        chunk_size: Forwarded to ``split_docs``.
        chunk_overlap: Forwarded to ``split_docs``.
        start: Zero-based index of the first chunk to print. Expected to be
            non-negative. Defaults to 150.
        count: Maximum number of chunks to print. Defaults to 5.

    Returns:
        None. Output goes to stdout only.
    """
    chunks = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Slicing never raises: when fewer than ``start`` chunks exist the window is
    # empty and nothing is printed.
    window = chunks[start:start + count]

    # Labels count from 1 within the window; they are not the chunks' absolute indices.
    for idx, chunk in enumerate(window, start=1):
        print(f"Chunk {idx}:")
        print(chunk)


In [None]:
# Folder holding the source documents. Set the VOID_DOCS_DIR environment variable
# to run the notebook on another machine; the original location is the default.
directory = os.environ.get("VOID_DOCS_DIR", "/home/ec2-user/SageMaker/Hackathon-1-24-24/void/")
documents = load_and_process_docs(directory)
docs = split_docs(documents)
print(f"{len(docs)} chunks successfully loaded and processed")

In [None]:
# NOTE(review): `docs` is already the output of split_docs, and show_chunks runs
# the splitter over its input again before printing. Confirm whether `documents`
# was the intended argument.
show_chunks(docs)

In [None]:
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings

# Embed the chunks with OpenAI and insert them, together with their embeddings,
# into the MongoDB Atlas collection.
# NOTE(review): presumably every run of this cell inserts the chunks again —
# confirm before re-running against a populated collection.
embedding_model = OpenAIEmbeddings(disallowed_special=())
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=embedding_model,
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [None]:
# Perform a similarity search between the embedding of the query and the embeddings of the documents
query = "Summarize the Outages"
results = vector_search.similarity_search(query)

# Guard the indexing below: an empty result list would otherwise raise IndexError.
if results:
    top_hit = results[0]
    print(top_hit)

    print(top_hit.page_content)
else:
    print(f"No documents matched the query: {query!r}")