In [1]:
%pip install langchain pypdf pymongo langchain-openai tiktoken unstructured unstructured[local-inference] "unstructured[pdf]"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

%pip show langchain

from platform import python_version
print(python_version())

Name: langchain
Version: 0.1.2
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages
Requires: aiohttp, async-timeout, dataclasses-json, jsonpatch, langchain-community, langchain-core, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 
Note: you may need to restart the kernel to use updated packages.
3.10.13


In [3]:
import json
import boto3
from botocore.exceptions import ClientError

def get_secret(secret_name="hackathon", region_name="us-east-1"):
    """Fetch a JSON secret from AWS Secrets Manager and return it parsed.

    Args:
        secret_name: Id of the secret to read (passed as ``SecretId``).
        region_name: AWS region in which the Secrets Manager client is created.

    Returns:
        The result of ``json.loads`` applied to the secret's ``SecretString``.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged from
            ``get_secret_value``. For the list of possible errors see
            https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
        KeyError: If the response has no ``SecretString`` entry.
        json.JSONDecodeError: If ``SecretString`` is not valid JSON.
    """
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # No try/except here: the original handler only re-raised, so letting the
    # error propagate directly is equivalent.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    # Deliberately NOT printed: the payload holds API keys and connection
    # strings, and printing it would store them in the notebook's saved output.
    return json.loads(get_secret_value_response['SecretString'])

In [None]:
# Read all credentials once, then expose each one both as a Python name and
# as an environment variable.
secret = get_secret()

openai_api_key = secret["OPENAI_API_KEY"]
os.environ['OPENAI_API_KEY'] = openai_api_key

# NOTE(review): both URI names below are read from the same
# "MONGODB_ATLAS_CLUSTER_URI_VOID" secret entry, so every consumer of either
# name talks to the same cluster — confirm this is intended.
MONGODB_ATLAS_CLUSTER_URI = secret["MONGODB_ATLAS_CLUSTER_URI_VOID"]
os.environ['MONGODB_ATLAS_CLUSTER_URI'] = MONGODB_ATLAS_CLUSTER_URI

MONGODB_ATLAS_CLUSTER_URI_VOID = secret["MONGODB_ATLAS_CLUSTER_URI_VOID"]
# Export under its own name (this line used to re-set MONGODB_ATLAS_CLUSTER_URI
# a second time, leaving the _VOID value without an environment variable).
os.environ['MONGODB_ATLAS_CLUSTER_URI_VOID'] = MONGODB_ATLAS_CLUSTER_URI_VOID

langsmith_api_key = secret["LANGSMITH_API_KEY"]
os.environ['LANGSMITH_API_KEY'] = langsmith_api_key

In [5]:
from pymongo import MongoClient

# Names of the Atlas database, collection and vector-search index used below
DB_NAME = "Void_DB"
COLLECTION_NAME = "Outages"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

# Open a MongoDB client on the cluster URI and take a handle to the collection
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI_VOID)
MONGODB_COLLECTION = client.get_database(DB_NAME).get_collection(COLLECTION_NAME)

In [6]:
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings

# Vector store bound to the "<database>.<collection>" namespace and the named
# Atlas search index, using OpenAI embeddings.
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=MONGODB_ATLAS_CLUSTER_URI,
    namespace=f"{DB_NAME}.{COLLECTION_NAME}",
    embedding=OpenAIEmbeddings(disallowed_special=()),
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

# Pre-filtering with Similarity Search
Atlas Vector Search supports pre-filtering using MQL operators. Below is an example index and query on the same data loaded above that allows you to do metadata filtering on the “page” field. You can update your existing index with the filter defined and do pre-filtering with vector search.

Use:

~~~
{
  "name": "index_name",
  "type": "vectorSearch",
  "fields":[
    {
      "type": "vector",
      "path": "embedding",
      "numDimensions": 1536,
      "similarity": "cosine"
    },
    {
      "type": "filter",
      "path": "page"
    }
  ]
}
~~~


In [7]:
# Don't run this cell without first adding the "page" filter field to the index JSON.
query = "Summarize the outages"

# Scored similarity search limited to documents whose "page" field equals 1
results = vector_search.similarity_search_with_score(
    query,
    k=5,
    pre_filter={"page": {"$eq": 1}},
)

# Print each returned result on its own line
for result in results:
    print(result)

# Similarity Search with Score

In [8]:
query = "What are the worst outages of 2023?"

# Unfiltered scored similarity search asking for k=5 results
results = vector_search.similarity_search_with_score(query, k=5)

# Print each returned result on its own line
for result in results:
    print(result)

(Document(page_content='reportedly unusable.\n\nWe will update this story as we learn more.\n\nSubscribe to our daily newsletters\n\nEmail:\n\nCountry:\n\nSelect...\n\nI have read and agree to the Terms and Conditions and Privacy Policy\n\nYes, I agree\n\nSubmit\n\nMore in Outages\n\n10 Nov 2023\n\n16 Jan 2024\n\nWhy you need a Digital Twin of your Data Centers\n\nWidespread Verizon outage impacts North America\n\n"Critical incident" resolved as IT back online at Sussex hospital in UK\n\nTags\n\nAWS AWS Lamba Amazon Web Services\n\n0 Comments\n\n\ue6031 Login\n\nStart the discussion…\n\nLOG IN WITH\n\nOR SIGN UP WITH DISQUS\n\n?\n\nName\n\n\uf109\n\nShare\n\nBest Newest Oldest\n\nBe the \x00rst to comment.\n\nSubscribe\n\nPrivacy\n\nDo Not Sell My Data\n\nWhitepapers\n\nThe iPDU Handbook\n\nWatch Now: Building a digital society - What market trends will be driving the industry forward?\n\n19 Jan 2024\n\nImproving data center reliability and e\x00ciency by solving power quality pain poi

# Question Answering

In [9]:
# Wrap the vector store as a retriever: similarity search with k=15 per query
qa_retriever = vector_search.as_retriever(
    search_kwargs=dict(k=15),
    search_type="similarity",
)

In [10]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} and {question}.
# Built from adjacent string literals; the resulting text is unchanged.
prompt_template = (
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say that you don't know, "
    "don't try to make up an answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [12]:
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

# Build the question-answering chain from the retriever and the custom prompt.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    # A real boolean: the string "True" only worked through implicit coercion.
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# `invoke` replaces calling the chain object directly, which has been
# deprecated since langchain 0.1.0.
docs = qa.invoke({"query": "What companies had outages in 2023"})

print(docs["result"])
# The retrieved source chunks are available as docs["source_documents"].



Some companies that experienced outages in 2023 include:

1. Amazon Web Services (AWS)

2. Cloudflare

3. Twitter

4. Stack Exchange Network

5. Burger King

6. The Associated Press

7. PlutoTV

8. Hinge

9. Delta

10. Goodreads


In [None]:
# Wrap the vector store as a retriever: similarity search with k=15 per query
qa_retriever = vector_search.as_retriever(
    search_kwargs=dict(k=15),
    search_type="similarity",
)

In [None]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} and {question}.
# Built from adjacent string literals; the resulting text is unchanged.
prompt_template = (
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say that you don't know, "
    "don't try to make up an answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

# Build the question-answering chain from the retriever and the custom prompt.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    # A real boolean: the string "True" only worked through implicit coercion.
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# `invoke` replaces calling the chain object directly, which has been
# deprecated since langchain 0.1.0.
docs = qa.invoke({"query": "What is Profound Knowledge?"})

print(docs["result"])
# The retrieved source chunks are available as docs["source_documents"].