In [59]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq  
import os
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
# Folder that holds the source PDF files.
DATA_PATH = 'Banking_data/'


def load_pdf_files(data):
    """Load the PDF files under ``data`` that match ``*.pdf``.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (presumably one document per PDF page, given that the caller
        reports the result as a page count - confirm against PyPDFLoader).
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Read every PDF under DATA_PATH and report how many documents came back.
documents = load_pdf_files(DATA_PATH)
print('Length of PDF Pages', len(documents))

Length of PDF Pages 968


In [61]:
def create_chunks(extracted_data, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (the output of ``load_pdf_files``).
        chunk_size: Maximum chunk size passed to the splitter. Defaults to
            1000, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 200.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Split the loaded documents and report how many chunks were produced.
text_chunks = create_chunks(documents)
print('Length of Text Chunks', len(text_chunks))

Length of Text Chunks 3134


In [62]:
def get_embedding_model():
    """Create the HuggingFace embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
    )

# Instantiate the sentence-embedding model.
embedding_model = get_embedding_model()

In [63]:
# DB_FAISS_PATH='vectorstore/db_faiss'
# vectorstore=FAISS.from_documents(text_chunks, embedding_model)
# vectorstore.save_local(DB_FAISS_PATH)

In [64]:
DB_FAISS_PATH = 'vectorstore/db_faiss'

# Load the persisted FAISS index when it exists; otherwise build it from the
# chunks and save it, so a fresh checkout survives Restart & Run All.
# `embedding_model` from the get_embedding_model() cell is reused rather than
# constructing a second, identical model here.
if os.path.exists(os.path.join(DB_FAISS_PATH, 'index.faiss')):
    # SECURITY: load_local unpickles the stored docstore, which can execute
    # arbitrary code. Keep this flag only for index folders you created
    # yourself; never point it at an untrusted directory.
    vectorstore = FAISS.load_local(
        DB_FAISS_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
    )
else:
    vectorstore = FAISS.from_documents(text_chunks, embedding_model)
    vectorstore.save_local(DB_FAISS_PATH)

In [82]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Cache for the BM25 index. Building it processes every chunk, so it is built
# once per `text_chunks` list object instead of once per query. Re-running this
# cell resets the cache.
_bm25_cache = {}


def _get_keyword_retriever(chunks):
    """Return a BM25 retriever for ``chunks``, rebuilding only when the list object changes."""
    if "retriever" not in _bm25_cache or _bm25_cache["chunks"] is not chunks:
        _bm25_cache["retriever"] = BM25Retriever.from_documents(chunks)
        _bm25_cache["chunks"] = chunks
    return _bm25_cache["retriever"]


def search(text: str, top_k: int):
    """Hybrid retrieval: FAISS similarity search combined with BM25 keyword search.

    Args:
        text: The query string.
        top_k: Number of documents each underlying retriever is asked for.

    Returns:
        The documents returned by the ``EnsembleRetriever`` for ``text``.
        NOTE(review): the merged list can presumably hold more than ``top_k``
        items, since each retriever contributes up to ``top_k`` - confirm if a
        hard cap is wanted.
    """
    dense_retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

    # The cached retriever is shared between calls, so `k` is set on every call.
    keyword_retriever = _get_keyword_retriever(text_chunks)
    keyword_retriever.k = top_k

    ensemble_retriever = EnsembleRetriever(
        retrievers=[dense_retriever, keyword_retriever],
        weights=[0.5, 0.5],
    )
    # `invoke` is the Runnable entry point; `get_relevant_documents` is deprecated.
    return ensemble_retriever.invoke(text)

In [None]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [None]:
# Prompt for the routing step. It is filled with {question} and {context} and
# sent as the system message; the model is asked to reply with only "1"
# (context can answer the question) or "0" (it cannot). A later cell compares
# that reply against the string '1' to choose between the local context and a
# web search.
decision_system_prompt = """Your job is to decide if a given question can be answered with the given context. 
If context can answer the question return 1.
If not return 0.

Do not return anything else except 0 or 1.

Question: {question}
Context: {context}
"""

In [85]:
# System message for the answer-generation step. {context} is filled with
# either the retrieved PDF chunks or the formatted web-search results.
system_prompt = """You are an expert for answering questions. Answer the question according only to the given context.
If question cannot be answered using the context, simply say I don't know. Do not make stuff up.
Your answer MUST be informative, concise, and action driven. Your response must be in Markdown.

Context: {context}
"""

# User message for the answer-generation step. {question} is filled with the
# user's query; the trailing "Answer:" cues the model to respond directly.
user_prompt = """
Question: {question}

Answer:"""

In [86]:
# Demo query: retrieve supporting chunks and flatten them into one context string.
question = "what is Risk Governance Framework?"
results = search(text=question, top_k=5)
context = format_docs(docs=results)

In [None]:
# Read the credential from the environment instead of hard-coding it in the
# notebook, where it would leak through version control or sharing.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")


def load_llm():
    """Create the Groq chat model used for both the decision and the answer step.

    Returns:
        A ``ChatGroq`` client configured with the notebook's model name,
        temperature and token limit.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is unset or empty.
    """
    if not GROQ_API_KEY:
        raise RuntimeError(
            "GROQ_API_KEY is not set. Export it in the environment before starting the kernel."
        )
    # NOTE(review): confirm this model id is still served by Groq before relying on it.
    return ChatGroq(
        api_key=GROQ_API_KEY,
        model_name="llama3-70b-8192",
        temperature=0.9,
        max_tokens=512,
    )


# Example usage
llm = load_llm()

# Ask the model whether the retrieved context is enough to answer the question.
response = llm.invoke([
    {"role": "system", "content": decision_system_prompt.format(question=question, context=context)},
    {"role": "user", "content": "Decide now"}
])

# Strip surrounding whitespace: the verdict is later compared against the exact
# string '1', and a reply such as "1\n" would otherwise be misread as "no".
has_answer = response.content.strip()
print(has_answer)

1


In [None]:
# def format_ddg_results(results):
#     formatted = []
#     for i, doc in enumerate(results, start=1):
#         formatted.append(f"--- Result {i} ---\n{doc.page_content}\n{doc.metadata}\n")
#     return "\n".join(formatted)

# formatted_output = format_ddg_results(results)

# print(formatted_output)

from duckduckgo_search import DDGS
def format_ddg_results(results):
    """Render DuckDuckGo search hits as a numbered, human-readable text block.

    Args:
        results: Iterable of dict-like hits. The ``title``, ``href`` and
            ``body`` keys are read; a missing key falls back to a placeholder.

    Returns:
        One string with a section per hit (numbered from 1), sections joined
        by a newline. Empty string when there are no hits.
    """
    def _render(i, hit):
        # Missing fields get a readable placeholder rather than raising KeyError.
        title = hit.get("title", "No title")
        url = hit.get("href", "No URL")
        body = hit.get("body", "No description")
        return f"--- Result {i} ---\nTitle: {title}\nURL: {url}\nSnippet: {body}\n"

    return "\n".join(_render(i, hit) for i, hit in enumerate(results, start=1))

# Preview of the web-search fallback. The hits get their own name so that
# `results` keeps holding the retrieved documents rather than being rebound to
# a list of dicts.
ddg_results = DDGS().text(question, max_results=5)

formatted_output = format_ddg_results(ddg_results)
print(formatted_output)

  results = DDGS().text(question, max_results=5)





In [None]:
from IPython.display import display, Markdown
from duckduckgo_search import DDGS

# Choose the context source. The verdict is normalised first because an exact
# match on raw LLM text ("1\n", " 1") would silently route to the web search.
if has_answer.strip() == '1':
    print("Context can answer the question")
else:
    print("Context is NOT relevant. Searching online...")
    results = DDGS().text(question, max_results=5)
    context = format_ddg_results(results)
    print("Found online sources. Generating the response...")

# Answer generation is the same for both sources; only `context` differs.
response = llm.invoke([
    {"role": "system", "content": system_prompt.format(context=context)},
    {"role": "user", "content": user_prompt.format(question=question)}
])
print("Answer:")
display(Markdown(response.content))

Context can answer the question
Answer:


**Risk Governance Framework**

The Risk Governance Framework is a framework through which the Board and management establish and make decisions about the bank's strategy and approach to risk management; articulate and monitor adherence to the risk appetite and risk limits relative to the bank's strategy; and identify, measure, manage and control risks.