In [1]:
# Load environment variables via python-dotenv before any later cell runs.
# NOTE(review): presumably this supplies the API credentials used by the LLM
# client created further down — confirm which variables must be defined.
from dotenv import load_dotenv
load_dotenv()

True

In [15]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

In [3]:
# Function to load data from the general PDF
def load_pdf_data(pdf_path):
    """
    Read a PDF from disk and return its contents as LangChain documents.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        Whatever `PyPDFLoader.load()` produces for that file.
    """
    return PyPDFLoader(file_path=pdf_path).load()

# Load the general PDF
# Location of the general-information PDF. The original machine-specific path
# is kept as the fallback so existing setups behave exactly as before; set
# USAFE_GENERAL_PDF (for example in the .env file read by load_dotenv() in the
# first cell) to run the notebook on another machine.
import os

GENERAL_PDF_PATH = os.environ.get(
    "USAFE_GENERAL_PDF",
    '/Users/sayo/personal_projects/Usafe_bot/data/general_one.pdf',
)
general_documents = load_pdf_data(pdf_path=GENERAL_PDF_PATH)

In [4]:
# Function to split the loaded document into chunks
def split_documents(documents, chunk_size=800, chunk_overlap=80):
    """
    Break loaded documents into overlapping text chunks.

    Args:
        documents: Documents to split.
        chunk_size: Size limit handed to the splitter for each chunk.
        chunk_overlap: Overlap handed to the splitter for adjacent chunks.

    Returns:
        The chunks produced by `RecursiveCharacterTextSplitter.split_documents`.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(
        documents=documents
    )

In [6]:
# Split the general document into chunks
general_one_chunks = split_documents(documents=general_documents)
print(f"Number of chunks in general document: {len(general_one_chunks)}")

Number of chunks in general document: 22


In [7]:
# Function to embed document chunks and persist them as a FAISS vector store
def create_embedding_vector_db(
    chunks,
    db_name,
    model_name='sentence-transformers/all-mpnet-base-v2',
):
    """
    Embed document chunks and save them as a FAISS index on disk.

    Args:
        chunks: Documents (e.g. the output of `split_documents`) to embed.
        db_name: Suffix for the output folder; the index is written to
            `./vector_databases/vector_db_{db_name}`, relative to the
            current working directory.
        model_name: Hugging Face embedding model to use. The same model must
            be used when the saved index is loaded again, otherwise query
            vectors are not comparable with the stored ones.

    Returns:
        None. The index is only persisted to disk.
    """
    embedding = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding)
    vectorstore.save_local(f"./vector_databases/vector_db_{db_name}")

In [9]:
# Create the vector store specifically for the general information
create_embedding_vector_db(
    chunks=general_one_chunks,
    db_name='usafe_general',
)

  from tqdm.autonotebook import tqdm, trange


In [11]:
# Function to build a retriever from a saved FAISS vector store
def retrieve_from_vector_db(
    vector_db_path,
    model_name='sentence-transformers/all-mpnet-base-v2',
):
    """
    Load a FAISS index from disk and expose it as a retriever.

    Args:
        vector_db_path: Folder containing the index written by
            `FAISS.save_local`.
        model_name: Hugging Face embedding model used to embed queries. It
            must match the model the index was built with.

    Returns:
        The retriever returned by `as_retriever()` with its default settings.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # SECURITY: allow_dangerous_deserialization=True lets FAISS unpickle the
    # stored docstore, which can execute arbitrary code. Only point this at
    # index folders created by this project, never at untrusted files.
    vectorstore = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
    return vectorstore.as_retriever()

In [12]:
# Retrieve from the newly created vector store for general information
general_retriever = retrieve_from_vector_db(
    vector_db_path='./vector_databases/vector_db_usafe_general'
)

In [13]:
# Sanity check: display the class of the object returned by retrieve_from_vector_db
type(general_retriever)

langchain_core.vectorstores.base.VectorStoreRetriever

In [16]:
def connect_chains(retriever):
    """
    Build a retrieval chain that passes retrieved documents to the LLM.

    The prompt is pulled from the hub entry "langchain-ai/retrieval-qa-chat"
    and the notebook-level `llm` is used as the model, so `llm` must exist
    by the time this function is called.

    Args:
        retriever: Retriever that supplies documents to the chain.

    Returns:
        The chain produced by `create_retrieval_chain`.
    """
    chat_model = llm  # notebook-level model, resolved at call time
    qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    combine_docs_chain = create_stuff_documents_chain(
        llm=chat_model,
        prompt=qa_prompt,
    )
    return create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=combine_docs_chain,
    )

In [17]:
import warnings
# Blanket filter: silences every warning category from this point on,
# including deprecation notices. It is installed before the import below.
# NOTE(review): consider narrowing this to the specific warning being hidden.
warnings.filterwarnings("ignore")
from langchain_groq import ChatGroq

# Chat model read as a global by connect_chains. No credentials are passed here.
# NOTE(review): presumably the API key comes from the environment populated by
# load_dotenv() in the first cell — confirm.
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [18]:
retrieval_chain = connect_chains(general_retriever)

In [21]:
def clean_text(text):
    """
    Collapse all whitespace in extracted text into single spaces.

    Line breaks and runs of spaces each become one space; leading and
    trailing whitespace is dropped.
    """
    # str.split() without a separator splits on newlines as well as spaces
    words = text.split()
    return " ".join(words)

# Test retrieval with cleaned output
def print_output(inquiry, retriever=None):
    """
    Query a retriever and print the cleaned text of its top-ranked document.

    Only the retriever is exercised here; the LLM retrieval chain is not
    involved. Note that a later cell defines another `print_output`, which
    replaces this one once that cell has run.

    Args:
        inquiry: The question to search for.
        retriever: Object exposing `invoke(query)` that returns a list of
            documents. Defaults to the notebook-level `general_retriever`,
            looked up at call time so a rebuilt retriever is picked up
            without re-running this cell.

    Returns:
        None. The result is printed.
    """
    if retriever is None:
        retriever = general_retriever
    # `invoke` is the supported replacement for the deprecated
    # `get_relevant_documents`
    docs = retriever.invoke(inquiry)
    if docs:
        # Only the first (highest-ranked) document is shown
        cleaned_text = clean_text(docs[0].page_content)
        print("Cleaned Document Content:", cleaned_text)
    else:
        print("No relevant information found.")

# Example query to test the retrieval
print_output(inquiry="How do I report a hate crime?")

Cleaned Document Content: • Alternative Support: If you don’t know someone who can assist, you may contact organizations that provide translation support for hate crime victims. One example is ReachOut Berlin, which offers assistance for individuals facing hate crime incidents. • Contact: • ReachOut Berlin • Email: info@reachoutberlin.de • Address: Oranienburger Str. 27, 10117 Berlin 4. Visit Your Local Police Station: • Bring all collected documentation with you. Explain the details of the incident, and let the officer know you believe it to be a hate crime. • The police will create an official report based on your statement and evidence. 5. Report Online (Optional): • If you’re unable to visit the police station, you may be able to file a report online through local authorities’ websites or specific online


In [22]:
def clean_and_format_text(text):
    """
    Clean extracted text and put bullets and list markers on their own lines.

    Whitespace (including newlines) is first collapsed to single spaces.
    A line break is then inserted before every "•" bullet and before every
    one- or two-digit list marker such as "4." or "12.".

    A list marker is only recognised when it is a standalone token: it must
    start the text or follow whitespace, and the dot must be followed by
    whitespace. Decimals ("1.5"), longer numbers ending a sentence ("2023.")
    and digits inside other tokens are therefore left intact.

    Args:
        text: Raw text, e.g. the `page_content` of a retrieved document.

    Returns:
        The formatted string. The trailing space before an inserted line
        break is kept, and text that starts with a bullet or list marker
        starts with a line break.
    """
    import re  # local import keeps this cell runnable on its own

    # Collapse newlines and repeated whitespace into single spaces
    text = " ".join(text.split())

    # Start every bullet on its own line
    text = text.replace("•", "\n•")

    # Start every standalone list marker on its own line
    text = re.sub(r"(?<!\S)(\d{1,2})\.(?=\s)", r"\n\1.", text)

    return text

def print_output(inquiry, retriever=None):
    """
    Query a retriever and print its top-ranked document in a readable layout.

    Only the retriever is exercised here; the LLM retrieval chain is not
    involved.

    Args:
        inquiry: The question to search for.
        retriever: Object exposing `invoke(query)` that returns a list of
            documents. Defaults to the notebook-level `general_retriever`,
            looked up at call time so a rebuilt retriever is picked up
            without re-running this cell.

    Returns:
        None. The result is printed.
    """
    if retriever is None:
        retriever = general_retriever
    # `invoke` is the supported replacement for the deprecated
    # `get_relevant_documents`
    docs = retriever.invoke(inquiry)
    if docs:
        # Only the first (highest-ranked) document is shown
        cleaned_text = clean_and_format_text(docs[0].page_content)
        print("Cleaned Document Content:\n")
        print(cleaned_text)
    else:
        print("No relevant information found.")

# Example query to test the retrieval
print_output(inquiry="How do I report a hate crime?")

Cleaned Document Content:


• Alternative Support: If you don’t know someone who can assist, you may contact organizations that provide translation support for hate crime victims. One example is ReachOut Berlin, which offers assistance for individuals facing hate crime incidents. 
• Contact: 
• ReachOut Berlin 
• Email: info@reachoutberlin.de 
• Address: Oranienburger Str. 27, 10117 Berlin 
4. Visit Your Local Police Station: 
• Bring all collected documentation with you. Explain the details of the incident, and let the officer know you believe it to be a hate crime. 
• The police will create an official report based on your statement and evidence. 
5. Report Online (Optional): 
• If you’re unable to visit the police station, you may be able to file a report online through local authorities’ websites or specific online
