In [35]:
# Load environment variables into the process before any client is created.
# The recorded cell output for this call is True.
# NOTE(review): presumably this is where the Groq API key used by ChatGroq
# further down comes from — confirm against the project's .env file.
from dotenv import load_dotenv
load_dotenv()

True

In [36]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
import pdfplumber
from langchain.docstore.document import Document


In [37]:
def load_pdf_with_plumber(pdf_path):
    """
    Read every page of a PDF with pdfplumber and wrap the text in a Document.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are skipped; every remaining page contributes its text followed
    by a newline. The whole PDF ends up in ONE Document, so page boundaries
    are not recorded in the metadata.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A single-element list holding a Document whose ``page_content`` is
        the concatenated page text and whose metadata is
        ``{"source": pdf_path}``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        extracted = [page.extract_text() for page in pdf.pages]

    # Assemble the final string in one pass, dropping pages without text.
    full_text = "".join(f"{page_text}\n" for page_text in extracted if page_text)

    return [Document(page_content=full_text, metadata={"source": pdf_path})]

# Extract the text of the general-information PDF via pdfplumber
general_documents = load_pdf_with_plumber(
    pdf_path='/Users/sayo/personal_projects/Usafe_bot/data/general_one.pdf'
)

In [38]:
# Chunking helper for already-loaded documents
def split_documents(documents, chunk_size=1024, chunk_overlap=200):
    """
    Break documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: The documents to split.
        chunk_size: Passed to the splitter as ``chunk_size`` (default 1024).
        chunk_overlap: Passed to the splitter as ``chunk_overlap`` (default 200).

    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents=documents)

In [39]:
# Chunk the general document with the default size/overlap and report the count
general_one_chunks = split_documents(documents=general_documents)
print(f"Number of chunks in general document: {len(general_one_chunks)}")

Number of chunks in general document: 16


In [40]:
# Function to create the embedding vector database and persist it to disk
def create_embedding_vector_db(chunks, db_name,
                               model_name='sentence-transformers/all-mpnet-base-v2'):
    """
    Embed document chunks and save them as a FAISS index on disk.

    The index is written to ``./vector_databases/vector_db_{db_name}``
    (relative to the current working directory).

    Args:
        chunks: Iterable of document chunks to embed. It is materialised into
            a list, so a generator is accepted.
        db_name: Suffix used to build the output folder name.
        model_name: Name of the HuggingFace embedding model. The same model
            must be used when the index is loaded again for querying.

    Returns:
        None. The function is called for its side effect of writing the index.

    Raises:
        ValueError: If ``chunks`` is ``None`` or empty. This is checked before
            the embedding model is instantiated so the failure is immediate
            and carries a readable message.
    """
    chunks = list(chunks) if chunks is not None else []
    if not chunks:
        raise ValueError(
            f"No chunks to embed for vector database '{db_name}'; "
            "check that the source document produced any text."
        )

    embedding = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding
    )
    vectorstore.save_local(f"./vector_databases/vector_db_{db_name}")

In [41]:
# Build and persist the FAISS index for the general-information chunks
create_embedding_vector_db(
    chunks=general_one_chunks,
    db_name='usafe_general',
)

In [42]:
# Function to open a saved FAISS vector store and expose it as a retriever
def retrieve_from_vector_db(vector_db_path,
                            model_name='sentence-transformers/all-mpnet-base-v2'):
    """
    Load a FAISS index saved on disk and return a retriever over it.

    Args:
        vector_db_path: Folder containing the saved FAISS index.
        model_name: Name of the HuggingFace embedding model used to embed
            queries. It must match the model the index was built with.

    Returns:
        The object returned by the vector store's ``as_retriever()``.

    Raises:
        FileNotFoundError: If ``vector_db_path`` is not an existing directory.
            This is checked before the embedding model is instantiated so a
            wrong path is reported immediately.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if not Path(vector_db_path).is_dir():
        raise FileNotFoundError(
            f"Vector database folder not found: {vector_db_path!r}. "
            "Create it first with create_embedding_vector_db()."
        )

    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # SECURITY: allow_dangerous_deserialization=True lets FAISS unpickle the
    # stored docstore, and unpickling can execute arbitrary code. Only point
    # this function at index folders you created yourself.
    vectorstore = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embeddings,
        allow_dangerous_deserialization=True
    )
    return vectorstore.as_retriever()

In [43]:
# Open the persisted general-information index and wrap it in a retriever
general_retriever = retrieve_from_vector_db(
    vector_db_path='./vector_databases/vector_db_usafe_general'
)

In [44]:
# Sanity check: show the concrete class of the object returned by
# retrieve_from_vector_db().
type(general_retriever)

langchain_core.vectorstores.base.VectorStoreRetriever

In [45]:
def connect_chains(retriever, chat_model=None, prompt=None):
    """
    Build a retrieval chain: retriever -> stuff-documents chain -> chat model.

    Args:
        retriever: Retriever handed to ``create_retrieval_chain``.
        chat_model: Chat model handed to ``create_stuff_documents_chain``.
            When omitted, the notebook-level ``llm`` variable is used; it is
            looked up at call time, so the cell defining ``llm`` only has to
            run before this function is *called*, not before it is defined.
        prompt: Prompt for the stuff-documents chain. When omitted,
            ``"langchain-ai/retrieval-qa-chat"`` is pulled from the hub on
            each call; pass a prompt explicitly to avoid that lookup.

    Returns:
        The chain returned by ``create_retrieval_chain``.

    Raises:
        NameError: If ``chat_model`` is omitted and no ``llm`` variable has
            been defined yet.
    """
    if chat_model is None:
        try:
            chat_model = llm
        except NameError:
            raise NameError(
                "connect_chains() needs a chat model: pass chat_model=... or "
                "run the cell that defines `llm` first."
            ) from None

    if prompt is None:
        prompt = hub.pull("langchain-ai/retrieval-qa-chat")

    stuff_documents_chain = create_stuff_documents_chain(
        llm=chat_model,
        prompt=prompt
    )
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=stuff_documents_chain
    )
    return retrieval_chain

In [46]:
import warnings
# Silence every warning for the rest of the session. The filter is installed
# before langchain_groq is imported, so it is already active during that import.
# NOTE(review): a blanket "ignore" also hides deprecation notices — consider
# narrowing it to specific categories or modules.
warnings.filterwarnings("ignore")
from langchain_groq import ChatGroq

# Chat model that connect_chains() passes to create_stuff_documents_chain.
# No API key is passed explicitly here.
# NOTE(review): presumably ChatGroq reads the key from the environment
# populated by load_dotenv() — confirm.
# NOTE(review): temperature=0 is presumably chosen for repeatable answers;
# max_tokens/timeout are left at None and max_retries is set to 2.
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [47]:
# Wire the general-information retriever into the question-answering chain
retrieval_chain = connect_chains(general_retriever)

In [48]:
def print_output(
    inquiry,
    retrieval_chain=None
):
    """
    Run a question through a retrieval chain and print the answer.

    Args:
        inquiry: The user's question; sent to the chain as ``{"input": inquiry}``.
        retrieval_chain: Chain to invoke. When omitted, the notebook-level
            ``retrieval_chain`` variable is looked up at call time, so a chain
            rebuilt in a later cell is picked up without re-running this
            definition (a default bound in the signature would keep pointing
            at the old object).

    Returns:
        None. The chain's ``'answer'`` is printed with leading and trailing
        newlines stripped.

    Raises:
        NameError: If no chain is passed and no ``retrieval_chain`` variable
            has been defined yet.
    """
    if retrieval_chain is None:
        # The parameter shadows the module-level name, so read the global
        # explicitly instead of referring to it by bare name.
        try:
            retrieval_chain = globals()["retrieval_chain"]
        except KeyError:
            raise NameError(
                "print_output() needs a chain: pass retrieval_chain=... or "
                "run the cell that defines `retrieval_chain` first."
            ) from None

    result = retrieval_chain.invoke({"input": inquiry})
    print(result['answer'].strip("\n"))

In [52]:
# Example query against the general-information retrieval chain
print_output(
    inquiry="i faced a hate crime and i want to understand the motivations behind a hate crime?"
)

I'm so sorry to hear that you've faced a hate crime. Understanding the motivations behind hate crimes can be complex and multifaceted. According to sociologists Jack McDevitt and Jack Levin, there are four primary motives identified behind hate crimes:

1. Thrill-seeking: Crimes for excitement, often by groups, targeting perceived vulnerable groups. This motive suggests that some individuals may commit hate crimes as a way to experience a rush or excitement, often by targeting groups they perceive as vulnerable or easy to attack.
2. Defensive: Motivated by a belief that they are protecting their community from perceived threats. This motive implies that some individuals may commit hate crimes as a way to feel like they are defending their community or group from perceived threats or dangers.
3. Retaliatory: Crimes committed in revenge for other incidents or perceived offenses. This motive suggests that some individuals may commit hate crimes as a way to retaliate or exact revenge for p