In [5]:
from dotenv import load_dotenv
# Pull environment variables in via python-dotenv (the recorded run returned True).
# NOTE(review): presumably this supplies the API key ChatGroq needs further down — confirm.
load_dotenv()

True

In [6]:
from langchain_community.document_loaders import PyPDFLoader

def load_pdf_data(pdf_path):
    """
    Read a PDF from disk and hand back its contents as LangChain documents.

    Args:
        pdf_path: location of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader.load()`` yields for that file.
    """
    return PyPDFLoader(file_path=pdf_path).load()

In [7]:

# Load the four reference PDFs (one per hate-crime category) with the
# `load_pdf_data` helper defined in this notebook. The name previously used
# here, `safe_load_pdf_data`, is not defined anywhere in the notebook and
# fails with NameError on a fresh kernel.
# TODO: these absolute paths only resolve on the author's machine; move them
# behind a configurable data directory.
anti_religious = load_pdf_data(pdf_path='/Users/sayo/personal_projects/Usafe_bot/data/anti_religious_def.pdf')
gender_lgbt = load_pdf_data(pdf_path='/Users/sayo/personal_projects/Usafe_bot/data/gender_lgbt_def.pdf')
general = load_pdf_data(pdf_path='/Users/sayo/personal_projects/Usafe_bot/data/general.pdf')
racist = load_pdf_data(pdf_path='/Users/sayo/personal_projects/Usafe_bot/data/racist_def.pdf')



In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=800, chunk_overlap=80):
    """
    Cut documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: the documents to split.
        chunk_size: forwarded to the splitter as its ``chunk_size``.
        chunk_overlap: forwarded to the splitter as its ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents=documents)

In [10]:
# Chunk every category on its own so the per-category lists remain available.
anti_religious_chunks = split_documents(anti_religious)
gender_lgbt_chunks = split_documents(gender_lgbt)
general_chunks = split_documents(general)
racist_chunks = split_documents(racist)

# Sanity check: how many chunks were produced across all four categories.
total_chunks = sum(
    len(category_chunks)
    for category_chunks in (
        general_chunks,
        racist_chunks,
        gender_lgbt_chunks,
        anti_religious_chunks,
    )
)
print(f"number of chunks: {total_chunks}")

number of chunks: 355


In [11]:
# Merge the per-category chunk lists into one list, keeping category order.
all_chunks = [
    *anti_religious_chunks,
    *gender_lgbt_chunks,
    *general_chunks,
    *racist_chunks,
]

In [12]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def create_embedding_vector_db(chunks, db_name):
    """
    Embed the given chunks and persist them as a local FAISS index.

    The chunks are embedded with the HuggingFace model
    'sentence-transformers/all-mpnet-base-v2' and the resulting index is
    written to ``./vector_databases/vector_db_<db_name>``.

    Args:
        chunks: documents to embed and index.
        db_name: suffix used to build the output folder name.

    Returns:
        None. The index is only written to disk.
    """
    embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    # from_documents embeds every chunk and builds the index in one step.
    faiss_index = FAISS.from_documents(documents=chunks, embedding=embedder)
    faiss_index.save_local(f"./vector_databases/vector_db_{db_name}")

In [13]:
# Embed every chunk and write the combined index to
# ./vector_databases/vector_db_usafe_combined (re-embeds on every run).
create_embedding_vector_db(chunks=all_chunks, db_name='usafe_combined')

  from tqdm.autonotebook import tqdm, trange


In [14]:
def retrieve_from_vector_db(vector_db_path):
    """
    Open a FAISS index saved on disk and expose it as a retriever.

    Uses the same embedding model name
    ('sentence-transformers/all-mpnet-base-v2') that
    ``create_embedding_vector_db`` uses when building the index.

    Args:
        vector_db_path: folder containing a FAISS index written by ``save_local``.

    Returns:
        The retriever obtained from the loaded store via ``as_retriever()``.
    """
    embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data from the index folder. Only point this at indexes created
    # locally by this notebook, never at files from an untrusted source.
    stored_index = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embedder,
        allow_dangerous_deserialization=True,
    )
    return stored_index.as_retriever()

In [15]:
# Reload the index written by create_embedding_vector_db(db_name='usafe_combined')
# and wrap it in a retriever.
combined_retriever = retrieve_from_vector_db(vector_db_path='./vector_databases/vector_db_usafe_combined')

In [16]:
# Quick check of what kind of object the retriever is (shown as the cell output).
type(combined_retriever)

langchain_core.vectorstores.base.VectorStoreRetriever

In [17]:
# Read the Usafe prompt guide from disk and show it.
# The text contains non-ASCII punctuation (curly apostrophes), so it is decoded
# explicitly as UTF-8 instead of relying on the platform's default encoding.
# TODO: this absolute path only resolves on the author's machine.
with open('/Users/sayo/personal_projects/Usafe_bot/data/usafe_prompt.txt', 'r', encoding='utf-8') as file:
    user_prompt = file.read()

print(user_prompt)


   Usafe ChatBot Guide:

   Initial Introduction:
   Introduce Usafe to the user. Explain that their information is confidential and encourage them to share what hate crime happened to them. Ensure the message is supportive and non-judgmental because they must be traumatized. Don't be too long, be concise and to the point.

   Fallback Response:
   Provide a supportive fallback response if the user’s input is unclear or doesn’t clearly indicate a hate crime. Acknowledge their trust, ask for additional information, and offer general guidance if needed.
   Add that if they are unsure, they can ask for more information or help from a human. Explain you are a bot and sometimes you don't grasp the full context.

   Experience Acknowledgment:
   Acknowledge the user's experience empathetically. Based on the input crime type, describe the type of hate crime they may have experienced: Gender-based hate crime involves discrimination or violence directed at individuals based on gender identity 

## Generation 

[`create_stuff_documents_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html#langchain.chains.combine_documents.stuff.create_stuff_documents_chain)

- takes a list of documents and formats them all into a prompt, then passes that prompt to an LLM
- passes ALL documents at once, so make sure the combined prompt fits within the context window of the LLM being used

[`create_retrieval_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html#langchain.chains.retrieval.create_retrieval_chain)

- takes in a user inquiry, which is then passed to the retriever to fetch relevant documents
- those documents (and original inputs) are then passed to an LLM to generate a response

In [18]:
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

In [19]:
def connect_chains(retriever):
    """
    Build a retrieval chain: retriever -> stuff-documents chain -> LLM answer.

    The stuff-documents chain is created from the notebook-level ``llm`` and
    the "langchain-ai/retrieval-qa-chat" prompt pulled from the LangChain hub
    (fetched again on every call).

    Args:
        retriever: retriever that supplies the context documents.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    # `llm` is read from the notebook's global scope, so it must already be
    # defined by the time this function is called.
    combine_docs = create_stuff_documents_chain(
        llm=llm,
        prompt=hub.pull("langchain-ai/retrieval-qa-chat"),
    )
    return create_retrieval_chain(retriever=retriever, combine_docs_chain=combine_docs)

In [20]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# deprecation notices included. Consider narrowing the filter.
warnings.filterwarnings("ignore")
from langchain_groq import ChatGroq

# Chat model read by connect_chains as the global `llm`.
# NOTE(review): confirm the model ID below is still served by the provider.
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0.02,  # close to zero, to keep answers stable between runs
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [21]:
# Wire the combined retriever into the retrieval chain. The print_output
# helpers below capture this object as their default `retrieval_chain`.
react_retrieval_chain = connect_chains(combined_retriever)

In [31]:
def print_output(inquiry, retrieval_chain=react_retrieval_chain):
    """
    Run one inquiry through the retrieval chain and print only the answer.

    Args:
        inquiry: the question sent to the chain under the "input" key.
        retrieval_chain: chain to invoke. The default is bound when this cell
            runs, so re-run the cell after rebuilding ``react_retrieval_chain``.
    """
    answer_text = retrieval_chain.invoke({"input": inquiry})['answer']
    # Trim leading/trailing newlines only; other whitespace is left alone.
    print(answer_text.strip("\n"))

In [32]:
# Smoke test: ask the chain a question the racist-content PDF should cover.
print_output("Please provide information about racist content.")

Based on the provided context, here are some examples of racist content:

1. Racist insults: Several instances of people being subjected to racist insults, including towards an Iraqi woman and her children, an Eritrean man, a Nigerian man, and three men from India.
2. Physical violence: Dogs being unleashed on an Iraqi woman and her children, resulting in minor injuries to the children. A person seeking asylum being beaten and requiring medical treatment. A man being punched in the face outside a restaurant.
3. Xenophobic insults: A taxi driver of Polish origin being subjected to xenophobic insults and kicked by a customer. A man being subjected to xenophobic insults and death threats by a group wielding a knife.
4. Swastika graffiti: The windows and facade of a police station being vandalized with swastika graffiti.
5. Vandalism: A house with a poster denouncing racism being vandalized with swastika and neo-Nazi graffiti.
6. Racist and anti-Semitic messages: Racist and anti-Semitic me

In [35]:
def print_output(inquiry, retrieval_chain=react_retrieval_chain):
    """
    Debug variant: print the full chain result as-is.

    The printed result includes the 'input' and the retrieved 'context'
    documents, not just the answer. This redefines the earlier
    ``print_output`` in this notebook.

    Args:
        inquiry: the question sent to the chain under the "input" key.
        retrieval_chain: chain to invoke (default bound when this cell runs).
    """
    print(retrieval_chain.invoke({"input": inquiry}))

# Example usage
print_output("Please provide information about racist content.")

{'input': 'Please provide information about racist content.', 'context': [Document(metadata={'source': '/Users/sayo/personal_projects/Usafe_bot/data/racist_def.pdf', 'page': 5}, page_content='-\nAn\nIraqi\nwoman\nand\nher\ntwo\nchildren\nwere\nsubjected\nto\nracist\ninsults\nand\nhad\nthree\ndogs\nunleashed\non\nthem\nby\na\ngroup.\nThe\ndogs\nknocked\nthe\nchildren\nover,\nresulting\nin\nminor\ninjuries.\n-\nA\nperson\nseeking\nasylum\nwas\nsubjected\nto\nracist\ninsults\nand\nbeaten\nby\na\ngroup.\nThe\nvictim\nsustained\ninjuries\nand\nrequired\nmedical\ntreatment.\n-\nAn\nEritrean\nman\nwas\nsubjected\nto\nracist\ninsults\nand\npunched\nin\nthe\nface\noutside\na\nrestaurant.'), Document(metadata={'source': '/Users/sayo/personal_projects/Usafe_bot/data/racist_def.pdf', 'page': 6}, page_content='-\nThree\nmen\nwere\nbeaten\nby\na\nsecurity\nguard\nat\na\nrestaurant\nafter\none\nof\nthe\nvictims\nwas\nsubjected\nto\nracist\ninsults\nby\nthe\nsame\nperpetrator.\nTwo\nof\nthe\nvictims\n

In [50]:
def print_output(inquiry, retrieval_chain=react_retrieval_chain):
    """
    Print the inquiry, the answer, and a short preview of each context document.

    For every retrieved document the source path, page number and the first
    150 characters of its text (newlines flattened to spaces) are shown.

    Args:
        inquiry: the question sent to the chain under the "input" key.
        retrieval_chain: chain to invoke (default bound when this cell runs).
    """
    response = retrieval_chain.invoke({"input": inquiry})

    print("User Inquiry:", response['input'])
    print("\nAnswer:\n", response['answer'])

    print("Context Documents Used:")
    for i, doc in enumerate(response['context'], start=1):
        print(f"\nDocument {i}:")
        print(f"  Source: {doc.metadata['source']} - Page: {doc.metadata['page']}")
        # Flatten newlines before truncating so the preview stays on one line.
        one_line_text = doc.page_content.replace("\n", " ")
        print("  Content Snippet:", one_line_text[:150], "...")

# Example usage
print_output("Please provide information about xenophobic assault.")

User Inquiry: Please provide information about xenophobic assault.

Answer:
 Based on the provided context, here are some key points about xenophobic assault:

1. Xenophobic assault: Xenophobic assault refers to physical or verbal attacks on individuals based on their nationality, ethnicity, or perceived foreignness.
2. Frequency: Xenophobic assaults are a recurring issue, with multiple incidents reported in the provided context.
3. Victims: The victims of xenophobic assaults are often asylum seekers, refugees, or individuals from diverse ethnic backgrounds.
4. Perpetrators: The perpetrators of xenophobic assaults are often individuals with a history of hate crimes or racist behavior.
5. Methods: Xenophobic assaults can take various forms, including physical violence, verbal abuse, and threats.
6. Locations: Xenophobic assaults can occur in various locations, such as communal kitchens, train stations, sports centers, and streets.
7. Injuries: Victims of xenophobic assaults may sustain 

In [37]:
def print_output(inquiry, retrieval_chain=react_retrieval_chain):
    """
    Print the answer followed by a preview of each context document.

    For every retrieved document the source path, page number and the first
    200 characters of its raw text (newlines kept) are shown.

    Args:
        inquiry: the question sent to the chain under the "input" key.
        retrieval_chain: chain to invoke (default bound when this cell runs).
    """
    response = retrieval_chain.invoke({"input": inquiry})

    print("Answer:\n", response['answer'])

    print("\nContext Documents Used:")
    for i, doc in enumerate(response['context'], start=1):
        print(f"\nDocument {i}:")
        print(f"  Source: {doc.metadata['source']} - Page: {doc.metadata['page']}")
        # Only the head of the text is shown, to keep the output short.
        preview = doc.page_content[:200]
        print("  Content Snippet:", preview, "...")

# Example usage
print_output("Please provide information about racist content.")

Answer:
 Based on the provided context, here are some examples of racist content:

1. Racist insults: Several instances of people being subjected to racist insults, including towards an Iraqi woman and her children, an Eritrean man, a Nigerian man, and three men from India.
2. Physical violence: Dogs being unleashed on an Iraqi woman and her children, resulting in minor injuries to the children. A person seeking asylum being beaten and requiring medical treatment. A man being punched in the face outside a restaurant.
3. Xenophobic insults: A taxi driver of Polish origin being subjected to xenophobic insults and kicked by a customer. A man being subjected to xenophobic insults and death threats by a group wielding a knife.
4. Swastika graffiti: The windows and facade of a police station being vandalized with swastika graffiti.
5. Vandalism: A house with a poster denouncing racism being vandalized with swastika and neo-Nazi graffiti.
6. Racist and anti-Semitic messages: Racist and anti-S

In [38]:
def print_output(inquiry, retrieval_chain=react_retrieval_chain):
    """
    Print the answer followed by the distinct source files it was drawn from.

    Sources are listed once each, in the order they first appear among the
    retrieved context documents, so the output is the same on every run.

    Args:
        inquiry: the question sent to the chain under the "input" key.
        retrieval_chain: chain to invoke (default bound when this cell runs).
    """
    result = retrieval_chain.invoke({"input": inquiry})

    print("Answer:\n", result['answer'])

    print("\nSources Used:")
    # dict.fromkeys removes duplicates while keeping first-seen order. A set of
    # strings iterates in an arbitrary order that can differ between sessions.
    sources = dict.fromkeys(doc.metadata['source'] for doc in result['context'])
    for source in sources:
        print(f"  - {source}")

# Example usage
print_output("Please provide information about racist content.")

Answer:
 Based on the provided context, here are some examples of racist content:

1. Racist insults: Several instances of people being subjected to racist insults, including towards an Iraqi woman and her children, an Eritrean man, a Nigerian man, and three men from India.
2. Physical violence: Dogs being unleashed on an Iraqi woman and her children, resulting in minor injuries to the children. A person seeking asylum being beaten and requiring medical treatment. A man being punched in the face outside a restaurant.
3. Xenophobic insults: A taxi driver of Polish origin being subjected to xenophobic insults and kicked by a customer. A man being subjected to xenophobic insults and death threats by a group wielding a knife.
4. Swastika graffiti: The windows and facade of a police station being vandalized with swastika graffiti.
5. Vandalism: A house with a poster denouncing racism being vandalized with swastika and neo-Nazi graffiti.
6. Racist and anti-Semitic messages: Racist and anti-S