In [39]:
# Import necessary libraries
import os
import warnings

import pdfplumber
from dotenv import load_dotenv
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [40]:
# Load environment variables
# Runs before any model client is constructed; presumably the Groq API key is
# supplied this way — TODO confirm against the project's .env file.
load_dotenv()
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [38]:
# Lookup table: category PDF file name -> hate crime label shown to the user.
# File names that are not listed here have no label of their own.
HATE_CRIMES_TYPE = dict(
    [
        ("anti_religious_def.pdf", "Anti-religious Hate Crime"),
        ("racist_def.pdf", "Racist and Xenophobic Hate Crime"),
        ("gender_lgbt_def.pdf", "Gender and LGBTQ+ Hate Crime"),
    ]
)

In [41]:
# Step 1: Load PDFs using pdfplumber
def extract_text_from_pdf(pdf_path):
    """
    Read every page of a PDF with pdfplumber and wrap the text in one Document.

    Pages for which pdfplumber yields no text are skipped; the text of each
    remaining page is followed by a newline. The result is a one-element list
    whose Document stores ``pdf_path`` under the ``"source"`` metadata key.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Keep only pages that produced text and terminate each with a newline.
    full_text = "".join(f"{text}\n" for text in page_texts if text)
    return [Document(page_content=full_text, metadata={"source": pdf_path})]

In [42]:
# Load PDFs for each category.
# The data directory is defined once instead of being repeated in every path.
# It defaults to the original local location and can be redirected with the
# USAFE_DATA_DIR environment variable (e.g. from the .env file loaded above),
# so the notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get('USAFE_DATA_DIR', '/Users/sayo/personal_projects/Usafe_bot/data')

anti_religious_docs = extract_text_from_pdf(os.path.join(DATA_DIR, 'anti_religious_def.pdf'))
gender_lgbt_docs = extract_text_from_pdf(os.path.join(DATA_DIR, 'gender_lgbt_def.pdf'))
general_docs = extract_text_from_pdf(os.path.join(DATA_DIR, 'general.pdf'))
racist_docs = extract_text_from_pdf(os.path.join(DATA_DIR, 'racist_def.pdf'))

In [43]:
# Step 2: Chunk Documents
def chunk_documents(documents, chunk_size=800, chunk_overlap=80):
    """
    Break documents into overlapping chunks suitable for embedding.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to
    RecursiveCharacterTextSplitter; the split documents are returned.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)


In [44]:
# Split every category with the default chunking parameters.
anti_religious_chunks, gender_lgbt_chunks, general_chunks, racist_chunks = map(
    chunk_documents,
    (anti_religious_docs, gender_lgbt_docs, general_docs, racist_docs),
)

# Report how many chunks each source produced.
print(f"Number of chunks in anti-religious document: {len(anti_religious_chunks)}")
print(f"Number of chunks in gender/LGBT document: {len(gender_lgbt_chunks)}")
print(f"Number of chunks in general document: {len(general_chunks)}")
print(f"Number of chunks in racist document: {len(racist_chunks)}")

# Overall chunk count across the four sources.
total_chunks = sum(
    map(len, (general_chunks, racist_chunks, gender_lgbt_chunks, anti_religious_chunks))
)
print(f"Total number of chunks: {total_chunks}")

Number of chunks in anti-religious document: 202
Number of chunks in gender/LGBT document: 52
Number of chunks in general document: 18
Number of chunks in racist document: 45
Total number of chunks: 317


In [45]:
# Single combined list of chunks, in the order: anti-religious, gender/LGBT, general, racist.
all_chunks = [
    *anti_religious_chunks,
    *gender_lgbt_chunks,
    *general_chunks,
    *racist_chunks,
]

In [46]:
# Step 3: Create Embedding Vector Store
def create_vector_store(chunks, db_name='usafe_combined'):
    """
    Embed the chunks into a FAISS index and persist it to disk.

    The chunks are embedded with the 'sentence-transformers/all-mpnet-base-v2'
    HuggingFace model and the index is written to
    ``./vector_databases/<db_name>``. A confirmation line is printed; nothing
    is returned.
    """
    embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    index = FAISS.from_documents(chunks, embedding=embedder)
    target_dir = f"./vector_databases/{db_name}"
    index.save_local(target_dir)
    print(f"Vector store '{db_name}' created and saved.")

In [47]:
# Build and persist the single index that covers all four source PDFs.
create_vector_store(chunks=all_chunks, db_name="usafe_combined")

Vector store 'usafe_combined' created and saved.


In [49]:
# Step 4: Load Vector Store and Create Retriever
def load_vector_store(db_path='./vector_databases/usafe_combined'):
    """
    Loads a FAISS vector store from a local directory and returns a retriever.

    Args:
        db_path: Directory containing a saved FAISS index. The default is the
            directory that create_vector_store() writes to for its default
            db_name ('usafe_combined'), so the index built earlier in this
            notebook is the one that gets loaded.

    Returns:
        The retriever produced by ``vector_store.as_retriever()`` with its
        default search settings.
    """
    # Same embedding model name that create_vector_store() uses to build the index.
    embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    # SECURITY: allow_dangerous_deserialization=True permits unpickling the
    # stored index data. Only point db_path at indexes created by this project.
    vector_store = FAISS.load_local(
        folder_path=db_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    return vector_store.as_retriever()

In [50]:
# Initialize retriever for combined data
# No path is passed, so load_vector_store() reads from its default db_path.
combined_retriever = load_vector_store()

In [51]:
# Sanity check: display the class of the object returned by load_vector_store().
type(combined_retriever)

langchain_core.vectorstores.base.VectorStoreRetriever

In [13]:
#with open('/Users/sayo/personal_projects/Usafe_bot/data/usafe_prompt.txt', 'r') as file:
    #user_prompt = file.read()

#print(user_prompt)

## Generation 

[`create_stuff_documents_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html#langchain.chains.combine_documents.stuff.create_stuff_documents_chain)

- takes a list of documents and formats them all into a prompt, then passes that prompt to an LLM
- passes ALL documents, so you should make sure the combined prompt fits within the context window of the LLM being used

[`create_retrieval_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html#langchain.chains.retrieval.create_retrieval_chain)

- takes in a user inquiry, which is then passed to the retriever to fetch relevant documents
- those documents (and original inputs) are then passed to an LLM to generate a response

In [52]:
def connect_chains(retriever):
    """
    Wire a stuff-documents chain into a retrieval chain for the given retriever.

    The stuff-documents chain uses the module-level ``llm`` together with the
    "langchain-ai/retrieval-qa-chat" prompt pulled from the LangChain hub. The
    resulting retrieval chain is returned.
    """
    qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    combine_docs = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
    return create_retrieval_chain(retriever=retriever, combine_docs_chain=combine_docs)

In [None]:
# Step 5: Initialize LLM
def initialize_llm(model_name="llama3-8b-8192"):
    """
    Build the Groq chat model used for answer generation.

    Only the model name is configurable; temperature (0.02), max_tokens (None),
    timeout (None) and max_retries (2) are fixed.
    """
    groq_settings = {
        "model": model_name,
        "temperature": 0.02,
        "max_tokens": None,
        "timeout": None,
        "max_retries": 2,
    }
    return ChatGroq(**groq_settings)


# Module-level LLM instance; the chain-building functions read this global.
llm = initialize_llm()


In [None]:
# Step 6: Setup Retrieval Chain
def setup_retrieval_chain(retriever):
    """
    Build the retrieval chain: retriever -> stuff-documents chain -> LLM.

    Uses the module-level ``llm`` and the "langchain-ai/retrieval-qa-chat"
    prompt pulled from the LangChain hub.
    """
    qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=combine_docs_chain,
    )
    return retrieval_chain

In [54]:
# Retrieval chain over the combined index; used by the query helpers below.
usafe_retrieval_chain = setup_retrieval_chain(retriever=combined_retriever)

In [55]:
# Step 7: Detect Hate Crime Type
def detect_hate_crime_type(inquiry, retrieval_chain=usafe_retrieval_chain):
    """
    Detects the type of hate crime based on user input.

    The inquiry is run through the retrieval chain and the file name of the
    first retrieved document's ``source`` metadata is looked up in
    HATE_CRIMES_TYPE.

    Args:
        inquiry: The user's free-text description.
        retrieval_chain: Chain whose ``invoke`` result contains a ``context``
            list of retrieved documents.

    Returns:
        The matching label from HATE_CRIMES_TYPE, or "Unknown" when nothing
        was retrieved, the document has no source, or the file name is not in
        the mapping.
    """
    result = retrieval_chain.invoke({"input": inquiry})

    # An empty retrieval result previously raised IndexError on context[0].
    retrieved_docs = result.get('context') or []
    if not retrieved_docs:
        return "Unknown"

    # Read metadata directly instead of going through the deprecated .dict().
    source = retrieved_docs[0].metadata.get('source', '')
    file_name = str(source).split('/')[-1]
    return HATE_CRIMES_TYPE.get(file_name, "Unknown")

In [56]:
# Step 8: Interactive Query Function
def handle_user_query(inquiry):
    """
    Classify the inquiry, ask the user which kind of help they want, and answer.

    Prints the detected hate crime type and a four-item menu, reads the choice
    from standard input, then sends a follow-up query through
    ``usafe_retrieval_chain`` and prints the answer. An unrecognised choice
    prints an error message and returns without querying.
    """
    hate_crime_label = detect_hate_crime_type(inquiry)
    print(f"\nDetected Hate Crime Type: {hate_crime_label}")

    # Menu shown to the user, one line per print call.
    menu_lines = (
        "\nWhat information would you like to access?",
        "1. Relevant Laws",
        "2. Resources Available",
        "3. Steps to Report a Crime",
        "4. General Info",
    )
    for menu_line in menu_lines:
        print(menu_line)

    choice = input("Enter your choice (1-4): ")

    # Menu choice -> request sentence appended to the follow-up query.
    request_by_choice = {
        '1': "Please provide information on relevant laws.",
        '2': "Please tell me about resources available.",
        '3': "Please explain the steps to report a hate crime.",
        '4': "Please provide some general information.",
    }
    if choice not in request_by_choice:
        print("Invalid option. Please try again.")
        return

    query = f"I faced a {hate_crime_label}. " + request_by_choice[choice]

    # Run the follow-up query and show only the answer text.
    response = usafe_retrieval_chain.invoke({"input": query})
    print("\nResponse:\n", response['answer'].strip("\n"))

In [None]:
# Interactive demo: handle_user_query() calls input(), so this cell blocks
# until a menu choice (1-4) is typed.
handle_user_query("I was harassed because of my religion. What can I do?")


Detected Hate Crime Type: Anti-religious Hate Crime

What information would you like to access?
1. Relevant Laws
2. Resources Available
3. Steps to Report a Crime
4. General Info


In [18]:
def print_output(
    inquiry,
    retrieval_chain=None
):
    """
    Run an inquiry through a retrieval chain and print the answer text.

    Args:
        inquiry: Question passed to the chain under the ``"input"`` key.
        retrieval_chain: Object with an ``invoke`` method whose result has an
            ``"answer"`` entry. When omitted, the notebook's
            ``usafe_retrieval_chain`` is used.

    The answer is printed with leading and trailing newlines stripped;
    nothing is returned.
    """
    if retrieval_chain is None:
        # Resolved at call time. The previous default named a chain that is
        # never defined in this notebook, so the def itself raised NameError
        # on a fresh kernel.
        retrieval_chain = usafe_retrieval_chain
    result = retrieval_chain.invoke({"input": inquiry})
    print(result['answer'].strip("\n"))

In [19]:
# Demo: ask a general definitional question; only the answer text is printed.
print_output("What is a hate crime?")

According to the provided context, a hate crime (also known as a bias crime) is a crime where a perpetrator targets a victim due to their physical appearance or perceived membership in a specific social group. These groups may include race, ethnicity, disability, language, nationality, political views, age, religion, sex, gender identity, or sexual orientation.


In [20]:
# Demo: a first-person inquiry; only the answer text is printed.
print_output("i was harassed because i'm a muslim, what shall i do?")

I'm so sorry to hear that you're experiencing harassment because of your religion. It's unacceptable and illegal. Here are some steps you can take:

1. Report the incident to the authorities: File a police report and provide as much detail as possible about the incident, including the date, time, location, and any witnesses. You can also report the incident to your local police department or the national hate crime hotline.
2. Seek support from a trusted organization: Reach out to a local Muslim organization, community center, or advocacy group that can provide you with emotional support, legal assistance, and guidance on how to navigate the situation.
3. Document the incident: Keep a record of the incident, including any evidence such as photos, videos, or witness statements. This can be helpful in case you need to take legal action.
4. Seek medical attention: If you were physically harmed or injured during the incident, seek medical attention immediately.
5. Consider seeking legal ac

In [21]:
def detect_hate_type(
    inquiry,
    retrieval_chain=None
):
    """
    Return the hate crime label for an inquiry.

    The inquiry is run through the retrieval chain and the file name of the
    first retrieved document's ``source`` metadata is looked up in
    HATE_CRIMES_TYPE.

    Args:
        inquiry: The user's free-text description.
        retrieval_chain: Object with an ``invoke`` method whose result has a
            ``"context"`` list of retrieved documents. When omitted, the
            notebook's ``usafe_retrieval_chain`` is used.

    Returns:
        The matching label, or "Unknown" when nothing was retrieved, the
        document has no source, or the file name is not in HATE_CRIMES_TYPE
        (for example a chunk that came from general.pdf).
    """
    if retrieval_chain is None:
        # Resolved at call time. The previous default named a chain that is
        # never defined in this notebook, so the def itself raised NameError
        # on a fresh kernel.
        retrieval_chain = usafe_retrieval_chain
    result = retrieval_chain.invoke({"input": inquiry})

    # An empty retrieval result previously raised IndexError on context[0].
    retrieved_docs = result.get('context') or []
    if not retrieved_docs:
        return "Unknown"

    source = retrieved_docs[0].metadata.get('source', '')
    file_name = str(source).split('/')[-1]
    # .get() avoids the KeyError for sources without a label of their own.
    return HATE_CRIMES_TYPE.get(file_name, "Unknown")


In [22]:
# Demo: print the label derived from the first retrieved document's source file.
print(detect_hate_type("I was attacked because of my religion"))

anti-religious hate crime


In [23]:
# NOTE(review): the inquiry is an empty string, so the label depends on
# whatever the retriever returns for empty input — presumably a placeholder
# for real user text; confirm.
hate_type=detect_hate_type("")

In [24]:
# Flags selecting which optional request sentences are added to the query;
# all of them start switched off.
relevant_laws = resources_available = steps_how_to_report_crime = general_info = False


In [25]:
    # Build the follow-up prompt. Each optional sentence is multiplied by its
    # boolean flag: `text * True` keeps the sentence, `text * False` yields "".
    # With every flag False, only the first sentence and blank lines remain.
    query = f"""
    I have been facing a hate crime of type {hate_type}.
    {'Please give me some legal advice.' * relevant_laws}
    {'Please tell me what are the local resources available.' * resources_available}
    {'Please explain the steps on how to report a hate crime.' * steps_how_to_report_crime}
    {'Please provide me with some general information.' * general_info}
    """

In [26]:
# Display the assembled prompt to inspect its exact whitespace.
query

'\nI have been facing a hate crime of type anti-religious hate crime.\n\n\n\n\n'

In [29]:
# Retrieval smoke test: inspect what the retriever returns for a sample question.
test_query = "What should I do if I am being harassed for being Muslim?"
# invoke() is the supported retriever entry point and replaces the deprecated
# get_relevant_documents(); it returns the same list of documents.
docs = combined_retriever.invoke(test_query)
print("Documents Retrieved:", docs)

Documents Retrieved: [Document(metadata={'source': '/Users/sayo/personal_projects/Usafe_bot/data/anti_religious_def.pdf', 'page': 1}, page_content='-\nA\nBlack\nMuslim\nwoman\nwas\nsubjected\nto\nracist\nand\nanti-Muslim\nthreats\nand\ninsults\non\na\ntrain.\n-\nA\nmale\nMuslim\nactivist\nreceived\ndeath\nthreats\nvia\nemail.\nThis\nwas\none\nin\na\nseries\nof\nsimilar\nincidents.\n-\nA\nmale\nMuslim\nactivist\nreceived\na\nletter\ncontaining\nanti-Muslim\nand\nxenophobic\ninsults\nand\nthreats,\nas\nwell\nas\nNazi\nsymbols.\nThis\nwas\none\nin\na\nseries\nof\nsimilar\nincidents.\n-\nA\nmale\nMuslim\nactivist\nwas\nrepeatedly\nthreatened\non\nTwitter.\nThis\nwas\none\nin\na\nseries\nof\nsimilar\nincidents.\n-\nA\nMuslim\nwoman\nwas\nsubjected\nto\nanti-Muslim\ninsults,\nthreatened\nwith\na\nknife\nand\nphysically\nassaulted\nby\na\nperpetrator\nwho\nattempted\nto\nremove\nher\nheadscarf.\n-\nA\nMuslim-owned\nshop\nwas\nvandalized\nand\na\nwindow\nwas\nshattered\nwhen\nreligious\nmusic\