In [1]:
# Import necessary libraries
import warnings
from dotenv import load_dotenv
import pdfplumber
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_groq import ChatGroq

In [2]:
# Load environment variables
load_dotenv()
warnings.filterwarnings("ignore")

In [3]:
# Define the hate crimes types

HATE_CRIMES_TYPE = {
    'anti_religious_def.pdf': 'Anti-religious Hate Crime',
    'racist_def.pdf': 'Racist and Xenophobic Hate Crime',
    'gender_lgbt_def.pdf': 'Gender and LGBTQ+ Hate Crime'
}

In [4]:
# Step 1: Load PDFs using pdfplumber
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF using pdfplumber and returns a list of Document objects.
    """
    extracted_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                extracted_text += page_text + "\n"
    return [Document(page_content=extracted_text, metadata={"source": pdf_path})]

In [5]:
# Load PDFs for each category
anti_religious_docs = extract_text_from_pdf('/Users/sayo/personal_projects/Usafe_bot/data/anti_religious_def.pdf')
gender_lgbt_docs = extract_text_from_pdf('/Users/sayo/personal_projects/Usafe_bot/data/gender_lgbt_def.pdf')
general_docs = extract_text_from_pdf('/Users/sayo/personal_projects/Usafe_bot/data/general.pdf')
racist_docs = extract_text_from_pdf('/Users/sayo/personal_projects/Usafe_bot/data/racist_def.pdf')

In [6]:
# Step 2: Chunk Documents
def chunk_documents(documents, chunk_size=800, chunk_overlap=80):
    """
    Splits documents into smaller chunks for embedding.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)


In [7]:
# Chunk each document category
anti_religious_chunks = chunk_documents(anti_religious_docs)
gender_lgbt_chunks = chunk_documents(gender_lgbt_docs)
general_chunks = chunk_documents(general_docs)
racist_chunks = chunk_documents(racist_docs)

# Print the number of chunks for each document
print(f"Number of chunks in anti-religious document: {len(anti_religious_chunks)}")
print(f"Number of chunks in gender/LGBT document: {len(gender_lgbt_chunks)}")
print(f"Number of chunks in general document: {len(general_chunks)}")
print(f"Number of chunks in racist document: {len(racist_chunks)}")

# Calculate total number of chunks across all documents
total_chunks = len(general_chunks) + len(racist_chunks) + len(gender_lgbt_chunks) + len(anti_religious_chunks)
print(f"Total number of chunks: {total_chunks}")

Number of chunks in anti-religious document: 202
Number of chunks in gender/LGBT document: 52
Number of chunks in general document: 18
Number of chunks in racist document: 45
Total number of chunks: 317


In [8]:
all_chunks = anti_religious_chunks + gender_lgbt_chunks + general_chunks + racist_chunks

In [9]:
# Step 3: Create Embedding Vector Store
def create_vector_store(chunks, db_name='usafe_combined'):
    """
    Creates a vector store using HuggingFace embeddings and saves it locally.
    """
    embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    vector_store = FAISS.from_documents(chunks, embedding=embedding_model)
    vector_store.save_local(f"./vector_databases/{db_name}")
    print(f"Vector store '{db_name}' created and saved.")

In [10]:
# Create vector store for combined data
create_vector_store(all_chunks, db_name='usafe_combined')

Vector store 'usafe_combined' created and saved.


In [11]:
# Step 4: Load Vector Store and Create Retriever
def load_vector_store(db_path='./vector_databases/vector_db_usafe_combined'):
    """
    Loads a vector store from a local directory and returns a retriever.
    """
    embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    vector_store = FAISS.load_local(folder_path=db_path, embeddings=embedding_model, allow_dangerous_deserialization=True)
    return vector_store.as_retriever()

In [12]:
# Initialize retriever for combined data
combined_retriever = load_vector_store()

In [13]:
type(combined_retriever)

langchain_core.vectorstores.base.VectorStoreRetriever

In [14]:
#with open('/Users/sayo/personal_projects/Usafe_bot/data/usafe_prompt.txt', 'r') as file:
    #user_prompt = file.read()

#print(user_prompt)

## Generation 

[`create_stuff_documents_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.create_stuff_documents_chain.html#langchain.chains.combine_documents.stuff.create_stuff_documents_chain)

- takes a list of documents and formats them all into a prompt, then passes that prompt to an LLM
- passes ALL documents, so you should make sure it fits within the context window of the LLM being used

[`create_retrieval_chain`](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval.create_retrieval_chain.html#langchain.chains.retrieval.create_retrieval_chain)

- takes in a user inquiry, which is then passed to the retriever to fetch relevant documents
- those documents (and original inputs) are then passed to an LLM to generate a response

In [15]:
def connect_chains(retriever):
    """
    this function connects stuff_documents_chain with retrieval_chain
    """
    stuff_documents_chain = create_stuff_documents_chain(
        llm=llm,
        prompt=hub.pull("langchain-ai/retrieval-qa-chat")
    )
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=stuff_documents_chain
    )
    return retrieval_chain

In [16]:
# Step 5: Initialize LLM
def initialize_llm(model_name="llama3-8b-8192"):
    """
    Initializes the LLM model with specified configurations.
    """
    return ChatGroq(model=model_name, temperature=0.02, max_tokens=None, timeout=None, max_retries=2)

llm = initialize_llm()


In [17]:
# Step 6: Setup Retrieval Chain
def setup_retrieval_chain(retriever):
    """
    Sets up the document chain and retrieval chain.
    """
    stuff_chain = create_stuff_documents_chain(llm=llm, prompt=hub.pull("langchain-ai/retrieval-qa-chat"))
    return create_retrieval_chain(retriever=retriever, combine_docs_chain=stuff_chain)

In [18]:
usafe_retrieval_chain = setup_retrieval_chain(combined_retriever)

In [25]:
# Step 7: Detect Hate Crime Type
def detect_hate_crime_type(inquiry, retrieval_chain=usafe_retrieval_chain):
    """
    Detects the type of hate crime based on user input
    """
    result = retrieval_chain.invoke({"input": inquiry})
    detected_type = HATE_CRIMES_TYPE.get(result['context'][0].dict()['metadata']['source'].split('/')[-1], "Unknown")
    return detected_type

In [32]:
print(detect_hate_crime_type("I was attacked because of my ethnicity."))

Racist and Xenophobic Hate Crime


In [42]:
# Step 8: Interactive Query Function
def handle_user_query(inquiry):
    """
    Handles user query by detecting hate crime type and offering options.
    """
    detected_type = detect_hate_crime_type(inquiry)
    print(f"\nDetected Hate Crime Type: {detected_type}")

    # Present user options
    print("\nWhat information would you like to access?")
    print("1. Relevant Laws Germany")
    print("2. Local Resources and Support")
    print("3. Steps to Report a Crime in Germany")
    print("4. Generic Information")
    
    option = input("Enter your choice (1-4): ")

    # using the option selected by the user to query the pdf
    pdf_query = ""
    if option == '1':
        pdf_query = "Relevant laws related to hate crimes in Germany"
    elif option == '2':
        pdf_query = "Local resources: NGOs, Legal Aid, Counseling, etc to support hate crime victims"
    elif option == '3':
        pdf_query = "Steps on how to report a hate crime in Germany"
    elif option == '4':
        pdf_query = "General information on hate crimes, psychological effects, and resources"
    else:
        print("Invalid option. Please try again.")
        return
    
     # Add metadata to specify the PDF file you want to query
    metadata_filter = {"pdf_name": "general.pdf"}

    # Retrieve and print response from the vector store
    response = usafe_retrieval_chain.invoke({"input": pdf_query})
    print("\nResponse:\n", response['answer'].strip("\n"))

In [43]:
handle_user_query("I was attacked because by nazi group")


Detected Hate Crime Type: Anti-religious Hate Crime

What information would you like to access?
1. Relevant Laws Germany
2. Local Resources and Support
3. Steps to Report a Crime in Germany
4. Generic Information

Response:
 Based on the provided context, here are some local resources that can support hate crime victims in Germany:

1. NGOs:
	* ReachOut Berlin: Offers assistance for individuals facing hate crime incidents. Contact: info@reachoutberlin.de, Address: Oranienburger Str. 27, 10117 Berlin
2. Legal Aid:
	* Antidiskriminierungsstelle des Bundes (Federal Anti-Discrimination Agency): Offers counseling and support for those facing discrimination, including hate crimes. Contact: [insert contact information]
3. Counseling:
	* Antidiskriminierungsstelle des Bundes (Federal Anti-Discrimination Agency): Provides information on rights and connects individuals with local support. Contact: [insert contact information]
4. Online Reporting Platforms:
	* Online Strafanzeige (Online Crimina

In [39]:
# Step 8: Interactive Query Function (No Response Retrieval)
def handle_user_query(inquiry):
    """
    Handles user query by detecting hate crime type and offering options.
    """
    detected_type = detect_hate_crime_type(inquiry)
    print(f"\nDetected Hate Crime Type: {detected_type}")

    # Present user options
    print("\nWhat information would you like to access?")
    print("1. Relevant Laws Germany")
    print("2. Local Resources and Support")
    print("3. Steps to Report a Crime in Germany")
    print("4. Generic Information")
    
    option = input("Enter your choice (1-4): ")

    # Confirm the selected option without invoking a response
    if option == '1':
        print("\nYou selected: Relevant Laws Germany.")
    elif option == '2':
        print("\nYou selected: Local Resources and Support.")
    elif option == '3':
        print("\nYou selected: Steps to Report a Crime in Germany.")
    elif option == '4':
        print("\nYou selected: Generic Information.")
    else:
        print("Invalid option. Please try again.")

In [40]:
handle_user_query("I was attacked because of my sexual orientation. What can I do?")


Detected Hate Crime Type: Gender and LGBTQ+ Hate Crime

What information would you like to access?
1. Relevant Laws Germany
2. Local Resources and Support
3. Steps to Report a Crime in Germany
4. Generic Information

You selected: Local Resources and Support.


In [None]:
def detect_hate_type(
    inquiry,
    retrieval_chain=react_retrieval_chain
):
    result = retrieval_chain.invoke({"input": inquiry})
    hate_type = HATE_CRIMES_TYPE[result['context'][0].dict()['metadata']['source'].split('/')[-1]]
    return hate_type


In [None]:
print(detect_hate_type("I was attacked because of my religion"))

anti-religious hate crime


In [None]:
hate_type=detect_hate_type("")

In [None]:
relevant_laws = False
resources_available = False
steps_how_to_report_crime = False 
general_info = False


In [None]:
    query = f"""
    I have been facing a hate crime of type {hate_type}.
    {'Please give me some legal advice.' * relevant_laws}
    {'Please tell me what are the local resources available.' * resources_available}
    {'Please explain the steps on how to report a hate crime.' * steps_how_to_report_crime}
    {'Please provide me with some general information.' * general_info}
    """

In [None]:
query

'\nI have been facing a hate crime of type anti-religious hate crime.\n\n\n\n\n'

In [None]:
test_query = "What should I do if I am being harassed for being Muslim?"
docs = combined_retriever.get_relevant_documents(test_query)
print("Documents Retrieved:", docs)

Documents Retrieved: [Document(metadata={'source': '/Users/sayo/personal_projects/Usafe_bot/data/anti_religious_def.pdf', 'page': 1}, page_content='-\nA\nBlack\nMuslim\nwoman\nwas\nsubjected\nto\nracist\nand\nanti-Muslim\nthreats\nand\ninsults\non\na\ntrain.\n-\nA\nmale\nMuslim\nactivist\nreceived\ndeath\nthreats\nvia\nemail.\nThis\nwas\none\nin\na\nseries\nof\nsimilar\nincidents.\n-\nA\nmale\nMuslim\nactivist\nreceived\na\nletter\ncontaining\nanti-Muslim\nand\nxenophobic\ninsults\nand\nthreats,\nas\nwell\nas\nNazi\nsymbols.\nThis\nwas\none\nin\na\nseries\nof\nsimilar\nincidents.\n-\nA\nmale\nMuslim\nactivist\nwas\nrepeatedly\nthreatened\non\nTwitter.\nThis\nwas\none\nin\na\nseries\nof\nsimilar\nincidents.\n-\nA\nMuslim\nwoman\nwas\nsubjected\nto\nanti-Muslim\ninsults,\nthreatened\nwith\na\nknife\nand\nphysically\nassaulted\nby\na\nperpetrator\nwho\nattempted\nto\nremove\nher\nheadscarf.\n-\nA\nMuslim-owned\nshop\nwas\nvandalized\nand\na\nwindow\nwas\nshattered\nwhen\nreligious\nmusic\