In [13]:
# Import necessary libraries
import warnings
from dotenv import load_dotenv
import pdfplumber
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_groq import ChatGroq

In [14]:
# Load environment variables from a local .env file into the process environment
# NOTE(review): presumably this is where the Groq API key used by ChatGroq comes from — confirm.
load_dotenv()

True

In [15]:
# Suppress warnings
# NOTE: "ignore" with no category or module filter hides every Python warning raised
# after this point, including deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")

In [19]:
import re
from langchain.schema import Document
from langchain_community.document_loaders import PyPDFLoader

def clean_text(text):
    """
    Normalize raw text into a lowercase, ASCII-only, punctuation-free string.

    Steps, in order:
      1. Unicode-decompose (NFKD) so accented letters and ligatures keep their
         base ASCII letters instead of disappearing in step 2.
      2. Drop every character that is still non-ASCII.
      3. Lowercase.
      4. Delete punctuation. Nothing is inserted in its place, so
         "thrill-seeking" becomes "thrillseeking". Underscores are kept
         because the pattern treats them as word characters.
      5. Collapse whitespace runs to one space and trim both ends.

    Whitespace is collapsed last on purpose: deleting a character that sat
    between two spaces leaves a double space behind, which would otherwise
    survive into the result and break multi-word phrase matching.

    Args:
        text: Raw text, e.g. the content of one PDF page.

    Returns:
        The cleaned string ("" for empty or whitespace-only input).
    """
    import unicodedata  # local import: keeps the function usable on its own

    text = unicodedata.normalize('NFKD', text)  # Split accents/ligatures into base characters
    text = text.encode('ascii', 'ignore').decode()  # Remove non-ASCII characters
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs left by the removals
    return text.strip()  # Remove leading and trailing spaces

# Ordered tagging rules: the first section with at least one matching keyword
# wins, so the order of this tuple is the tagging priority.
# Keywords are written in their natural form; they are passed through
# clean_text before matching so they take the same shape as the page text
# (e.g. hyphens removed).
_SECTION_KEYWORDS = (
    ("laws", (
        "german criminal code", "stgb", "equal treatment act", "grundgesetz",
        "icerd", "article 3", "article 4",
    )),
    ("reporting_steps", (
        "steps to report", "how to report", "document the incident",
        "preserve digital evidence", "language barriers", "visit police station",
        "report online", "seek additional support",
    )),
    ("berlin_resources", (
        "berlin", "local resources", "online strafanzeige", "meldestelle respect",
        "antidiskriminierungsstelle", "hateaid", "roots berlin", "kop berlin",
        "vbrg", "gladt", "hydra", "lesmigras",
    )),
    ("general_info", (
        "hate crime", "bias crime", "history of hate crimes",
        "psychological effects", "motivation behind hate crimes",
        "thrill-seeking", "self-control theory", "violence risk appraisal",
        "psychopathy checklist", "recidivism",
    )),
)

# Section assigned when no keyword matches.
_DEFAULT_SECTION = "general"


def _normalized_section_rules():
    """Return the tagging rules with every keyword cleaned like page text."""
    rules = []
    for section, keywords in _SECTION_KEYWORDS:
        needles = [" ".join(clean_text(keyword).split()) for keyword in keywords]
        # An empty needle would match every page, so drop it defensively.
        rules.append((section, [needle for needle in needles if needle]))
    return rules


def _classify_section(cleaned_text, rules):
    """Return the first section in ``rules`` with a keyword found in the text."""
    # Collapse whitespace so multi-word keywords still match when the cleaned
    # text contains doubled spaces.
    haystack = " ".join(cleaned_text.split())
    for section, needles in rules:
        if any(needle in haystack for needle in needles):
            return section
    return _DEFAULT_SECTION


def extract_text_from_pdf_with_metadata(pdf_path):
    """
    Extract text from a PDF, clean it, and tag sections based on keywords.

    Every document returned by PyPDFLoader is cleaned with clean_text and
    tagged with the first matching section, in priority order: "laws",
    "reporting_steps", "berlin_resources", "general_info", and "general"
    when nothing matches. Matching is plain substring search on the cleaned
    text.

    Keywords are cleaned with the same function as the page text. Previously
    the hyphenated keywords ("thrill-seeking", "self-control theory") could
    never match because clean_text deletes hyphens from the page text.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        A list of Document objects, one per loaded document, whose
        page_content is the cleaned text and whose metadata holds "source"
        (the given path), "section", and "page" (taken from the loader's
        metadata, 0 when absent).
    """
    loaded_pages = PyPDFLoader(file_path=pdf_path).load()
    rules = _normalized_section_rules()
    processed_documents = []

    for page in loaded_pages:
        cleaned_text = clean_text(page.page_content)

        # Append the cleaned text with metadata
        processed_documents.append(
            Document(
                page_content=cleaned_text,
                metadata={
                    "source": pdf_path,
                    "section": _classify_section(cleaned_text, rules),
                    "page": page.metadata.get("page", 0),
                },
            )
        )

    return processed_documents

In [20]:
# Load the general PDF and turn its content into cleaned, section-tagged documents
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
general_docs = extract_text_from_pdf_with_metadata('/Users/sayo/personal_projects/Usafe_bot/data/general_one.pdf')

# Display the first two entries for review: assigned section, page number, full cleaned text
for doc in general_docs[:2]:
    print(f"Section: {doc.metadata['section']}, Page: {doc.metadata['page']}")
    print(doc.page_content)
    print("-" * 50)

Section: general_info, Page: 0
1 general information hate crime definition a hate crime also known as a bias crime is a crime where a perpetrator targets a victim due to their physical appearance or perceived membership in a specific social group such groups may include race ethnicity  disability  language nationality  political views age religion sex gender identity  or sexual orientation noncriminal actions motivated by these biases are often termed bias incidents for example hate crimes may include physical assault homicide damage to property  bullying harassment verbal abuse offensive graffiti or hate mail history of hate crimes the term hate crime gained common usage in the us during the 1980s although similar crimes have historical roots historical examples include the roman persecution of christians the nazi genocide of jews and european colonial violence against indigenous peoples in the us lynching of african americans cross burnings and attacks on minority ethnic and lgbtq co

In [21]:
# Hand-written reference snippets; each "section" value is one of the labels
# the PDF extraction logic assigns.
# Each row is (page_content, source, section).
structured_data_docs = [
    Document(page_content=content, metadata={"source": source, "section": section})
    for content, source, section in (
        # General info: definition, effects, history, motivations
        (
            "A hate crime is a crime where a perpetrator targets a victim due to perceived membership in a specific group.",
            "definitions",
            "general_info",
        ),
        (
            "Hate crimes cause trauma, depression, and fear among targeted groups.",
            "psychological_effects",
            "general_info",
        ),
        (
            "The term ‘hate crime’ became popular in the 1980s.",
            "history",
            "general_info",
        ),
        (
            "Motivations behind hate crimes include thrill-seeking, revenge, or protecting one's community.",
            "motivation",
            "general_info",
        ),

        # Laws
        (
            "The German Criminal Code includes sections addressing hate crimes under 'StGB'.",
            "german_laws",
            "laws",
        ),
        (
            "The Equal Treatment Act ensures protection against discrimination in Germany.",
            "equal_treatment_act",
            "laws",
        ),

        # Reporting steps
        (
            "To report a hate crime, document the incident, preserve digital evidence, and contact authorities.",
            "reporting_guidelines",
            "reporting_steps",
        ),
        (
            "Ensure to document any language barriers when reporting hate crimes to the authorities.",
            "language_barriers",
            "reporting_steps",
        ),

        # Berlin-specific resources
        (
            "HateAid offers support to hate crime victims in Berlin.",
            "hateaid",
            "berlin_resources",
        ),
        (
            "The Antidiskriminierungsstelle provides resources and support for discrimination cases in Berlin.",
            "antidiskriminierungsstelle",
            "berlin_resources",
        ),
        (
            "Online Strafanzeige allows for online reporting of hate crimes in Berlin.",
            "online_strafanzeige",
            "berlin_resources",
        ),
    )
]

In [22]:
# Combined list: PDF-derived documents first, then the hand-written structured ones.
# NOTE(review): no later visible cell reads all_documents; chunking works on the two lists separately.
all_documents = general_docs + structured_data_docs

In [23]:
# Sanity check: print the first 50 characters and the source tag of every structured document
for doc in structured_data_docs:
    print(f"Content: {doc.page_content[:50]}, Source: {doc.metadata['source']}")

Content: A hate crime is a crime where a perpetrator target, Source: definitions
Content: Hate crimes cause trauma, depression, and fear amo, Source: psychological_effects
Content: The term ‘hate crime’ became popular in the 1980s., Source: history
Content: Motivations behind hate crimes include thrill-seek, Source: motivation
Content: The German Criminal Code includes sections address, Source: german_laws
Content: The Equal Treatment Act ensures protection against, Source: equal_treatment_act
Content: To report a hate crime, document the incident, pre, Source: reporting_guidelines
Content: Ensure to document any language barriers when repo, Source: language_barriers
Content: HateAid offers support to hate crime victims in Be, Source: hateaid
Content: The Antidiskriminierungsstelle provides resources , Source: antidiskriminierungsstelle
Content: Online Strafanzeige allows for online reporting of, Source: online_strafanzeige


In [24]:
# Step 2: Document Chunking
def chunk_documents(documents, chunk_size=512, chunk_overlap=20):
    """
    Break documents into smaller pieces ready for embedding.

    Args:
        documents: Documents to split.
        chunk_size: Chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between neighbouring chunks, passed to the splitter.

    Returns:
        Whatever RecursiveCharacterTextSplitter.split_documents returns for
        the given documents.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

In [25]:
# Step 2: Chunk Documents
# PDF-derived documents first
general_chunks = chunk_documents(general_docs)
print(f"Chunks created from general PDF: {len(general_chunks)}")

# Then the hand-written structured documents
structured_chunks = chunk_documents(structured_data_docs)
print(f"Chunks created from structured data: {len(structured_chunks)}")

# Append the structured chunks so general_chunks holds everything that gets embedded
general_chunks.extend(structured_chunks)
print(f"Total chunks after adding structured data: {len(general_chunks)}")

Chunks created from general PDF: 21
Chunks created from structured data: 11
Total chunks after adding structured data: 32


In [26]:
# Step 3: Embedding and Vector Store Creation
def create_vector_store(general_chunks, data_base_name='usafe_general',
                        model_name='sentence-transformers/all-mpnet-base-v2'):
    """
    Creates a vector store using HuggingFace embeddings and saves it locally.

    Args:
        general_chunks: Non-empty sequence of document chunks to embed and index.
        data_base_name: Folder name for the index; it is saved under
            "./vector_databases/<data_base_name>".
        model_name: HuggingFace embedding model. Whoever loads the saved index
            later has to embed queries with the same model, so keep the two in sync.

    Returns:
        None. The index is written to disk as a side effect.

    Raises:
        ValueError: If general_chunks is empty. Without this check the failure
            would surface from inside the vector-store library instead, after
            the embedding model has already been loaded.
    """
    if not general_chunks:
        raise ValueError("general_chunks is empty: nothing to embed or index")

    embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_documents(general_chunks, embedding=embeddings_model)
    vector_store.save_local(f"./vector_databases/{data_base_name}")

In [27]:
# Create the vector store specifically for the general information
# (general_chunks holds both the PDF chunks and the structured-data chunks at this point)
create_vector_store(general_chunks, data_base_name='usafe_general')

In [14]:
# Step 4: Vector Store Retrieval
def load_vector_store(data_base_name='./vector_databases/usafe_general', k=5,
                      model_name='sentence-transformers/all-mpnet-base-v2'):
    """
    Loads a vector store from a specified path and returns a retriever.

    Args:
        data_base_name: Path of the folder holding the saved index. Despite the
            name, this is a folder path, not the bare name that
            create_vector_store takes.
        k: Number of documents the retriever is asked to return per query;
            must be at least 1.
        model_name: HuggingFace embedding model used to embed queries. It has
            to be the model the index was built with.

    Returns:
        The retriever produced by the loaded store's as_retriever(), configured
        with search_kwargs={'k': k}.

    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
    # SECURITY: allow_dangerous_deserialization=True permits unpickling of the
    # saved store. Only point this at index folders created by this project;
    # never at files from an untrusted source.
    vector_store = FAISS.load_local(
        folder_path=data_base_name,
        embeddings=embeddings_model,
        allow_dangerous_deserialization=True,
    )
    return vector_store.as_retriever(search_kwargs={'k': k})

In [15]:
# Initialize retriever for general information from the index folder saved on disk
general_info_retriever = load_vector_store('./vector_databases/usafe_general')

In [16]:
# Sanity check: show which class the loader returned
type(general_info_retriever)

langchain_core.vectorstores.base.VectorStoreRetriever

In [13]:
def connect_chains(retriever):
    """
    Connect a stuff-documents chain with a retrieval chain for ``retriever``.

    Kept for backward compatibility. This function used to be a line-for-line
    copy of setup_retrieval_chain; it now delegates to it so the chain-building
    logic exists in one place only.

    Args:
        retriever: Retriever that supplies the context documents.

    Returns:
        The retrieval chain built by setup_retrieval_chain.
    """
    # Resolved at call time, so it does not matter that setup_retrieval_chain
    # is defined in a later cell.
    return setup_retrieval_chain(retriever)

In [14]:
# Step 5: LLM and Chain Connection
def initialize_llm(model_name="llama3-8b-8192"):
    """
    Build the Groq chat model used by the bot.

    Args:
        model_name: Model identifier handed to ChatGroq as ``model``.

    Returns:
        A ChatGroq instance created with temperature 0, max_tokens and timeout
        both set to None, and max_retries set to 2.
    """
    llm_settings = {
        "model": model_name,
        "temperature": 0,
        "max_tokens": None,
        "timeout": None,
        "max_retries": 2,
    }
    return ChatGroq(**llm_settings)

In [15]:
# Load LLM
# Module-level instance: setup_retrieval_chain reads this global `llm` when it is called.
llm = initialize_llm()

def setup_retrieval_chain(retriever):
    """
    Wire a retriever to the LLM through a stuff-documents chain.

    Uses the module-level ``llm`` and the "langchain-ai/retrieval-qa-chat"
    prompt pulled from the hub on every call.

    Args:
        retriever: Retriever that supplies the context documents.

    Returns:
        The chain produced by create_retrieval_chain.
    """
    chat_model = llm  # looked up first, same as the original argument order
    qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    combine_docs_chain = create_stuff_documents_chain(llm=chat_model, prompt=qa_prompt)
    return create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=combine_docs_chain,
    )

In [16]:
# Build the retrieval chain the bot answers with, on top of the general-information retriever
usafe_retrieval_chain = setup_retrieval_chain(general_info_retriever)

In [17]:
def query_usafe_bot(user_input, retrieval_chain=None):
    """
    Queries the Usafe ChatBot with a user's input and returns the response.

    Args:
        user_input: The user's question; sent to the chain under the "input" key.
        retrieval_chain: Chain to query; it must provide ``invoke`` and return
            a mapping with an "answer" entry. When omitted (None), the
            module-level ``usafe_retrieval_chain`` is used.

            The fallback is looked up at call time on purpose. A default of
            ``retrieval_chain=usafe_retrieval_chain`` would be frozen when this
            cell runs: rebuilding the chain later would leave this function
            on the stale one, and defining it before the chain exists would
            raise NameError.

    Returns:
        The chain's "answer" value, unmodified. The stripped answer is also
        printed.
    """
    chain = usafe_retrieval_chain if retrieval_chain is None else retrieval_chain

    # Query the retrieval chain
    response = chain.invoke({"input": user_input})
    answer = response['answer']

    # Print the response for debugging
    print(f"Response:\n{answer.strip()}")

    # Return the response for further use
    return answer

In [18]:
# Smoke test: one question asking for both the reporting steps and local resources.
# The answer shows up twice in the output: printed by the function, then echoed as the cell's value.
query_usafe_bot("What are the steps to report a hate crime? and what local resources are available in Germany?")

Response:
According to the context, the steps to report a hate crime are:

1. Bring someone who can assist with translation when reporting the incident at a police station.
2. Visit Your Local Police Station: Bring all collected documentation and explain the details to the officer, emphasizing that it was a hate crime. Request a case reference number for future follow-ups.
3. Report Online: If unable to visit a police station, you can file a report online via local authorities' websites, such as the Berlin Police Online Reporting Portal.

As for local resources available in Germany, the context mentions the following:

1. Online Strafanzeige platform: allows individuals to file criminal complaints online, including hate crimes.
2. Meldestelle Respect!: offers a platform to report hate speech and receive expert analysis.
3. Antidiskriminierungsstelle des Bundes (Federal Anti-Discrimination Agency): provides information and support for victims of hate crimes.
4. Get The Trolls Out!: addr

"According to the context, the steps to report a hate crime are:\n\n1. Bring someone who can assist with translation when reporting the incident at a police station.\n2. Visit Your Local Police Station: Bring all collected documentation and explain the details to the officer, emphasizing that it was a hate crime. Request a case reference number for future follow-ups.\n3. Report Online: If unable to visit a police station, you can file a report online via local authorities' websites, such as the Berlin Police Online Reporting Portal.\n\nAs for local resources available in Germany, the context mentions the following:\n\n1. Online Strafanzeige platform: allows individuals to file criminal complaints online, including hate crimes.\n2. Meldestelle Respect!: offers a platform to report hate speech and receive expert analysis.\n3. Antidiskriminierungsstelle des Bundes (Federal Anti-Discrimination Agency): provides information and support for victims of hate crimes.\n4. Get The Trolls Out!: ad

In [19]:
# Smoke test: definition lookup
query_usafe_bot("What is the definition of a hate crime?")

Response:
According to the provided context, a hate crime (also known as a bias crime) is a crime where a perpetrator targets a victim due to their physical appearance or perceived membership in a specific social group.


'According to the provided context, a hate crime (also known as a bias crime) is a crime where a perpetrator targets a victim due to their physical appearance or perceived membership in a specific social group.'

In [20]:
# Smoke test: resources for recovering after a hate crime
query_usafe_bot("What are the steps i can follow to get resources to help me feel better after a hate crime?")

Response:
Based on the context, here are the steps you can follow to get resources to help you feel better after a hate crime:

1. Document the incident: Record as much information as possible about the incident, including dates, times, locations, and details of what happened.
2. Gather evidence: Collect any physical evidence, such as photos, videos, or witness statements, that can help support your report.
3. Prepare for language barriers: If you're not fluent in German, be prepared to seek translation support from organizations like ReachOut Berlin.
4. Contact local law enforcement: Report the incident to the police and provide them with the information and evidence you've gathered.
5. Reach out to support organizations: Contact organizations like the Agency (antidiskriminierungsstelle.de) for counseling and support, and You Are Not Alone (youarenotalone.ai) for mental health resources and coping strategies.
6. Seek additional support: If needed, reach out to organizations like Reach

"Based on the context, here are the steps you can follow to get resources to help you feel better after a hate crime:\n\n1. Document the incident: Record as much information as possible about the incident, including dates, times, locations, and details of what happened.\n2. Gather evidence: Collect any physical evidence, such as photos, videos, or witness statements, that can help support your report.\n3. Prepare for language barriers: If you're not fluent in German, be prepared to seek translation support from organizations like ReachOut Berlin.\n4. Contact local law enforcement: Report the incident to the police and provide them with the information and evidence you've gathered.\n5. Reach out to support organizations: Contact organizations like the Agency (antidiskriminierungsstelle.de) for counseling and support, and You Are Not Alone (youarenotalone.ai) for mental health resources and coping strategies.\n6. Seek additional support: If needed, reach out to organizations like ReachOu

In [21]:
# Smoke test: open-ended, first-person request for next steps
query_usafe_bot("I faced a hate crime and I need help with what to do next.")

Response:
I'm so sorry to hear that you've experienced a hate crime. It's important to prioritize your safety and well-being during this time. Here are some steps you can take:

1. Document the incident: Write down as many details as you can remember about the incident, including what happened, when it happened, and where it happened. Take photos or gather any physical evidence.
2. Gather evidence: Collect any physical evidence that may be relevant to the incident, such as witness statements, videos, or audio recordings.
3. Prepare for language barriers: If you're not fluent in the language spoken by the police or other authorities, consider bringing someone who can assist with translation.
4. Report the incident: You can report the incident to the police at a station or online through the local authorities' reporting platform. Make sure to explain that it was a hate crime and request a case reference number.
5. Seek support: Reach out to organizations that provide support for hate cri

"I'm so sorry to hear that you've experienced a hate crime. It's important to prioritize your safety and well-being during this time. Here are some steps you can take:\n\n1. Document the incident: Write down as many details as you can remember about the incident, including what happened, when it happened, and where it happened. Take photos or gather any physical evidence.\n2. Gather evidence: Collect any physical evidence that may be relevant to the incident, such as witness statements, videos, or audio recordings.\n3. Prepare for language barriers: If you're not fluent in the language spoken by the police or other authorities, consider bringing someone who can assist with translation.\n4. Report the incident: You can report the incident to the police at a station or online through the local authorities' reporting platform. Make sure to explain that it was a hate crime and request a case reference number.\n5. Seek support: Reach out to organizations that provide support for hate crime 

In [22]:
# Smoke test: psychological effects on victims
query_usafe_bot("what are the psychological effects of a hate crime on the victim?")

Response:
According to the context, the psychological effects of a hate crime on the victim may include:

1. Trauma
2. Depression
3. Low self-esteem
4. Increased fear and vulnerability


'According to the context, the psychological effects of a hate crime on the victim may include:\n\n1. Trauma\n2. Depression\n3. Low self-esteem\n4. Increased fear and vulnerability'

In [23]:
# Smoke test: motivations behind hate crimes
query_usafe_bot("What are the motivations behind hate crimes?")

Response:
According to the context, the primary motives behind hate crimes, as identified by sociologists like Jack McDevitt and Jack Levin, include:

1. Thrill-seeking (committed for excitement, often by groups targeting vulnerable individuals)
2. Defensive motives (protecting one's community from perceived threats)
3. Retaliatory motives (revenge for perceived offenses)
4. Mission-oriented crimes (ideologically motivated, often targeting symbolically significant sites)

Additionally, the Self-Control Theory suggests that social, cultural, and individual factors contribute to these biases, and offenders often develop biases through social interactions and cultural influences.


"According to the context, the primary motives behind hate crimes, as identified by sociologists like Jack McDevitt and Jack Levin, include:\n\n1. Thrill-seeking (committed for excitement, often by groups targeting vulnerable individuals)\n2. Defensive motives (protecting one's community from perceived threats)\n3. Retaliatory motives (revenge for perceived offenses)\n4. Mission-oriented crimes (ideologically motivated, often targeting symbolically significant sites)\n\nAdditionally, the Self-Control Theory suggests that social, cultural, and individual factors contribute to these biases, and offenders often develop biases through social interactions and cultural influences."