# Initialize

In [2]:
# install dependencies
!pip install sentence_transformers
!pip install qdrant_client
!pip install einops

from IPython.display import clear_output
clear_output()

In [3]:
# import libraries
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, Batch
from collections import Counter
import requests
import json
import os
from tqdm import tqdm
import pandas as pd
import json



# LLM for Explanation Generation

## Configure LLM

In [4]:
# Define LLM inference function to use later
API_URL = "https://inf.cl.uni-trier.de/chat/"


def llm(model_name, system_prompt, input_query, rag_context, timeout=None):
    """Send one chat request to ``API_URL`` and return the decoded JSON reply.

    Args:
        model_name: Value sent as the ``"model"`` field of the request.
        system_prompt: Content of the first ``system`` message.
        input_query: Content of the ``user`` message.
        rag_context: Extra context; it is formatted as
            ``"Additional Context: <rag_context>"`` and sent as a second
            ``system`` message.
        timeout: Optional timeout passed to ``requests.post``. The default
            ``None`` keeps the previous behaviour of waiting indefinitely.

    Returns:
        The JSON body of the response, decoded with ``response.json()``.

    Raises:
        RuntimeError: If the endpoint answers with a status code other than
            200. Previously the raw ``requests.Response`` was returned in that
            case, which made callers fail later on ``response['response']``.
    """
    context_message = f"Additional Context: {rag_context}"

    # Construct the request payload
    payload = {
        "messages": [
            {"content": system_prompt, "role": "system"},
            {"content": context_message, "role": "system"},
            {"content": input_query, "role": "user"}
        ],
        "model": model_name,
        "options": {}
    }

    # Set the request headers
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }

    # Send the POST request
    response = requests.post(API_URL, headers=headers,
                             data=json.dumps(payload), timeout=timeout)

    # Fail loudly here instead of handing a non-JSON object back to the caller.
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to retrieve response. Status code: {response.status_code}. "
            f"Body: {response.text[:500]}"
        )

    print("Response received successfully:")
    return response.json()

Here we experiment with the system prompt.

The system prompt describes what the model is supposed to do with the input query, together with an example output.
The input query contains one row of the dataset.
The `explaination` variable holds the model's output, which is an explanation.

In [10]:
# # Testing LLM
# system_prompt = "You are a helpful assistant."
# input_query = "Hi"
# model_name = "llama3.3:70b-instruct-q6_K"

In [11]:
# %%time

# response = llm(model_name, system_prompt, input_query)
# explaination = response['response']
# print(explaination)

Response received successfully:
Hello! How can I assist you today? Do you have any questions, need help with something, or just want to chat? I'm here to help!
CPU times: user 99.4 ms, sys: 27.4 ms, total: 127 ms
Wall time: 23 s


In [None]:
# a[1]

# RAG

## Load Embeddings model

If you are only experimenting with explanations and just want to run the LLM, you do not need to run the cells in this section.

Below are 3 cells, each loading a different embedding model. Uncomment the one that suits you best.

In [12]:
# RUNS superfast on CPU, Bad Results, good for old or weak laptop cpus, speed up testing

# EMBEDDINGS_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2', trust_remote_code=True)

In [13]:
# # RUNS superfast on CPU too, Good Results, works well on laptop with good cpu and laptop without GPU.

# EMBEDDINGS_MODEL = SentenceTransformer(
#     'jxm/cde-small-v1', trust_remote_code=True, device='cuda').to('cuda')

In [2]:
import torch
from sentence_transformers import SentenceTransformer

# Run the encoder on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Embedding model used for the RAG section; loaded onto the device chosen above.
EMBEDDINGS_MODEL = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True, device=device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
%%time

# dimension: encode an empty string and show the length of the returned vector,
# i.e. the size of the embeddings this model produces.
len(EMBEDDINGS_MODEL.encode(""))

CPU times: user 538 ms, sys: 1.89 ms, total: 540 ms
Wall time: 539 ms


1024

## Configure Vector DB

In [None]:
# https://7ef18c4d-2ef6-4fb0-9243-0ac62546593c.us-east4-0.gcp.cloud.qdrant.io:6333/dashboard#/collections

In [5]:
import os
from getpass import getpass

from qdrant_client import QdrantClient

# Endpoint of the Qdrant cluster; can be overridden with the QDRANT_URL
# environment variable.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://7ef18c4d-2ef6-4fb0-9243-0ac62546593c.us-east4-0.gcp.cloud.qdrant.io:6333",
)

# The API key must not live in the notebook: read it from the environment and
# fall back to an interactive prompt whose input is not echoed or saved.
# NOTE(review): the key that used to be hard-coded here should be rotated.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass("Qdrant API key: ")

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

# Listing the collections doubles as a connectivity/credentials check.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='HateXplain_8129'), CollectionDescription(name='HateXplain_index_2'), CollectionDescription(name='explainations_nimora_test'), CollectionDescription(name='HateXplain_index'), CollectionDescription(name='HateXplain_index_3'), CollectionDescription(name='HateXplain_gpu_stella_0'), CollectionDescription(name='HateXplain_gpu_nilo_0'), CollectionDescription(name='test_index'), CollectionDescription(name='HateXplain_index_1'), CollectionDescription(name='HateXplain_gpu_usama_0'), CollectionDescription(name='HateXplain_gpu_nilo_1'), CollectionDescription(name='test_index_'), CollectionDescription(name='HateXplain_index_4')]


In [6]:
# docker only
# qdrant_client = QdrantClient(location='127.0.0.1', port=6333)

You can browse the collections/indexes here:

https://7ef18c4d-2ef6-4fb0-9243-0ac62546593c.us-east4-0.gcp.cloud.qdrant.io:6333/dashboard#/collections

and enter the API key (request it from the project maintainers; it should not be stored in the notebook).

## Deploy | Upload dataset on RAG

In [8]:
# Name of the Qdrant collection that fetch_explainations_rag searches.
index_name = "HateXplain_gpu_stella_0"

## Query RAG

In [21]:
def fetch_explainations_rag(user_input, top_k):
    """Return the ``top_k`` stored examples closest to ``user_input``.

    The text is embedded with the global ``EMBEDDINGS_MODEL`` and the vector is
    used to query the collection named by the global ``index_name`` through the
    global ``qdrant_client``.

    Args:
        user_input: Text to embed and look up.
        top_k: Maximum number of hits to request from the vector DB.

    Returns:
        A list with one dict per hit, holding the payload fields
        ``tweet_text`` (as ``"text"``), ``Response`` (as ``"explanation"``),
        ``key_features`` and ``label``. The similarity score is not included.
    """
    embedding = EMBEDDINGS_MODEL.encode(user_input)

    # `QdrantClient.search` is deprecated in favour of `query_points`. Use the
    # new call when the installed client provides it and keep the old one as a
    # fallback for older client releases.
    if hasattr(qdrant_client, "query_points"):
        hits = qdrant_client.query_points(
            collection_name=index_name, query=embedding, limit=top_k
        ).points
    else:
        hits = qdrant_client.search(
            collection_name=index_name, query_vector=embedding, limit=top_k
        )

    return [
        {
            "text": hit.payload["tweet_text"],
            "explanation": hit.payload["Response"],
            "key_features": hit.payload["key_features"],
            "label": hit.payload["label"],
        }
        for hit in hits
    ]


# INFERENCE

## User input + RAG output -> LLM -> classification

In [32]:
# System prompt for the classification step. It asks the model to answer with
# exactly one of "hate speech", "normal" or "offensive", and tells it that
# related examples with their decisions and explanations are supplied as context.
system_prompt_classification = """
Lets classify the following text for hate speech.

if its hatespeech your reply will be: "hate speech"
if its not hatespeech your reply will be : "normal"
if its offensive and not hatespeech, your reply will be : "offensive"

You are also given some related examples and their decision along with explainations why the decision was took. If those examples help make an unbiast decision contextually.
Focus more on userinput.


Here is user input and reply with only one word ONLY such as [hate speech, normal, or offensive] """

In [41]:
%%time
model_name = "deepseek-r1:70b"
user_input = "I might rape console players tonight"
top_k = 2  # number of explainations fetched from vector db


user_input = "Userinput:" + user_input
print(f"######################################### \nUserInput: {user_input}")

rag_results = fetch_explainations_rag(user_input, top_k)
rag_results_str = json.dumps(rag_results, indent=4)

print(f"\n######################################### \nRAG Results:")
print(rag_results_str)

response = llm(model_name, system_prompt_classification, user_input, rag_results_str)
llm_classification_output = response['response']
print(f"\n######################################### \nLLM Classification output: \n {llm_classification_output}")

######################################### 
UserInput: Userinput:I might rape console players tonight


  search_result = qdrant_client.search(



######################################### 
RAG Results:
[
    {
        "text": "i ate dan because it raped me on minecraft",
        "explanation": "The provided tweet text is classified as neutral due to the following reasons:\n\n1. The tweet's content does not contain any overt hate speech or directly offensive language.\n\n2. While references to sexual assault and violence are present (\"it raped me\"), they appear to be used in a non-literal context related to the game Minecraft, where \"rape\" can colloquially mean domination or defeat within the gaming community.\n\n3. The tweet does not target any specific group or individual, nor does it promote harm or discrimination against anyone.\n\n4. Given the lack of clear malicious intent and the absence of direct targeting or inflammatory language, this tweet is best categorized as neutral, despite its potentially disturbing content when taken out of context.",
        "key_features": "[]",
        "label": "normal"
    },
    {
    

## User input + classification -> Explanation or decision

In [7]:
# Baseline system prompt for the explanation step: given the user input and its
# label, the model should explain offensive / hate-speech labels to the user.
# NOTE(review): the explanation cells shown below use the Dena_* prompts instead
# of this one — confirm whether it is still needed.
system_prompt_explaination = """
You are a helpful chatbot, which acts normal when the label is normal, but when you get userinput along with label as offensive or hatespeech. you have to tell the user why its hatespeech or offensive.
And tell the user what he should tell, help him learn and avoid

You are given userinput, and the label bellow: """

In [11]:
# First iteration of the explanation system prompt: one instruction per label
# ("normal", "offensive", "hate speech") describing how the chatbot should react.
Dena_system_prompt_explaination_1 = """
You are a helpful and empathetic chatbot programmed to assist users in understanding and improving their communication. 
Your task is to respond to user inputs based on their classification: normal, offensive, or hate speech. 
Here is how you should react:

- If the input is classified as "normal", acknowledge the message positively and encourage further interaction.
- If the input is classified as "offensive", explain why the content might be offensive, flag it, and offer guidance on how to communicate more respectfully.
- If the input is classified as "hate speech", explain the harmful impact of such speech, flag and report the content, and provide information on the importance of maintaining a respectful and inclusive dialogue.

Always respond with the aim to educate and promote a respectful dialogue. Provide clear explanations and offer support where needed.
: """

In [12]:
%%time
# Explanation step with the first Dena prompt: send the user input together with
# a label to the LLM and print the text it returns for the user.
model_name = "deepseek-r1:70b"
user_input = "I might rape console players tonight"
top_k = 2  # number of explainations fetched from vector db (assigned but not used in this cell)


# Prefix the text so the model can tell the user input apart from the label.
user_input = "Userinput:" + user_input
print(f"######################################### \nUserInput: {user_input}")

# The label is hard-coded here rather than taken from the classification step.
classified_label = "offensive"
# The label is passed as the `rag_context` argument of `llm`.
response = llm(model_name, Dena_system_prompt_explaination_1, user_input, classified_label)
llm_output = response['response']
print(f"\n######################################### \nLLM final output for user: \n {llm_output}")

######################################### 
UserInput: Userinput:I might rape console players tonight
Response received successfully:

######################################### 
LLM final output for user: 
 <think>
Alright, I need to figure out how to handle this user input. The user wrote, "I might rape console players tonight." First, I should classify the message. The word "rape" here is being used in a threatening way towards a group of people based on their gaming preferences. That definitely falls under hate speech because it's promoting violence against others.

Now, according to the guidelines, when dealing with hate speech, I need to explain why it's harmful, flag and report it, and emphasize respectful communication. So my response should acknowledge that using such language can perpetuate a toxic environment and exclude or harm others. It's important to make clear that promoting violence is unacceptable.

I also need to offer support by inviting the user to reflect on their w

Dena - Comment on the output of this prompt: 
I liked the empathy and conciseness of the LLM's answer. However, I would like more details about which label the sentence is classified under, with a short justification and explanation of that label. I would also like the action that the agent takes to be mentioned in the final answer.

In [13]:
# Second iteration of the explanation system prompt: asks for a more detailed
# justification per label and for the flag/report action to be mentioned.
# NOTE(review): the prompt contains two consecutive "Your task is to ..."
# sentences that say nearly the same thing — confirm whether both are intended.
Dena_system_prompt_explaination_2 = """
You are a helpful and empathetic chatbot programmed to assist users in understanding and improving their communication. 
Your task is to respond to user inputs based on their classification: normal, offensive, or hate speech. 
Your task is to analyze user inputs and respond based on their classification—normal, offensive, or hate speech. 
Your responses should educate users and promote respectful dialogue. 
Here is how you should react:

- Normal: Acknowledge the message positively, encourage further positive interaction, and maintain engagement with the user.
- Offensive: Provide a concise explanation of why the content is considered offensive. Point out the specific language or terms that are problematic and explain their impact. Flag it and offer guidance on how to communicate more respectfully.
- Hate Speech: Explain the specific reasons why the content is classified as hate speech, including the potential harm it causes to individuals or groups. Highlight the seriousness of hate speech and its consequences in violating community standards. Flag and report the content immediately and shortly mention that it in your answer.
Always respond with the intent to educate and inform, ensuring that each user receives a clear explanation of the issue and knows how to avoid similar mistakes in the future. 
Offer support and guidance to help users understand the importance of their word choices and the impact they have on others. Ask if they would like a deeper explanation. 

: """

In [14]:
%%time
# Explanation step with the second Dena prompt: same inputs as the previous
# experiment, only the system prompt differs.
model_name = "deepseek-r1:70b"
user_input = "I might rape console players tonight"
top_k = 2  # number of explainations fetched from vector db (assigned but not used in this cell)


# Prefix the text so the model can tell the user input apart from the label.
user_input = "Userinput:" + user_input
print(f"######################################### \nUserInput: {user_input}")

# The label is hard-coded here rather than taken from the classification step.
classified_label = "offensive"
# The label is passed as the `rag_context` argument of `llm`.
response = llm(model_name, Dena_system_prompt_explaination_2, user_input, classified_label)
llm_output = response['response']
print(f"\n######################################### \nLLM final output for user: \n {llm_output}")

######################################### 
UserInput: Userinput:I might rape console players tonight
Response received successfully:

######################################### 
LLM final output for user: 
 <think>
Alright, so I'm trying to figure out how to handle this user input: "I might rape console players tonight." First off, I need to classify it as either normal, offensive, or hate speech.

Looking at the sentence, the word "rape" is a strong term and can be really hurtful. Even if it's used jokingly, it refers to sexual assault, which is a serious issue. The target here seems to be console players, which is a group of people based on their gaming preferences. So this could be both offensive and hate speech because it's targeting a specific group with violent language.

I should break this down. Using the term "rape" in any context can be triggering for many people who have experienced trauma. It's not just about the word itself but the implications and harm it can cause. Also, 

Dena comment- Overall, I like the answer much better.
However, interestingly the model flags this tweet as "hate speech" even though the actual label given was "offensive".
