The purpose of this notebook is to experiment with how the explanations are created.
In section 1, we first initialize the building blocks (loading the LLM, connecting to the vector DB, and loading an embeddings model).
In section 2, we experiment with and run this flow.

# Initialize

In [None]:
# install dependencies
!pip install sentence_transformers 
!pip install qdrant_client

In [1]:
# import libraries
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, Batch
import requests
import json
import os
from tqdm import tqdm
import pandas as pd
import json

# Change to the parent directory of the notebook.
# os.chdir('..') is relative to the current working directory, so running it
# again would climb one level higher each time. The flag makes re-running this
# cell in the same kernel a no-op.
if not globals().get("_CWD_MOVED_TO_PARENT", False):
    os.chdir('..')
    _CWD_MOVED_TO_PARENT = True

## 

## Load Embeddings model

In [2]:
# EMBEDDINGS_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2', trust_remote_code=True)

In [2]:
# GPU-only cell: run it only if you have CUDA (an NVIDIA GPU with 9 GB of VRAM available).
# trust_remote_code=True is passed through to the model loader unchanged.
_GPU_MODEL_ID = "dunzhang/stella_en_1.5B_v5"
_gpu_model = SentenceTransformer(_GPU_MODEL_ID, trust_remote_code=True)
EMBEDDINGS_MODEL = _gpu_model.cuda()

In [3]:
%%time

# Encode one short string to see how long a single embedding takes on this machine.
EMBEDDINGS_MODEL.encode("# testing how much time embeddings model take on your system")

CPU times: total: 281 ms
Wall time: 262 ms


array([-1.3991723,  2.6841042,  2.4770339, ..., -1.4864485,  1.8343586,
        1.2716196], dtype=float32)

## Configure Vector DB

In [4]:
from qdrant_client import QdrantClient

# Connection settings are read from the environment so that no credential is
# stored in the notebook. Set QDRANT_API_KEY (and optionally QDRANT_URL) before
# starting Jupyter. The API key that used to be hard-coded in this cell has
# been exposed through the notebook and should be rotated.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://7ef18c4d-2ef6-4fb0-9243-0ac62546593c.us-east4-0.gcp.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    # Fail here with a clear message instead of with an authentication error later.
    raise RuntimeError(
        "QDRANT_API_KEY is not set; export it before running this cell."
    )

qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Listing the collections doubles as a connectivity check.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='HateXplain_index_2'), CollectionDescription(name='test_index_'), CollectionDescription(name='HateXplain_index_3'), CollectionDescription(name='test_index'), CollectionDescription(name='HateXplain_index_1')]


In [5]:
def create_index(index_name):
    """Create a Qdrant collection sized for the current embeddings model.

    The vector size is the length of the embedding that EMBEDDINGS_MODEL
    produces for an empty string, and vectors are compared with Distance.DOT.

    Args:
        index_name: Name of the collection to create.
    """
    embedding_dim = len(EMBEDDINGS_MODEL.encode(""))
    vector_params = VectorParams(size=embedding_dim, distance=Distance.DOT)
    qdrant_client.create_collection(
        collection_name=index_name,
        vectors_config=vector_params,
    )

## Configure LLM

In [6]:
API_URL = "https://inf.cl.uni-trier.de/chat/"

def llm(model_name, system_prompt, input_query, timeout=None):
    """Send a system prompt and a user query to the chat endpoint at API_URL.

    Args:
        model_name: Model identifier sent in the "model" field of the request.
        system_prompt: Content of the "system" message.
        input_query: Content of the "user" message.
        timeout: Optional timeout in seconds forwarded to requests.post.
            None (the default) sets no timeout, as before.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
            Previously the raw response object was returned in that case, so
            callers subscripting the result failed with an unrelated error.
    """
    # Construct the request payload
    payload = {
        "messages": [
            {"content": system_prompt, "role": "system"},
            {"content": input_query, "role": "user"}
        ],
        "model": model_name,
        "options": {}
    }

    # Set the request headers
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }

    # Send the POST request
    response = requests.post(
        API_URL, headers=headers, data=json.dumps(payload), timeout=timeout
    )

    # Surface failures at the call site, with the start of the body for context.
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to retrieve response. Status code: {response.status_code}. "
            f"Body: {response.text[:500]}"
        )

    print("Response received successfully:")
    return response.json()


# Experiments

Here we experiment with the system prompt.

The system prompt describes what the model is supposed to do with the input query, together with an example output.
The input query contains one row of the dataset.
The `explaination` variable holds the output of the model, which will be an explanation.

In [7]:
# Make changes here.
# The model to query, the instruction it receives, and the user message.
model_name = "llama3.3:70b-instruct-q6_K"
system_prompt = "You are a helpful assistant."
input_query = "Hi"

In [8]:
# Send the system prompt and the user query to the selected model.
response = llm(model_name, system_prompt, input_query)
# The generated text is read from the 'response' key of the returned JSON.
explaination = response['response']
print(explaination)

Response received successfully:
How can I assist you today? Do you have any questions, need help with something, or just want to chat? I'm here to help!


### Load Dataset and convert to DF

In [7]:
# Load the dataset.json file
# The encoding is given explicitly because open() otherwise falls back to a
# platform-dependent default, which is not UTF-8 on every system.
file_path = 'Data/dataset.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


In [8]:

# # Convert the JSON structure into a pandas DataFrame
# rows = []
# for row in data.items():
#     post_id = row[1]['post_id']
#     post_tokens = row[1]['post_tokens']
#     rationales =  row[1]['rationales']
#     # annotators =  row[1]['annotators']
#     tweet_text = " ".join(post_tokens)

    
    
#     # post_tokens = details.get("post_tokens", [])
#     # rationales = details.get("rationales", [])
    
#     # Aggregate all rationales
#     combined_rationale = [any(rat[i] for rat in rationales) for i in range(len(post_tokens))] if rationales else [0] * len(post_tokens)
    
#     # Extract key features
#     key_features = [post_tokens[i] for i, is_key in enumerate(combined_rationale) if is_key]

#     row_data = {
#         "post_id": post_id,
#         "tweet_text": tweet_text,
#         "key_features": key_features,
#     }

#     annotator_data = {}
#     for annotator in annotators:
#         annotator_id = annotator['annotator_id']
#         annotator_data[f"annotator_{annotator_id}_label"] = annotator['label']
#         annotator_data[f"annotator_{annotator_id}_target"] = ", ".join(annotator['target']) if annotator['target'] else None
    
#     row_data.update(annotator_data)
    
#     rows.append(row_data)
    
    
#     # for annotator in post_data["annotators"]:
#     #   rows.append({
#     #        "post_id": post_id,
#     #        "tweet_text": tweet_text,
#     #        "key_features": key_features,
#     #        "annotator_id": annotator["annotator_id"],
#     #        "label": annotator["label"],
#     #        #"target": ", ".join(annotator["target"]) if annotator["target"] else None
#     #    })

# df = pd.DataFrame(rows)

# # Display the first few rows of the dataframe
# df.tail()

In [9]:
from collections import Counter


def _build_row(post):
    """Flatten one dataset entry into a single dict (one DataFrame row).

    The row holds the post id, the tokens joined into one string, the tokens
    marked by at least one rationale, the sorted union of all annotator
    targets, the majority label, and one label/target pair per annotator.
    """
    tokens = post['post_tokens']
    rationales = post['rationales']

    # A token is a key feature when at least one rationale flags its position.
    if rationales:
        key_features = [
            token
            for position, token in enumerate(tokens)
            if any(rationale[position] for rationale in rationales)
        ]
    else:
        key_features = []

    per_annotator = {}
    unique_targets = set()
    votes = []
    # Annotators are numbered by their position (starting at 1).
    for number, annotator in enumerate(post['annotators'], start=1):
        targets = annotator['target']
        per_annotator[f"annotator_{number}_label"] = annotator['label']
        per_annotator[f"annotator_{number}_target"] = ", ".join(targets) if targets else None
        unique_targets.update(targets)
        votes.append(annotator['label'])

    # Majority vote over the annotator labels; None when there are no labels.
    voted_label = Counter(votes).most_common(1)[0][0] if votes else None

    return {
        "post_id": post['post_id'],
        "tweet_text": " ".join(tokens),
        "key_features": key_features,
        "target": ", ".join(sorted(unique_targets)),
        "label": voted_label,
        **per_annotator,  # annotator-related fields come after the shared ones
    }


rows = [_build_row(post) for post in data.values()]
df = pd.DataFrame(rows)

df.tail()

Unnamed: 0,post_id,tweet_text,key_features,target,label,annotator_1_label,annotator_1_target,annotator_2_label,annotator_2_target,annotator_3_label,annotator_3_target
20143,9989999_gab,if ur still on twitter tell carlton i said his...,"[alcoholic, drug, addicted, teenage, single, m...","Men, None, Other, Women",offensive,offensive,"Men, Women, Other",offensive,,normal,
20144,9990225_gab,when i first got on here and said i hate trump...,"[i, hate, trump, i, was, hating, niggers]","African, None, Other",offensive,offensive,African,offensive,"African, Other",normal,
20145,9991681_gab,was macht der moslem wenn der zion gegen seine...,[],"Islam, None, Other",normal,offensive,Islam,normal,Other,normal,
20146,9992513_gab,it is awful look at world demographics asians ...,"[asians, are, fucking, everywhere, white, geno...","Asian, Hispanic",hatespeech,hatespeech,Hispanic,hatespeech,Asian,offensive,Asian
20147,9998729_gab,the jewish globalist elite have only imported ...,"[imported, few, million, muslims, to, multicul...","African, Islam, Jewish",offensive,hatespeech,"African, Islam",offensive,"Islam, Jewish",offensive,"African, Islam, Jewish"


In [10]:
# Save the flattened dataset; index=True also writes the DataFrame index as a column.
df.to_csv("Data/dataset.csv", index=True)

In [11]:
# for a in data.items():
#     print(a.key())
#     break
# # a[1].keys()

In [12]:
# a[1]['annotators']

In [13]:
# a[1]

# Deploy

In [28]:
# Name of the Qdrant collection that the cells below create and fill.
index_name = "HateXplain_index"

In [29]:
# Create the collection.
# NOTE(review): presumably this fails if the collection already exists — confirm before re-running.
create_index(index_name)

In [30]:
# List the collections again to check that the new one is present.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='HateXplain_index_2'), CollectionDescription(name='test_index_'), CollectionDescription(name='HateXplain_index_3'), CollectionDescription(name='HateXplain_index'), CollectionDescription(name='HateXplain_index_4'), CollectionDescription(name='test_index'), CollectionDescription(name='HateXplain_index_1')]


In [17]:
## Upload data into index

In [19]:
# Filter data: keep only the fields that are uploaded as the point payload.
_PAYLOAD_KEYS = ('post_id', 'tweet_text', 'key_features', 'target', 'label')
data = [{key: record[key] for key in _PAYLOAD_KEYS} for record in rows]

In [25]:
def upsert_in_qdrant_collection(data_list, data_list_embeddings, ids):
    """Upsert one batch of points into the collection named by index_name.

    Args:
        data_list: Payload dicts, one per point.
        data_list_embeddings: Embedding vectors, one per point.
        ids: Point ids, one per point.

    Returns:
        True if the upsert call completed, False if it raised. The error is
        printed and not re-raised, so a failing batch does not abort a longer
        upload; the return value lets the caller notice the failure.
    """
    try:
        # NOTE(review): vectors exposing .tolist() (e.g. numpy arrays) are
        # converted to plain lists defensively — confirm whether Batch accepts
        # arrays directly.
        vectors = [
            vector.tolist() if hasattr(vector, "tolist") else vector
            for vector in data_list_embeddings
        ]
        qdrant_client.upsert(
            collection_name=index_name,
            points=Batch(
                ids=ids,
                vectors=vectors,
                payloads=data_list
            ),
        )
    except Exception as e:
        # The message now names this function (it used to name the caller).
        print(f"Exception in upsert_in_qdrant_collection {e}")
        return False
    return True

In [31]:
def _row_to_embedding_text(row):
    """Render the fields of one row as the text that gets embedded."""
    return (
        f"tweet_text: {row['tweet_text']}\n"
        f"key_features: {row['key_features']}\n"
        f"target: {row['target']}\n"
        f"label: {row['label']}\n"
        # WE add EXPLAINATIONS HERE
    )


def create_embeddings_and_upsert(batch_size=1000, records=None):
    """Embed the rows batch by batch and upsert each batch into Qdrant.

    Point ids are the 1-based positions of the rows. Each batch is handed to
    upsert_in_qdrant_collection together with its embeddings and ids.

    Args:
        batch_size: Number of rows embedded and uploaded per batch.
        records: Rows to upload, each with the keys 'tweet_text',
            'key_features', 'target' and 'label'. Defaults to the
            notebook-level `data` list.
    """
    if records is None:
        records = data

    print("Creating rows embeddings")
    ids = list(range(1, len(records) + 1))

    for start in tqdm(range(0, len(records), batch_size)):
        batch_records = records[start:start + batch_size]
        batch_ids = ids[start:start + batch_size]

        # One encode call per batch instead of one per row, so the model can
        # process the texts together. list() keeps the previous shape of the
        # argument: a list with one vector per row.
        texts = [_row_to_embedding_text(row) for row in batch_records]
        batch_embeddings = list(EMBEDDINGS_MODEL.encode(texts))

        # Call the Qdrant function
        upsert_in_qdrant_collection(batch_records, batch_embeddings, batch_ids)

In [None]:
# Embed and upload the rows in batches of 100 (instead of the function's default of 1000).
create_embeddings_and_upsert(batch_size=100)

Creating rows embeddings


 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                       | 173/202 [23:28<04:15,  8.82s/it]

## Query RAG

In [None]:
# Text to search for. NOTE(review): the empty string looks like a placeholder — fill it in before running.
user_input = ""
# Number of results requested from the search below.
top_k = 3
# Embed the query with the same model that produced the stored vectors.
embedding = EMBEDDINGS_MODEL.encode(user_input)

In [None]:
# `self` does not exist at notebook top level (this is not a method body), so
# `self.qdrant_client` raised NameError; the notebook-level client is used.
search_result = qdrant_client.search(
    collection_name=index_name, query_vector=embedding, limit=top_k
)
search_result