In [None]:
!pip install python-dotenv pandas llama-index langchain langchain-community llama-index-embeddings-langchain  sentence-transformers llama-index-llms-openai deepseek

In [1]:
from dotenv import load_dotenv
from llama_index.core import KeywordTableIndex, Settings, PromptTemplate
from openai import OpenAI as OpenAIClient 
from llama_index.llms.openai import OpenAI as LlamaOpenAI
import os
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.schema import TextNode
from langchain.embeddings import HuggingFaceEmbeddings
from pathlib import Path
import pandas as pd

In [2]:
from functions import load_jsonl, combine_data, extract_dict_from_response, compute_f1, check_df, find_best_model

# Global Variables 

In [3]:
# --- Input data (JSONL files) ---
train_file_path = Path('data_germeval/train.jsonl')
dev_file_path = Path('data_germeval/development.jsonl')

# --- Hugging Face identifiers of the two embedding models ---
bert_model = "dbmdz/bert-base-german-uncased"
multilingual_e5_model = "intfloat/multilingual-e5-large"

# --- Directories the indexes are persisted to / reloaded from ---
vector_index_bert_path = Path("vector_index_BERT")
vector_index_multilingual_e5_path = Path("vector_index_multilingual-e5")
keyword_index_save_path = Path("keyword_index")

# --- API keys: load_dotenv() is called first, then the keys are read from the environment ---
# Both values are None when the corresponding variable is not set.
load_dotenv()
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY_25")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY_25")

# --- LLM back ends ---
llm_model_openAI = "gpt-3.5-turbo"
llm_model_deepseek = "deepseek-chat"
deepseek_url = "https://api.deepseek.com/v1"

# Guards the `if run_this:` cells below, i.e. the sweeps that call the LLM APIs.
run_this = True

# The usual setup

In [17]:
# Training split: raw records, then the two representations produced by combine_data
# (default form and, with dataframe=True, the form stored in train_df).
train_data = load_jsonl(train_file_path)
train_data_labeled = combine_data(train_data)
train_df = combine_data(train_data, dataframe = True)

# Development split, processed the same way.
dev_data = load_jsonl(dev_file_path)
dev_data_labeled = combine_data(dev_data)
dev_df = combine_data(dev_data, dataframe = True)

# Texts to classify: the 'text' field of development items 0..99.
# The evaluation cells compare against dev_df.iloc[:100], so both 100s must change together.
# NOTE(review): presumably dev_data_labeled and dev_df share the same item order — confirm in combine_data.
test_data = [dev_data_labeled[i]['text'] for i in range(100)]

# MAIN PART

# Index Setup

In [4]:
def prepare_training_nodes(df):
    """
    Converts every row of the labelled dataframe into a TextNode for indexing.
    :param df: dataframe with a 'text' column and the five label columns
    :return: list of TextNode objects, one per dataframe row
    """
    label_columns = (
        'bin_maj_label',
        'bin_one_label',
        'bin_all_label',
        'multi_maj_label',
        'disagree_bin_label',
    )

    def row_to_node(row_index, row):
        # Metadata keys are inserted in a fixed order: text, labels, row index.
        node_metadata = {'original_text': row['text']}
        for column in label_columns:
            node_metadata[column] = row[column]
        node_metadata['index'] = row_index
        # The leading/trailing newline and indentation are part of the node text.
        return TextNode(
            text=f"\n        Example Text: {row['text']}\n        ",
            metadata=node_metadata,
        )

    return [row_to_node(row_index, row) for row_index, row in df.iterrows()]

In [5]:
def load_vector_index(load_path, embed_model):
    """
    Restores a previously persisted index from disk.
    :param load_path: str, directory the index was persisted to
    :param embed_model: embedding model handed to the loaded index
    :return: the index read from load_path
    """
    loaded_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=load_path),
        embed_model=embed_model,
    )
    print(f"Index loaded from {load_path}!")
    return loaded_index

In [6]:
def create_reload_vector_index(nodes, model_name):
    """
    Returns the vector index for the chosen embedding model, building and
    persisting it when no saved copy exists yet.
    :param nodes: list of nodes to index when the index has to be built
    :param model_name: str, "Bert" selects the German BERT model; any other value selects multilingual-e5
    :return: generated or reloaded index
    """
    use_bert = model_name == "Bert"
    hf_model_id = bert_model if use_bert else multilingual_e5_model
    index_dir = vector_index_bert_path if use_bert else vector_index_multilingual_e5_path
    embedder = HuggingFaceEmbeddings(model_name=hf_model_id)

    # Build-and-save branch first; the reload branch is the fall-through.
    if not os.path.exists(index_dir):
        print("Creating index...")
        fresh_index = VectorStoreIndex(nodes, embed_model=embedder)
        fresh_index.storage_context.persist(persist_dir=index_dir)
        print(f"Index saved to {index_dir}!")
        return fresh_index

    print("Loading existing index...")
    return load_vector_index(index_dir, embedder)

In [7]:
def load_keyword_index(load_path):
    """
    Restores a previously persisted index from disk (no embedding model is passed).
    :param load_path: str, directory the index was persisted to
    :return: the index read from load_path
    """
    persisted_context = StorageContext.from_defaults(persist_dir=load_path)
    loaded_index = load_index_from_storage(persisted_context)
    print(f"Index loaded from {load_path}!")
    return loaded_index

In [8]:
def create_reload_keyword_index(nodes):
    """
    Creates or reloads a keyword table index using our nodes and LlamaIndex.

    Side effect: sets the global ``Settings.llm`` to the configured OpenAI model
    before the index is created or loaded.
    :param nodes: list of nodes, only used when the index has to be built
    :return: generated or reloaded keyword index
    """
    # NOTE(review): presumably KeywordTableIndex picks up Settings.llm for keyword
    # extraction — confirm against the LlamaIndex documentation.
    Settings.llm = LlamaOpenAI(api_key=OPENAI_API_KEY, model=llm_model_openAI)

    if os.path.exists(keyword_index_save_path):
        print("Loading existing keyword index...")
        # Reuse the shared loader, mirroring create_reload_vector_index.
        return load_keyword_index(keyword_index_save_path)

    print("Creating keyword index...")
    index = KeywordTableIndex(nodes)

    index.storage_context.persist(persist_dir=keyword_index_save_path)
    print(f"Keyword index saved to {keyword_index_save_path}!")

    return index

# API Prompt Template Part

In [9]:
# Prompt template used by build_custom_prompt(), which fills the {context_str}
# slot (retrieved labelled examples) and the {query_str} slot (text to classify).
# The doubled braces around the answer dictionary are presumably there so the
# dictionary is emitted literally instead of being read as a template slot.
# NOTE(review): "(no spaces or newlines!)" conflicts with the sample dictionary,
# which contains spaces — confirm extract_dict_from_response accepts both forms.
custom_prompt = PromptTemplate("""
    **Task:** Predict sexism annotation labels for a new text based on the following label definitions.
    
    **Label Definitions:**
    - 'bin_maj_label': A majority of annotators found the text to be sexist.
    - 'bin_one_label': At least one annotator found the text to be sexist.
    - 'bin_all_label': All annotators found the text to be sexist.
    - 'multi_maj_label': The multi-class label (integer from 0 to 4) that the most annotators assigned.
    - 'disagree_bin_label': The annotators disagreed on the binary (sexist/not sexist) classification.
    
    **Examples from the Dataset:**
    '{context_str}'
    
    **Text to Analyze:**
    '{query_str}'
    
    **Instructions:**
    Analyze the text above and predict its labels. Return ONLY a valid Python dictionary in exactly the following format 
    (no spaces or newlines!). <value> must always be an integer::
    {{'bin_maj_label': <value>, 'bin_one_label': <value>, 'bin_all_label': <value>, 'multi_maj_label': <value>, 'disagree_bin_label': <value>}}
    """)

In [10]:
def build_custom_prompt(query_text, top_k_examples):
    """
    Fills the prompt template with the query text and the retrieved examples.
    :param query_text: str, text to analyze
    :param top_k_examples: retrieved nodes, each exposing .text and .metadata
    :return: the formatted prompt
    """
    example_lines = []
    for position, node in enumerate(top_k_examples, start=1):
        labels = node.metadata
        # One line per example: its text followed by the five gold labels.
        example_lines.append(
            f"Example {position}: {node.text} --> bin_maj_label : {labels['bin_maj_label']}, "
            f"bin_one_label: {labels['bin_one_label']}, "
            f"bin_all_label: {labels['bin_all_label']}, "
            f"multi_maj_label: {labels['multi_maj_label']}, "
            f"disagree_bin_label: {labels['disagree_bin_label']}."
        )
    return custom_prompt.format(query_str=query_text, context_str="\n".join(example_lines))
    

In [11]:
def retrieve_with_llm_open_ai(query_text, index, top_k=5):
    """
    Retrieves similar labelled examples from the index, builds the custom prompt
    and asks the OpenAI model to label the query text.
    :param query_text: str, text to analyze
    :param index: index object, index to retrieve similar examples from.
    :param top_k: int, number of examples to compare
    :return: str, raw text of the model's completion
    """
    # Pass the configured model explicitly so that changing llm_model_openAI
    # affects this call too, as it already does for the keyword index.
    llm = LlamaOpenAI(api_key=OPENAI_API_KEY, model=llm_model_openAI)

    retriever = index.as_retriever(similarity_top_k=top_k)
    retrieved_nodes = retriever.retrieve(query_text)

    prompt = build_custom_prompt(query_text, retrieved_nodes)
    response = llm.complete(prompt)

    return response.text

In [12]:
def retrieve_with_llm_deepseek(query_text, index, top_k=5):
    """
    Retrieves similar labelled examples from the index, builds the custom prompt
    and sends it to DeepSeek as a single user message.
    :param query_text: str, text to analyze
    :param index: index object, index to retrieve similar examples from.
    :param top_k: int, number of examples to compare
    :return: str, content of the first returned choice
    """
    deepseek_client = OpenAIClient(api_key=DEEPSEEK_API_KEY, base_url=deepseek_url)

    similar_nodes = index.as_retriever(similarity_top_k=top_k).retrieve(query_text)
    chat_messages = [{"role": "user", "content": build_custom_prompt(query_text, similar_nodes)}]

    # No temperature is passed; only the response length is capped.
    completion = deepseek_client.chat.completions.create(
        model=llm_model_deepseek,
        messages=chat_messages,
        max_tokens=150,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [13]:
def retrieve_with_llm(query_text, index, llm_name, top_k=5):
    """
    Dispatches the retrieval + labelling request to the selected LLM back end.
    :param query_text: str, text to analyze
    :param index: index object, index to retrieve similar examples from.
    :param llm_name: str, llm to use, either "OpenAI" or "DeepSeek" (matched case-insensitively)
    :param top_k: int, number of examples to compare
    :return: str, response
    :raises ValueError: if llm_name names neither back end
    """
    # Case-insensitive match so spellings such as "OpenAi" select the intended back end.
    normalized = llm_name.lower() if isinstance(llm_name, str) else llm_name

    if normalized == 'openai':
        return retrieve_with_llm_open_ai(query_text, index, top_k)
    if normalized == 'deepseek':
        return retrieve_with_llm_deepseek(query_text, index, top_k)

    # Fail loudly: returning None here would only surface later, in the caller's
    # response parsing / retry loop.
    raise ValueError(f"llm_name has to be either OpenAI or DeepSeek, got {llm_name!r}")


In [14]:
def get_proper_response(query_text, index, llm_name, top_k=5, max_attempts=10):
    """
    Queries the LLM until its answer can be parsed into a label dictionary.
    :param query_text: str, text to analyze
    :param index: index object, index to retrieve similar examples from.
    :param llm_name: str, name of llm to use, has to be either "OpenAI" or "DeepSeek".
    :param top_k: int, number of examples to compare
    :param max_attempts: int, upper bound on LLM calls for this text, so a model
        that never answers in the expected format cannot loop (and bill) forever
    :return: dictionary of predicted labels
    :raises ValueError: if max_attempts is smaller than 1
    :raises RuntimeError: if no attempt produced a parsable answer
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for _ in range(max_attempts):
        # extract_dict_from_response yields (success flag, parsed dictionary).
        response = extract_dict_from_response(retrieve_with_llm(query_text, index, llm_name, top_k))
        if response[0]:
            return response[1]

    raise RuntimeError(
        f"No valid label dictionary returned by {llm_name} after {max_attempts} attempts"
    )
        

In [15]:
def run_rag_api_pipeline(df, test_texts, model_name, index_name, llm_name, top_k=5):
    """
    Complete RAG pipeline using LlamaIndex
    :param df: reference dataframe with text elements and labels
    :param test_texts: list of test texts
    :param model_name: str indicating which model to use for embeddings. If 'Bert', uses "dbmdz/bert-base-german-uncased", else "intfloat/multilingual-e5-large". Ignored for the keyword index.
    :param index_name: str, name of index to be used, either VectorStoreIndex or KeywordStoreIndex
    :param llm_name: str, name of llm to use, has to be either "OpenAI" or "DeepSeek".
    :param top_k: int, number of similar examples
    :return: dataframe with a 'text' column followed by the predicted label columns;
        a dataframe with only an empty 'text' column when test_texts is empty
    :raises ValueError: if index_name is not one of the two supported names
    """
    # Validate up front: otherwise `index` would be unbound below and the failure
    # would only show up as a NameError after the nodes have been prepared.
    if index_name not in ('VectorStoreIndex', 'KeywordStoreIndex'):
        raise ValueError(
            f"index_name has to be either VectorStoreIndex or KeywordStoreIndex, got {index_name!r}"
        )

    print("Step 1: Preparing training nodes...")
    training_nodes = prepare_training_nodes(df)

    print("Step 2: Creating vector index...")
    if index_name == 'VectorStoreIndex':
        index = create_reload_vector_index(training_nodes, model_name)
    else:
        index = create_reload_keyword_index(training_nodes)

    print("Step 3: Making predictions...")
    predictions = []

    for test_text in test_texts:
        # 'text' goes first so it becomes the first column of the result.
        prediction_dict = {'text': test_text}
        prediction_dict.update(get_proper_response(test_text, index, llm_name, top_k))
        predictions.append(prediction_dict)

    if not predictions:
        # Nothing to predict: return an empty frame instead of failing on predictions[0].
        return pd.DataFrame(columns=['text'])

    # The first prediction decides which columns are kept and in what order.
    return pd.DataFrame(predictions, columns=list(predictions[0].keys()))

# Let's test

# First DeepSeek

In [None]:
# DeepSeek sweep: BERT vectors, e5 vectors and the keyword index, each with top_k 5, 10 and 20.
# According to the author, all of this cost less than 0.5$ and took less than 90min.
if run_this:
    def _deepseek_run(model_name, index_name, k):
        # Every run shares the training frame, the test texts and the DeepSeek back end.
        return run_rag_api_pipeline(train_df, test_data, model_name, index_name, 'DeepSeek', top_k=k)

    prds_Bert_VSI_DS_5 = _deepseek_run('Bert', 'VectorStoreIndex', 5)
    prds_Bert_VSI_DS_10 = _deepseek_run('Bert', 'VectorStoreIndex', 10)
    prds_Bert_VSI_DS_20 = _deepseek_run('Bert', 'VectorStoreIndex', 20)
    prds_e5_VSI_DS_5 = _deepseek_run('e5', 'VectorStoreIndex', 5)
    prds_e5_VSI_DS_10 = _deepseek_run('e5', 'VectorStoreIndex', 10)
    prds_e5_VSI_DS_20 = _deepseek_run('e5', 'VectorStoreIndex', 20)
    prds_keyword_DS_5 = _deepseek_run('', 'KeywordStoreIndex', 5)
    prds_keyword_DS_10 = _deepseek_run('', 'KeywordStoreIndex', 10)
    # The misspelled variable name is kept so any existing reference keeps working.
    prds_kewyowrd_DS_20 = _deepseek_run('', 'KeywordStoreIndex', 20)
    deepseek = [
        prds_Bert_VSI_DS_5, prds_Bert_VSI_DS_10, prds_Bert_VSI_DS_20,
        prds_e5_VSI_DS_5, prds_e5_VSI_DS_10, prds_e5_VSI_DS_20,
        prds_keyword_DS_5, prds_keyword_DS_10, prds_kewyowrd_DS_20,
    ]

In [39]:
# Print the dev-set F1 scores of every DeepSeek run, in the order of the `deepseek` list.
# `deepseek` only exists if the sweep cell ran with run_this = True.
# dev_df.iloc[:100] presumably lines up with the 100 texts in test_data — verify the ordering.
for df in deepseek:
    compute_f1(dev_df.iloc[:100], df)
    print('\n')

Dev set F1 score Bin Maj: 0.74512
Dev set F1 score Bin One: 0.8197839135654261
Dev set F1 score Bin All: 0.8675411446818964
Dev set F1 score Multi Maj: 0.6296413975279106
Dev set F1 score Disagree Bin: 0.753844544408411


Dev set F1 score Bin Maj: 0.7827267824398664
Dev set F1 score Bin One: 0.8197839135654261
Dev set F1 score Bin All: 0.88
Dev set F1 score Multi Maj: 0.666645189435887
Dev set F1 score Disagree Bin: 0.713521275472495


Dev set F1 score Bin Maj: 0.801303635768097
Dev set F1 score Bin One: 0.7792948717948718
Dev set F1 score Bin All: 0.8693333333333334
Dev set F1 score Multi Maj: 0.6964141414141415
Dev set F1 score Disagree Bin: 0.6863054187192118


Dev set F1 score Bin Maj: 0.8183361823361824
Dev set F1 score Bin One: 0.7538310893512852
Dev set F1 score Bin All: 0.8619780219780221
Dev set F1 score Multi Maj: 0.6897341628959276
Dev set F1 score Disagree Bin: 0.6540350877192982


Dev set F1 score Bin Maj: 0.8001099764336214
Dev set F1 score Bin One: 0.7314614121510672
Dev

In [40]:
# Report the best-performing DeepSeek run and its F1 scores, using the same
# first-100 slice of dev_df as the per-run report.
find_best_model(dev_df.iloc[:100], deepseek)

The best performing model is Bert_VSI_10
Dev set F1 score Bin Maj: 0.7827267824398664
Dev set F1 score Bin One: 0.8197839135654261
Dev set F1 score Bin All: 0.88
Dev set F1 score Multi Maj: 0.666645189435887
Dev set F1 score Disagree Bin: 0.713521275472495


# Now OpenAI

In [None]:
# OpenAI sweep: the same nine configurations as the DeepSeek sweep.
if run_this:
    def _openai_run(model_name, index_name, k):
        # Every run shares the training frame, the test texts and the OpenAI back end.
        return run_rag_api_pipeline(train_df, test_data, model_name, index_name, 'OpenAI', top_k=k)

    prds_Bert_VSI_OA_5 = _openai_run("Bert", 'VectorStoreIndex', 5)
    prds_Bert_VSI_OA_10 = _openai_run("Bert", 'VectorStoreIndex', 10)
    prds_Bert_VSI_OA_20 = _openai_run("Bert", 'VectorStoreIndex', 20)
    prds_e5_VSI_OA_5 = _openai_run("e5", 'VectorStoreIndex', 5)
    prds_e5_VSI_OA_10 = _openai_run("e5", 'VectorStoreIndex', 10)
    prds_e5_VSI_OA_20 = _openai_run("e5", 'VectorStoreIndex', 20)
    prds_keyword_OA_5 = _openai_run("", 'KeywordStoreIndex', 5)
    prds_keyword_OA_10 = _openai_run("", 'KeywordStoreIndex', 10)
    # The misspelled variable name is kept so any existing reference keeps working.
    prds_kewyowrd_OA_20 = _openai_run("", 'KeywordStoreIndex', 20)
    openai = [
        prds_Bert_VSI_OA_5, prds_Bert_VSI_OA_10, prds_Bert_VSI_OA_20,
        prds_e5_VSI_OA_5, prds_e5_VSI_OA_10, prds_e5_VSI_OA_20,
        prds_keyword_OA_5, prds_keyword_OA_10, prds_kewyowrd_OA_20,
    ]

In [19]:
# Print the dev-set F1 scores of every OpenAI run, in the order of the `openai` list.
# `openai` only exists if the sweep cell ran with run_this = True.
# dev_df.iloc[:100] presumably lines up with the 100 texts in test_data — verify the ordering.
for df in openai:
    compute_f1(dev_df.iloc[:100], df)
    print('\n')

Dev set F1 score Bin Maj: 0.7413839708068659
Dev set F1 score Bin One: 0.6660726274031126
Dev set F1 score Bin All: 0.8619780219780221
Dev set F1 score Multi Maj: 0.6910900745840505
Dev set F1 score Disagree Bin: 0.6085714285714287


Dev set F1 score Bin Maj: 0.7484918495700167
Dev set F1 score Bin One: 0.676883116883117
Dev set F1 score Bin All: 0.907985347985348
Dev set F1 score Multi Maj: 0.6943574990410433
Dev set F1 score Disagree Bin: 0.632291543814129


Dev set F1 score Bin Maj: 0.7342730953257268
Dev set F1 score Bin One: 0.708684343179756
Dev set F1 score Bin All: 0.8466968325791855
Dev set F1 score Multi Maj: 0.6893650793650794
Dev set F1 score Disagree Bin: 0.6288284747243149


Dev set F1 score Bin Maj: 0.6612087098207674
Dev set F1 score Bin One: 0.6492613807657522
Dev set F1 score Bin All: 0.8669738863287252
Dev set F1 score Multi Maj: 0.6627702927478376
Dev set F1 score Disagree Bin: 0.5911940459497358


Dev set F1 score Bin Maj: 0.6088203712225801
Dev set F1 score Bin On

In [20]:
# Report the best-performing OpenAI run and its F1 scores, using the same
# first-100 slice of dev_df as the per-run report.
find_best_model(dev_df.iloc[:100], openai)

The best performing model is Bert_VSI_10
Dev set F1 score Bin Maj: 0.7484918495700167
Dev set F1 score Bin One: 0.676883116883117
Dev set F1 score Bin All: 0.907985347985348
Dev set F1 score Multi Maj: 0.6943574990410433
Dev set F1 score Disagree Bin: 0.632291543814129
