# Let us also check what RAG's can do for this project

Thing is, with the API alone, not all examples could be processed (prompt would have been too big). RAG could solve this problem...

In [None]:
!pip install python-dotenv pandas llama-index langchain langchain-community llama-index-embeddings-langchain  sentence-transformers

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.schema import TextNode
from langchain.embeddings import HuggingFaceEmbeddings
from collections import Counter

In [3]:
from functions import load_jsonl, combine_data, compute_f1

# The usual setup

In [4]:
train_file_path = 'data_germeval/train.jsonl'
dev_file_path = 'data_germeval/development.jsonl'
test_file_path = 'data_germeval/test.jsonl'
train_data = load_jsonl(train_file_path)
dev_data = load_jsonl(dev_file_path)

In [5]:
train_data_labeled = combine_data(train_data)
train_df = combine_data(train_data, dataframe = True)

# MAIN PART

# Let's get started with LlamaIndex

What is happening here? Basically, we transform our textdata into vector representations. Then, given a new text element, we retrieve the most similar text elements (by comparing vector similarity) and average over the labels of the most similar ones. 

This method relies on a huggingface model for the embedding. Also, we don't send our data to any LLM for it to predict a label based on the retrieved information. Hence, we don't get input form the LLM (though we might want it, depending on the performance of this code) and no API is needed.

In [6]:
def prepare_training_nodes(df):
    """
    Prepares our data for indexing.
    :param df: dataframe 
    :return: TextNode object of our dataframe
    """
    training_nodes = []
    
    for index, row in df.iterrows():
        text_content = f"""
        Example Text: {row['text']}
        """
        
        node = TextNode(
        text=text_content,
        metadata={
        'original_text': row['text'],
        'bin_maj_label': row['bin_maj_label'],
        'bin_one_label': row['bin_one_label'],
        'bin_all_label': row['bin_all_label'],
        'multi_maj_label': row['multi_maj_label'],
        'disagree_bin_label': row['disagree_bin_label'],
        'index': index
        }
        )
        training_nodes.append(node)
    
    return training_nodes

In [7]:
def load_index(load_path, embed_model):
    """
    Loads precomputed index
    :param load_path: str, path to precomputed index
    :param embed_model: model to use for embeddings
    :return: loaded index 
    """
    storage_context = StorageContext.from_defaults(persist_dir=load_path)
    index = load_index_from_storage(
        storage_context, 
        embed_model=embed_model
    )
    print(f"Index loaded from {load_path}!")
    return index

In [8]:
def create_reload_vector_index(nodes, model):
    """
    Creates or reloads a vector index using our nodes and LlamaIndex
    :param nodes: list of nodes 
    :param model: str, indicates model to use for embeddings
    :return: generated or reloaded index
    """
    if model == "Bert":
        save_path = "vector_index_BERT"
        embed_model = HuggingFaceEmbeddings(model_name="dbmdz/bert-base-german-uncased")
    else:
        save_path = "vector_index_multilingual-e5"
        embed_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
        
    if os.path.exists(save_path):
        print("Loading existing index...")
        return load_index(save_path, embed_model)
    else:
        print("Creating index...")
        index = VectorStoreIndex(
            nodes,
            embed_model=embed_model
        )
        
        index.storage_context.persist(persist_dir=save_path)
        print(f"Index saved to {save_path}!")
        
        return index

In [9]:
def retrieve_similar_examples(query_text, index, top_k=5):
    """
    Retrieves similar examples using LlamaIndex
    :param query_text: str, query text
    :param index: index to use to retrieve similar examples
    :param top_k: int, number of similar examples
    :return: list of similar examples
    """
    retriever = index.as_retriever(similarity_top_k=top_k)
    similar_nodes = retriever.retrieve(query_text)

    results = []
    for node in similar_nodes:
        results.append({
            'text': node.node.text,
            'metadata': node.node.metadata,
            'similarity_score': node.score
        })
    
    return results

In [10]:
def weighted_aggregation(value_list, similarity_scores):
    """
    computes the weighted average of value_list
    :param value_list: list of values 
    :param similarity_scores: list of similarity scores corresponding to the values
    :return: float 
    """
    return round(sum(val * score for val, score in zip(value_list, similarity_scores)) / sum(similarity_scores))

In [11]:
def majority_aggregation(value_list):
    """
    Computes the majority vote of value_list
    :param value_list: list of values
    :return: float
    """
    return Counter(value_list).most_common(1)[0][0]

In [12]:
def predict_from_similar_examples(similar_examples, aggregation='majority'):
    """
    Predict target value from similar examples
    :param similar_examples: list of similar examples
    :param aggregation: str, aggregation method
    :return: list of predictions
    """
    if not similar_examples:
        return None

    target_values = [[ex['metadata']['bin_maj_label'] for ex in similar_examples],
                     [ex['metadata']['bin_one_label'] for ex in similar_examples],
                     [ex['metadata']['bin_all_label'] for ex in similar_examples],
                     [ex['metadata']['multi_maj_label'] for ex in similar_examples],
                     [ex['metadata']['disagree_bin_label'] for ex in similar_examples]]
    similarity_scores = [ex['similarity_score'] for ex in similar_examples]
    
    if aggregation == 'weighted':
        return [weighted_aggregation(target_values, similarity_scores) for target_values in target_values]
    
    elif aggregation == 'majority':
        return [majority_aggregation(target_values) for target_values in target_values]
        

In [13]:
def run_rag_pipeline(df, test_texts, model, top_k=5, aggregation='majority'):
    """
    Complete RAG pipeline using LlamaIndex
    :param df: reference dataframe with text elements and labels
    :param test_texts: list of test texts
    :param model: str indicating which model to use for embeddings. If 'Bert', uses "dbmdz/bert-base-german-uncased", else "intfloat/multilingual-e5-large"
    :param top_k: int, number of similar examples
    :param aggregation: str, aggregation method
    :return list of dictionaries of the form: {'query_text': , 'prediction':, 'num_similar_examples': 'similar_examples': }  
    """
    print("Step 1: Preparing training nodes...")
    training_nodes = prepare_training_nodes(df)
    
    print("Step 2: Creating vector index...")
    index = create_reload_vector_index(training_nodes, model)
    
    print("Step 3: Making predictions...")
    predictions = []
    
    for test_text in test_texts:
        similar_examples = retrieve_similar_examples(test_text, index, top_k)
       
        prediction = predict_from_similar_examples(similar_examples, aggregation)
        predictions.append({
            'query_text': test_text,
            'prediction': prediction,
            'num_similar_examples': len(similar_examples),
            'similar_examples': similar_examples[:2] 
        })
    
    return predictions

# Let's test this

In [14]:
dev_data_labeled = combine_data(dev_data)
dev_df = combine_data(dev_data, dataframe = True)
test_data = [dev_data_labeled[i]['text'] for i in range(100)]

In [15]:
def data_to_dataframe(prediction):
    """
    takes the output of run_rag_pipeline and turns it into a pandas dataframe with fitting columns for comparison
    :param prediction: list of dictionaries (output of run_rag_pipeline)
    :return: dataframe
    """
    prediction = [{'text': p['query_text'], 
                'bin_maj_label': p['prediction'][0],
                'bin_one_label': p['prediction'][1],
                'bin_all_label': p['prediction'][2], 
                'multi_maj_label':p['prediction'][3],
                'disagree_bin_label': p['prediction'][4]} for p in prediction]
    return pd.DataFrame(prediction, columns=prediction[0].keys())

In [None]:
#do you want to compute the vector index?
run_this = False

# First with the "dbmdz/bert-base-german-uncased" model

In [None]:
if run_this:
    predictions_Bert_5 = run_rag_pipeline(train_df, test_data, "Bert", top_k = 5)
    predictions_Bert_5_df = data_to_dataframe(predictions_Bert_5)
    predictions_Bert_10 = run_rag_pipeline(train_df, test_data, "Bert", top_k=10)
    predictions_Bert_10_df = data_to_dataframe(predictions_Bert_10)
    predictions_Bert_20 = run_rag_pipeline(train_df, test_data, "Bert", top_k=20)
    predictions_Bert_20_df = data_to_dataframe(predictions_Bert_20)

In [17]:
print('with k = 5')
compute_f1(dev_df.iloc[:100], predictions_Bert_5_df)
print('with k = 10')
compute_f1(dev_df.iloc[:100], predictions_Bert_10_df) #performs best here
print('with k = 20')
compute_f1(dev_df.iloc[:100], predictions_Bert_20_df)

with k = 5
Dev set F1 score Bin Maj: 0.6504735195771761
Dev set F1 score Bin One: 0.5367466666666667
Dev set F1 score Bin All: 0.8348235294117647
Dev set F1 score Multi Maj: 0.6423015873015874
Dev set F1 score Disagree Bin: 0.632291543814129
with k = 10
Dev set F1 score Bin Maj: 0.7036011396011397
Dev set F1 score Bin One: 0.5529845755581921
Dev set F1 score Bin All: 0.8570327552986512
Dev set F1 score Multi Maj: 0.7003333333333334
Dev set F1 score Disagree Bin: 0.5868253968253968
with k = 20
Dev set F1 score Bin Maj: 0.7124080882352942
Dev set F1 score Bin One: 0.5031111111111111
Dev set F1 score Bin All: 0.8389743589743591
Dev set F1 score Multi Maj: 0.6386732919254658
Dev set F1 score Disagree Bin: 0.5875908099088692


Let's also check a different aggregation mode:

In [None]:
if run_this:
    predictions_Bert_5_w = run_rag_pipeline(train_df, test_data, "Bert", top_k=5, aggregation='weighted')
    predictions_Bert_5_w_df = data_to_dataframe(predictions_Bert_5_w)
    predictions_Bert_10_w = run_rag_pipeline(train_df, test_data, "Bert", top_k=10, aggregation='weighted')
    predictions_Bert_10_w_df = data_to_dataframe(predictions_Bert_10_w)
    predictions_Bert_20_w = run_rag_pipeline(train_df, test_data, "Bert", top_k=20, aggregation='weighted')
    predictions_Bert_20_w_df = data_to_dataframe(predictions_Bert_20_w)

In [19]:
print('with k = 5')
compute_f1(dev_df.iloc[:100], predictions_Bert_5_w_df)
print('with k = 10')
compute_f1(dev_df.iloc[:100], predictions_Bert_10_w_df) #perfroms best here
print('with k = 20')
compute_f1(dev_df.iloc[:100], predictions_Bert_20_w_df)

with k = 5
Dev set F1 score Bin Maj: 0.6504735195771761
Dev set F1 score Bin One: 0.5367466666666667
Dev set F1 score Bin All: 0.8348235294117647
Dev set F1 score Multi Maj: 0.29154429962052464
Dev set F1 score Disagree Bin: 0.632291543814129
with k = 10
Dev set F1 score Bin Maj: 0.7227236467236468
Dev set F1 score Bin One: 0.5471601703242712
Dev set F1 score Bin All: 0.8722285714285715
Dev set F1 score Multi Maj: 0.26563765182186233
Dev set F1 score Disagree Bin: 0.5791387559808612
with k = 20
Dev set F1 score Bin Maj: 0.7211887382690303
Dev set F1 score Bin One: 0.5031111111111111
Dev set F1 score Bin All: 0.8389743589743591
Dev set F1 score Multi Maj: 0.14662376779846661
Dev set F1 score Disagree Bin: 0.5800913242009131


# Now with the "intfloat/multilingual-e5-large" model

In [None]:
if run_this:
    predictions_e5_5 = run_rag_pipeline(train_df, test_data, "notBert", top_k = 5)
    predictions_e5_5_df = data_to_dataframe(predictions_e5_5)
    predictions_e5_10 = run_rag_pipeline(train_df, test_data, "notBert", top_k = 10)
    predictions_e5_10_df = data_to_dataframe(predictions_e5_10)
    predictions_e5_20 = run_rag_pipeline(train_df, test_data, "notBert", top_k = 20)
    predictions_e5_20_df = data_to_dataframe(predictions_e5_20)

In [21]:
print('with k = 5')
compute_f1(dev_df.iloc[:100], predictions_e5_5_df) #performs best here
print('with k = 10')
compute_f1(dev_df.iloc[:100], predictions_e5_10_df)
print('with k = 20')
compute_f1(dev_df.iloc[:100], predictions_e5_20_df)

with k = 5
Dev set F1 score Bin Maj: 0.7029249011857708
Dev set F1 score Bin One: 0.5956626506024096
Dev set F1 score Bin All: 0.8341621621621621
Dev set F1 score Multi Maj: 0.6412717536813922
Dev set F1 score Disagree Bin: 0.5785291425083918
with k = 10
Dev set F1 score Bin Maj: 0.7135690396559962
Dev set F1 score Bin One: 0.5674819541793075
Dev set F1 score Bin All: 0.8188235294117647
Dev set F1 score Multi Maj: 0.6366279069767442
Dev set F1 score Disagree Bin: 0.5483636363636363
with k = 20
Dev set F1 score Bin Maj: 0.6420833333333335
Dev set F1 score Bin One: 0.57984
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.6428571428571428
Dev set F1 score Disagree Bin: 0.6078817733990148


In [None]:
if run_this:
    predictions_e5_5_w = run_rag_pipeline(train_df, test_data, "notBert", top_k=5, aggregation='weighted')
    predictions_e5_5_w_df = data_to_dataframe(predictions_e5_5_w)
    predictions_e5_10_w = run_rag_pipeline(train_df, test_data, "notBert", top_k=10, aggregation='weighted')
    predictions_e5_10_w_df = data_to_dataframe(predictions_e5_10_w)
    predictions_e5_20_w = run_rag_pipeline(train_df, test_data, "notBert", top_k=20, aggregation='weighted')
    predictions_e5_20_w_df = data_to_dataframe(predictions_e5_20_w)

In [23]:
print('with k = 5')
compute_f1(dev_df.iloc[:100], predictions_e5_5_w_df)
print('with k = 10')
compute_f1(dev_df.iloc[:100], predictions_e5_10_w_df)
print('with k = 20')
compute_f1(dev_df.iloc[:100], predictions_e5_20_w_df) #performs best here

with k = 5
Dev set F1 score Bin Maj: 0.7029249011857708
Dev set F1 score Bin One: 0.5956626506024096
Dev set F1 score Bin All: 0.8341621621621621
Dev set F1 score Multi Maj: 0.5241558441558442
Dev set F1 score Disagree Bin: 0.5785291425083918
with k = 10
Dev set F1 score Bin Maj: 0.7056043773835186
Dev set F1 score Bin One: 0.5923187052598817
Dev set F1 score Bin All: 0.8188235294117647
Dev set F1 score Multi Maj: 0.5333224222585925
Dev set F1 score Disagree Bin: 0.567085346215781
with k = 20
Dev set F1 score Bin Maj: 0.6676488095238095
Dev set F1 score Bin One: 0.605739626227431
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.5302641509433962
Dev set F1 score Disagree Bin: 0.6079967360261118


# Conclusion

Pro: No issues with reproducibility here
Contra: Results are worse compared to the results from "comparisonAPI.ipynb".

# Let's see what happens when we not only use RAG but also the API

# Let's load the API key

In [24]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY_25")
print(f"API Key loaded: {OPENAI_API_KEY is not None}")

API Key loaded: True
