# Let us also check what RAG can do for this project - VectorStoreIndex Version

With the API alone, not all training examples could be processed, because the prompt would have been too big. RAG could solve this problem: it lets us scan all examples from the training data and retrieve only the most relevant ones.

In [None]:
!pip install python-dotenv pandas llama-index langchain langchain-community llama-index-embeddings-langchain  sentence-transformers llama-index-llms-openai

In [151]:
import os
import pandas as pd
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.schema import TextNode
from langchain.embeddings import HuggingFaceEmbeddings
from collections import Counter
from pathlib import Path

In [152]:
from functions import load_jsonl, combine_data, compute_f1

# Global Variables

In [153]:
# --- Input data (JSONL splits in the data_germeval folder) ---
train_file_path = Path('data_germeval', 'train.jsonl')
dev_file_path = Path('data_germeval', 'development.jsonl')

# --- Names of the Hugging Face models used for the embeddings ---
bert_model = "dbmdz/bert-base-german-uncased"
multilingual_e5_model = "intfloat/multilingual-e5-large"

# --- Folders the vector indexes are persisted to / reloaded from ---
vector_index_bert_path = Path("vector_index_BERT")
vector_index_multilingual_e5_path = Path("vector_index_multilingual-e5")

# Switch for the expensive cells further down: they only call
# run_rag_pipeline when this is True.
run_this = True

# The usual setup

In [154]:
# Training split: raw records, combine_data's default output, and its
# DataFrame output.
train_data = load_jsonl(train_file_path)
train_data_labeled = combine_data(train_data)
train_df = combine_data(train_data, dataframe=True)

# Development split, processed the same way.
dev_data = load_jsonl(dev_file_path)
dev_data_labeled = combine_data(dev_data)
dev_df = combine_data(dev_data, dataframe=True)

# Query texts: the first 100 development examples (the evaluation cells
# compare against dev_df.iloc[:100]).
test_data = [dev_data_labeled[pos]['text'] for pos in range(100)]

# MAIN PART

# Let's get started with VectorStoreIndex

What is happening here? Basically, we transform our text data into vector representations. Then, given a new text element, we retrieve the most similar text elements (by comparing vector similarity) and take a majority vote over the labels of the most similar ones. 

VectorStoreIndex (here) relies on a Hugging Face model for the embedding. We don't send our data to any LLM to predict a label based on the retrieved information. Hence, we don't get input from the LLM (though we might want it, depending on the performance of this code), and no API key is needed.

In [155]:
def prepare_training_nodes(df):
    """
    Wraps every row of the dataframe in a TextNode so that it can be indexed.
    :param df: dataframe with a 'text' column and the five label columns
    :return: list of TextNode objects, one per dataframe row
    """
    label_keys = ('bin_maj_label', 'bin_one_label', 'bin_all_label',
                  'multi_maj_label', 'disagree_bin_label')

    def _row_to_node(row_index, row):
        # Metadata carries the raw text, all labels and the dataframe index,
        # so a retrieved node can be mapped back to its labels.
        node_metadata = {'original_text': row['text']}
        node_metadata.update((key, row[key]) for key in label_keys)
        node_metadata['index'] = row_index
        return TextNode(
            # The surrounding whitespace is part of the node text;
            # data_to_dataframe strips the "Example Text:" prefix again.
            text=f"\n        Example Text: {row['text']}\n        ",
            metadata=node_metadata,
        )

    return [_row_to_node(row_index, row) for row_index, row in df.iterrows()]

In [156]:
def load_index(load_path, embed_model):
    """
    Restores a previously persisted index from disk.
    :param load_path: path of the directory the index was persisted to
    :param embed_model: model to use for embeddings
    :return: the loaded index
    """
    restored_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=load_path),
        embed_model=embed_model,
    )
    print(f"Index loaded from {load_path}!")
    return restored_index

In [157]:
def create_reload_vector_index(nodes, model):
    """
    Returns the vector index for the chosen embedding model, building and
    persisting it only if no saved copy exists yet.
    :param nodes: list of nodes; only used when the index has to be built
    :param model: str, "Bert" selects bert_model, anything else selects multilingual_e5_model
    :return: generated or reloaded index
    """
    if model == "Bert":
        model_name, save_path = bert_model, vector_index_bert_path
    else:
        model_name, save_path = multilingual_e5_model, vector_index_multilingual_e5_path
    embed_model = HuggingFaceEmbeddings(model_name=model_name)

    if not os.path.exists(save_path):
        print("Creating index...")
        index = VectorStoreIndex(nodes, embed_model=embed_model)
        # Persist right away so later calls can reload instead of re-embedding.
        index.storage_context.persist(persist_dir=save_path)
        print(f"Index saved to {save_path}!")
        return index

    # A saved index exists: it is reloaded as-is and `nodes` is ignored.
    print("Loading existing index...")
    return load_index(save_path, embed_model)

In [158]:
def retrieve_similar_examples(query_text, index, top_k=5):
    """
    Looks up the examples most similar to the query text in the index.
    :param query_text: str, query text
    :param index: index used to retrieve similar examples
    :param top_k: int, number of similar examples to retrieve
    :return: list of dicts with the keys 'text', 'metadata' and 'similarity_score'
    """
    hits = index.as_retriever(similarity_top_k=top_k).retrieve(query_text)
    return [
        {
            'text': hit.node.text,
            'metadata': hit.node.metadata,
            'similarity_score': hit.score,
        }
        for hit in hits
    ]

In [160]:
def majority_aggregation(value_list):
    """
    Computes the majority vote of value_list. This makes the most sense as we are dealing with categorical data.
    :param value_list: iterable of hashable values (e.g. labels)
    :return: the most frequent value, with the same type as the elements;
             on a tie the value that appears first in value_list wins;
             None if value_list is empty
    """
    counts = Counter(value_list)
    # most_common(1) is an empty list for empty input, so guard before indexing.
    if not counts:
        return None
    return counts.most_common(1)[0][0]

In [161]:
def predict_from_similar_examples(similar_examples):
    """
    Predicts every target label by majority vote over the retrieved examples.
    :param similar_examples: list of similar examples, each with a 'metadata' dict holding the labels
    :return: list of five predictions in the order bin_maj, bin_one, bin_all,
             multi_maj, disagree_bin; None if there are no similar examples
    """
    if not similar_examples:
        return None

    label_keys = ('bin_maj_label', 'bin_one_label', 'bin_all_label',
                  'multi_maj_label', 'disagree_bin_label')

    predictions = []
    for key in label_keys:
        # One vote per retrieved example for this label.
        votes = [example['metadata'][key] for example in similar_examples]
        predictions.append(majority_aggregation(votes))
    return predictions
        

In [162]:
def data_to_dataframe(prediction):
    """
    takes the output of run_rag_pipeline and turns it into a pandas dataframe with fitting columns for comparison
    :param prediction: list of dictionaries of the form:
        {'query_text': , 'prediction': , 'num_similar_examples': , 'similar_examples': }.
        'prediction' is a list of five labels, or None when nothing was retrieved.
        'similar_examples' may hold fewer than two entries.
    :return: dataframe with the columns text, the five label columns,
             similar_text_1 and similar_text_2. Missing labels or similar texts
             are filled with None. An empty input gives an empty dataframe
             with the same columns.
    """
    label_columns = ('bin_maj_label', 'bin_one_label', 'bin_all_label',
                     'multi_maj_label', 'disagree_bin_label')
    similar_columns = ('similar_text_1', 'similar_text_2')
    # Wrapper text that prepare_training_nodes puts in front of every example.
    node_prefix = "\n        Example Text:"

    rows = []
    for p in prediction:
        row = {'text': p['query_text']}

        # predict_from_similar_examples returns None when nothing was retrieved.
        labels = p['prediction']
        for position, column in enumerate(label_columns):
            row[column] = None if labels is None else labels[position]

        # Fewer than two neighbours are possible (e.g. top_k=1 or no hits).
        similar = p['similar_examples']
        for position, column in enumerate(similar_columns):
            if position < len(similar):
                row[column] = similar[position]['text'].replace(node_prefix, "")
            else:
                row[column] = None

        rows.append(row)

    # The column list is explicit so that an empty input still has the full schema.
    return pd.DataFrame(rows, columns=['text', *label_columns, *similar_columns])

In [163]:
def run_rag_pipeline(df, test_texts, model, top_k=5):
    """
    Runs the whole retrieval pipeline: builds the nodes, creates or reloads the
    vector index, and predicts the labels of every test text from its nearest
    neighbours.
    :param df: reference dataframe with text elements and labels
    :param test_texts: list of test texts
    :param model: str indicating which model to use for embeddings; 'Bert' selects
                  the German BERT model, any other value the multilingual e5 model
    :param top_k: int, number of similar examples to retrieve per test text
    :return: dataframe with one row per test text (see data_to_dataframe)
    """
    print("Step 1: Preparing training nodes...")
    nodes = prepare_training_nodes(df)

    print("Step 2: Creating/Reloading vector index...")
    vector_index = create_reload_vector_index(nodes, model)

    print("Step 3: Making predictions...")

    def _predict_one(query):
        neighbours = retrieve_similar_examples(query, vector_index, top_k)
        return {
            'query_text': query,
            'prediction': predict_from_similar_examples(neighbours),
            'num_similar_examples': len(neighbours),
            # Only the two closest examples are kept for later inspection.
            'similar_examples': neighbours[:2],
        }

    return data_to_dataframe([_predict_one(query) for query in test_texts])

# Let's see how well the indexing worked:

In [None]:
if run_this:
    # Quick look at the retrieval quality: first ten query texts only,
    # once per embedding model.
    sample_texts = test_data[:10]
    test_index_Bert = run_rag_pipeline(train_df, sample_texts, "Bert")
    test_index_e5 = run_rag_pipeline(train_df, sample_texts, "notBert")

In [148]:
# Show every query text followed by its two nearest neighbours (BERT index).
for _, sample in test_index_Bert.iterrows():
    print('Text', sample['text'], '\n Retrieved "similar" ones',
          sample['similar_text_1'], sample['similar_text_2'], sep='\n')

Text
Das ist ein richtig gutes Portrait von Greta!

 Retrieved "similar" ones
 was für ein foto!  ein dicker otto waalkes im dirndl und putin?
        
 Und der Missbrauch von muslimischen Frauen ist wurscht? Was für ein menschenverachtendes Posting!
        
Text
bei den dort üblichen kalaschnikows wärs eher eine zahl mit ein paar nullen mehr ...

 Retrieved "similar" ones
 Mit einer Frau an der Spitze würde es nur noch diese bequemen schlabbrigen Baumwollzelte im Angebot geben.  Obwohl ... Modell "Lieschen Modermöse" hat auch so seinen Reiz.
        
 Man sollte schon noch unterscheiden zwischen Männern und sogenannten "Fotzenknechten". Die auf den  Bildern da oben sind zweites.  Echte Männer brauchen keine Pailettenhoodies, keine rosa Cowboyboots und auch keine 18Zoll Alufelgen von ihrem Audi im Ohrwaschellapperl.
        
Text
Nein.Es war eine Single-Börse,  die den ganzen deutschsprachigen Raum umfasst- ich glaube Lovescout. Hat aber nur zu virtuellen Verbindungen geführt- eben we

In [149]:
# Show every query text followed by its two nearest neighbours (e5 index).
for _, sample in test_index_e5.iterrows():
    print('Text', sample['text'], '\n Retrieved "similar" ones',
          sample['similar_text_1'], sample['similar_text_2'], sep='\n')

Text
Das ist ein richtig gutes Portrait von Greta!

 Retrieved "similar" ones
 Also dass die Veilchen ja fast noch wärmer als die Grünen spielen können ist ja echt ein Kunststück!
        
 Jungfrau Greta vom Ikealand Je mehr sie sich da hineinsteigert um so mehr CO2 sondert sie ab.
        
Text
bei den dort üblichen kalaschnikows wärs eher eine zahl mit ein paar nullen mehr ...

 Retrieved "similar" ones
 In Österreich sind es eher die Neonazis mit Kalashnikovs die sich auf solchen .. Festen herumtreiben.
        
 herzloser mittelfinga wär ein besserer nick
        
Text
Nein.Es war eine Single-Börse,  die den ganzen deutschsprachigen Raum umfasst- ich glaube Lovescout. Hat aber nur zu virtuellen Verbindungen geführt- eben weil die betreffenden Männer hunderte Kilometer von mir entfernt gewohnt haben.Und dann die Websingles mit 2 wirklichen Volltreffern.Und dort ist es mittlerweile ähnlich wie auf Facebook: Es treiben sich hauptsächlich ältere Semester dort herum.Ich hab auch die Ge

With the e5 multilingual model, VectorStoreIndex seems to yield slightly better results. Does this hold true for the predictions too?

# Let's make some predictions and evaluate them!

# First with the "dbmdz/bert-base-german-uncased" model

In [None]:
if run_this:
    # Same query texts, three neighbourhood sizes, BERT embeddings.
    predictions_Bert_5_df, predictions_Bert_10_df, predictions_Bert_20_df = [
        run_rag_pipeline(train_df, test_data, "Bert", top_k=k)
        for k in (5, 10, 20)
    ]

In [178]:
# Score each BERT run against the first 100 development rows.
bert_runs = (
    ('with k = 5', predictions_Bert_5_df),
    ('with k = 10', predictions_Bert_10_df),  # performs best here and overall
    ('with k = 20', predictions_Bert_20_df),
)
for heading, predicted_df in bert_runs:
    print(heading)
    compute_f1(dev_df.iloc[:100], predicted_df)

with k = 5
Dev set F1 score Bin Maj: 0.6504735195771761
Dev set F1 score Bin One: 0.5367466666666667
Dev set F1 score Bin All: 0.8348235294117647
Dev set F1 score Multi Maj: 0.6423015873015874
Dev set F1 score Disagree Bin: 0.632291543814129
with k = 10
Dev set F1 score Bin Maj: 0.7036011396011397
Dev set F1 score Bin One: 0.5529845755581921
Dev set F1 score Bin All: 0.8570327552986512
Dev set F1 score Multi Maj: 0.7003333333333334
Dev set F1 score Disagree Bin: 0.5868253968253968
with k = 20
Dev set F1 score Bin Maj: 0.7124080882352942
Dev set F1 score Bin One: 0.5031111111111111
Dev set F1 score Bin All: 0.8389743589743591
Dev set F1 score Multi Maj: 0.6386732919254658
Dev set F1 score Disagree Bin: 0.5875908099088692


# Now with the "intfloat/multilingual-e5-large" model

In [None]:
if run_this:
    predictions_e5_5_df = run_rag_pipeline(train_df, test_data, "notBert", top_k=5)
    predictions_e5_10_df = run_rag_pipeline(train_df, test_data, "notBert", top_k=10)
    predictions_e5_20_df = run_rag_pipeline(train_df, test_data, "notBert", top_k=20)

In [180]:
# Score each e5 run against the first 100 development rows.
e5_runs = (
    ('with k = 5', predictions_e5_5_df),  # performs best here
    ('with k = 10', predictions_e5_10_df),
    ('with k = 20', predictions_e5_20_df),
)
for heading, predicted_df in e5_runs:
    print(heading)
    compute_f1(dev_df.iloc[:100], predicted_df)

with k = 5
Dev set F1 score Bin Maj: 0.7029249011857708
Dev set F1 score Bin One: 0.5956626506024096
Dev set F1 score Bin All: 0.8341621621621621
Dev set F1 score Multi Maj: 0.6412717536813922
Dev set F1 score Disagree Bin: 0.5785291425083918
with k = 10
Dev set F1 score Bin Maj: 0.7135690396559962
Dev set F1 score Bin One: 0.5674819541793075
Dev set F1 score Bin All: 0.8188235294117647
Dev set F1 score Multi Maj: 0.6366279069767442
Dev set F1 score Disagree Bin: 0.5483636363636363
with k = 20
Dev set F1 score Bin Maj: 0.6420833333333335
Dev set F1 score Bin One: 0.57984
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.6428571428571428
Dev set F1 score Disagree Bin: 0.6078817733990148


# Conclusion

Pro: No issues with reproducibility here
Contra: Results are worse compared to the results from "comparisonAPI.ipynb".