# Let us also check what RAG can do for this project - KeywordTableIndex Version

In [46]:
!pip install python-dotenv pandas llama-index langchain langchain-community llama-index-embeddings-langchain  sentence-transformers llama-index-llms-openai




[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from llama_index.core import KeywordTableIndex, StorageContext, load_index_from_storage, Settings
from llama_index.core.schema import TextNode
from collections import Counter
from pathlib import Path
from llama_index.llms.openai import OpenAI 

In [2]:
from functions import load_jsonl, combine_data, compute_f1, check_df

# Global Variables 

In [3]:
#data
train_file_path = Path('data_germeval/train.jsonl')
dev_file_path = Path('data_germeval/development.jsonl')

#save/load paths
save_path = Path("keyword_index")

#which openAI model to use
llm_model = "gpt-3.5-turbo"

#do we want to compute the vector index?
run_this = True

# Let's load the API

In [4]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY_25")
print(f"API Key loaded: {OPENAI_API_KEY is not None}")

API Key loaded: True


# The usual setup

In [5]:
train_data = load_jsonl(train_file_path)
train_data_labeled = combine_data(train_data)
train_df = combine_data(train_data, dataframe = True)

dev_data = load_jsonl(dev_file_path)
dev_data_labeled = combine_data(dev_data)
dev_df = combine_data(dev_data, dataframe = True)

test_data = [dev_data_labeled[i]['text'] for i in range(100)]

# MAIN PART

# Let's get started with KeywordTableIndex

What happens here? Basically we index our data by keywords. An LLM is used for this purpose and hence, we do need an API key. After the indexing, the resulting keyword table points to the different data. If a new text is queried, the retriever filters out its "relevant" words and matches query text and data based on those (via the keyword table).

The LLM/API key is only used for indexing, not retrieving. The indexing costs about 0.4$.

In [6]:
def prepare_training_nodes(df):
    """
    Prepares our data for indexing.
    :param df: dataframe 
    :return: TextNode object of our dataframe
    """
    training_nodes = []
    
    for index, row in df.iterrows():
        text_content = f"""
        Example Text: {row['text']}
        """
        
        node = TextNode(
        text=text_content,
        metadata={
        'original_text': row['text'],
        'bin_maj_label': row['bin_maj_label'],
        'bin_one_label': row['bin_one_label'],
        'bin_all_label': row['bin_all_label'],
        'multi_maj_label': row['multi_maj_label'],
        'disagree_bin_label': row['disagree_bin_label'],
        'index': index
        }
        )
        training_nodes.append(node)
    
    return training_nodes

In [7]:
def load_index(load_path):
    """
    Loads precomputed index
    :param load_path: str, path to precomputed index
    :return: loaded index 
    """
    storage_context = StorageContext.from_defaults(persist_dir=load_path)
    index = load_index_from_storage(
        storage_context, 
    )
    print(f"Index loaded from {load_path}!")
    return index

In [8]:
def create_reload_keyword_index(nodes):
    """
    Creates or reloads a vector index using our nodes and LlamaIndex
    :param nodes: list of nodes 
    :return: generated or reloaded index
    """
    llm = OpenAI(api_key=OPENAI_API_KEY, model=llm_model)
    
    Settings.llm = llm
    
    if os.path.exists(save_path):
        print("Loading existing keyword index...")
        storage_context = StorageContext.from_defaults(persist_dir=save_path)
        return load_index_from_storage(storage_context)  
    else:
        print("Creating keyword index...")
        index = KeywordTableIndex(nodes)  
        
        index.storage_context.persist(persist_dir=save_path)
        print(f"Keyword index saved to {save_path}!")
        
        return index

In [9]:
def retrieve_similar_examples(query_text, index, top_k=5):
    """
    Retrieves similar examples using KeywordTableIndex
    :param query_text: str, query text
    :param index: KeywordTableIndex to use for retrieval
    :param top_k: int, number of similar examples
    :return: list of similar examples
    """
    # For KeywordTableIndex, just use as_retriever() without special parameters
    retriever = index.as_retriever(similarity_top_k=top_k)
    similar_nodes = retriever.retrieve(query_text)

    results = []
    for node in similar_nodes:
        results.append({
            'text': node.node.text,
            'metadata': node.node.metadata,
            'keyword_match_score': node.score
        })
    
    return results

In [10]:
def majority_aggregation(value_list):
    """
    Computes the majority vote of value_list. This makes the most sense as we are dealing with categorical data. 
    :param value_list: list of values
    :return: float
    """
    return Counter(value_list).most_common(1)[0][0]

In [11]:
def predict_from_similar_examples(similar_examples):
    """
    Predict target value from similar examples
    :param similar_examples: list of similar examples
    :return: list of predictions
    """
    if not similar_examples:
        return None

    target_values = [[ex['metadata']['bin_maj_label'] for ex in similar_examples],
                     [ex['metadata']['bin_one_label'] for ex in similar_examples],
                     [ex['metadata']['bin_all_label'] for ex in similar_examples],
                     [ex['metadata']['multi_maj_label'] for ex in similar_examples],
                     [ex['metadata']['disagree_bin_label'] for ex in similar_examples]]
    
    return [majority_aggregation(target_values) for target_values in target_values]
        

In [12]:
def data_to_dataframe(prediction):
    """
    takes the output of run_rag_pipeline and turns it into a pandas dataframe with fitting columns for comparison
    :param prediction: list of dictionaries of the form: {'query_text': , 'prediction':, 'num_similar_examples': 'similar_examples': } 
    :return: dataframe
    """
    prediction = [{'text': p['query_text'], 
                'bin_maj_label': p['prediction'][0],
                'bin_one_label': p['prediction'][1],
                'bin_all_label': p['prediction'][2], 
                'multi_maj_label':p['prediction'][3],
                'disagree_bin_label': p['prediction'][4],
                'similar_text_1': p['similar_examples'][0]['text'],
                'similar_text_2': p['similar_examples'][-1]['text']} for p in prediction]
    return pd.DataFrame(prediction, columns=prediction[0].keys())

In [13]:
def run_rag_pipeline(df, test_texts, top_k=5):
    """
    Complete RAG pipeline using LlamaIndex
    :param df: reference dataframe with text elements and labels
    :param test_texts: list of test texts
    :param top_k: int, number of similar examples
    :return: dataframe  
    """
    print("Step 1: Preparing training nodes...")
    training_nodes = prepare_training_nodes(df)
    
    print("Step 2: Creating/Reloading vector index...")
    index = create_reload_keyword_index(training_nodes)
    
    print("Step 3: Making predictions...")
    predictions = []
    
    for test_text in test_texts:
        similar_examples = retrieve_similar_examples(test_text, index, top_k)
        
        # if no similar examples are found
        if not similar_examples: 
            print(f"    No similar examples found for: {test_text[:50]}...")
            predictions.append({'query_text': test_text, 
                                 'prediction': [None, None, None, None, None],
                                 'similar_examples': [{'text':None}, {'text':None}]})
        else:
            prediction = predict_from_similar_examples(similar_examples)
            predictions.append({
                'query_text': test_text,
                'prediction': prediction,
                'num_similar_examples': len(similar_examples),
                'similar_examples': similar_examples[:2] 
            })
    #print(predictions)
    return data_to_dataframe(predictions)

# Let's see how well the indexing worked:

In [136]:
if run_this:
    test_index = run_rag_pipeline(train_df,test_data[:10])

Step 1: Preparing training nodes...
Step 2: Creating/Reloading vector index...
Creating keyword index...
Keyword index saved to keyword_index!
Step 3: Making predictions...


In [138]:
for index, row in test_index.iterrows():
    print('Text')
    print(row['text'])
    print('\n Retrieved "similar" ones')
    print(row['similar_text_1'])
    print(row['similar_text_2'])

Text
Das ist ein richtig gutes Portrait von Greta!

 Retrieved "similar" ones

        Example Text: Wird sich an Bord gscheit durchputzen lassen die Liebe Greta
        

        Example Text: Die Wahrscheinlichkeit dass ich irgendwann mal in Greta stecke ist wesentlich größer als dass Greta in mir stecken kann.
        
Text
bei den dort üblichen kalaschnikows wärs eher eine zahl mit ein paar nullen mehr ...

 Retrieved "similar" ones

        Example Text: warum seid ihr solche kleingeister? dann sinds 17,50 sekunden oder auch zwanzig... die zahl stand metaphorisch dafür wie klein die stadt ist aber ihr begriffsbehinderten hängt euch bei den kilometern auf... alter wie eng kann der horizont sein oder wie klein muss das selbstbewusstsein sein wenn man sich über sowas aufregt oder profiliert...
        

        Example Text: Warum fahren Frauen immer noch auf solche Events? Und nein, nicht die Frauen sind schuld - aber warum muss man sich immer noch, bei solchen Events diesem Risiko 

The keyword search went well. However, with KeywordTableIndex the most "similar" comments, are often very different in actual meaning. As we use the majority vote of the "most similar" comments for prediction, I would expect KeywordTableIndex to perform worse than VectorScoreIndex.

# Let's make some predictions and evaluate them!

In [14]:
if run_this:
    test_index_5 = run_rag_pipeline(train_df,test_data,top_k=5)
    test_index_10 = run_rag_pipeline(train_df,test_data,top_k=10)
    test_index_20 = run_rag_pipeline(train_df,test_data,top_k=20)

Step 1: Preparing training nodes...
Step 2: Creating/Reloading vector index...
Loading existing keyword index...
Loading llama_index.core.storage.kvstore.simple_kvstore from keyword_index\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from keyword_index\index_store.json.
Step 3: Making predictions...
    No similar examples found for: Kardinal schönbirn von östarreich ist mörderisch v...
    No similar examples found for: Wie wär's mit Vizespasti?...
Step 1: Preparing training nodes...
Step 2: Creating/Reloading vector index...
Loading existing keyword index...
Loading llama_index.core.storage.kvstore.simple_kvstore from keyword_index\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from keyword_index\index_store.json.
Step 3: Making predictions...
    No similar examples found for: Kardinal schönbirn von östarreich ist mörderisch v...
    No similar examples found for: Wie wär's mit Vizespasti?...
Step 1: Preparing training nodes...
Step 2

In [15]:
print('with k = 5')
compute_f1(*check_df(dev_df.iloc[:100], test_index_5))
print('with k = 10')
compute_f1(*check_df(dev_df.iloc[:100], test_index_10))
print('with k = 20')
compute_f1(*check_df(dev_df.iloc[:100], test_index_20)) #best model

with k = 5
number of removed entries: 2
Dev set F1 score Bin Maj: 0.7132726296260131
Dev set F1 score Bin One: 0.6261293523751996
Dev set F1 score Bin All: 0.8308490246927499
Dev set F1 score Multi Maj: 0.6594396794862524
Dev set F1 score Disagree Bin: 0.5827975025626688
with k = 10
number of removed entries: 2
Dev set F1 score Bin Maj: 0.6991540802367459
Dev set F1 score Bin One: 0.6261293523751996
Dev set F1 score Bin All: 0.8246031746031747
Dev set F1 score Multi Maj: 0.6594396794862524
Dev set F1 score Disagree Bin: 0.5737071966580164
with k = 20
number of removed entries: 2
Dev set F1 score Bin Maj: 0.7232339089481946
Dev set F1 score Bin One: 0.616755655125068
Dev set F1 score Bin All: 0.8308490246927499
Dev set F1 score Multi Maj: 0.6665495539533179
Dev set F1 score Disagree Bin: 0.5957751521661295


# Conclusion


Predictions get better with increasing number of similar examples retrieved.

As expected KeywordTableIndex performs worse than VectorScoreIndex and "comparisonAPI.ipynb". Considering it also cost money (which VectorScoreIndex did not), KeywordTableIndex with majority vote seems not to be the best tool for handling this task.   

KeywordTableIndex would most likely profit from a combined approach. First using this index to retrieve relevant examples and then sending them via a prompt template to an LLM to predict the outcome. This will be my last step.