In [19]:
import os
import pandas as pd

def load_bm25_data(root_folder):
    bm25_data = {}

    # Iterate through the retriever folders
    for retriever_folder in os.listdir(root_folder):
        retriever_path = os.path.join(root_folder, retriever_folder)

        # Check if it's a directory
        if os.path.isdir(retriever_path):
            retriever_data = {}

            # Iterate through language folders (de, en, etc.)
            for lang_folder in os.listdir(retriever_path):
                lang_path = os.path.join(retriever_path, lang_folder)

                # Check if it's a directory
                if os.path.isdir(lang_path):
                    index_data = {}

                    # Iterate through index folders
                    for index_folder in os.listdir(lang_path):
                        index_path = os.path.join(lang_path, index_folder)

                        # Check if it's a directory
                        if os.path.isdir(index_path):
                            # Load all CSV files in the index folder
                            index_files = {}
                            for csv_file in os.listdir(index_path):
                                if csv_file.endswith('.csv'):
                                    csv_path = os.path.join(index_path, csv_file)
                                    df = pd.read_csv(csv_path)
                                    index_files[csv_file] = df

                            index_data[index_folder] = index_files

                    retriever_data[lang_folder] = index_data

            bm25_data[retriever_folder] = retriever_data

    # Concatenate the DataFrames within the nested dictionary
    for retriever, retriever_data in bm25_data.items():
        for lang, lang_data in retriever_data.items():
            for index, index_files in lang_data.items():
                concatenated_df = pd.concat(index_files.values(), ignore_index=True)
                
                # Sort the DataFrame based on the "k" column
                concatenated_df = concatenated_df.sort_values(by='k')
                
                # Reset the index
                concatenated_df = concatenated_df.reset_index(drop=True)

                bm25_data[retriever][lang][index] = concatenated_df

    return bm25_data

root_folder = "./benchmarks-retriever"

# Load the BM25 data
data = load_bm25_data(root_folder)

# Access the concatenated, sorted, and reset index DataFrame for BM25 retriever, language 'de', index 'translated_IndexEnglishAll'
df_combined_sorted = data['bm25']['de']['translated_IndexEnglishAll']
print(df_combined_sorted)

   Retriever     k        HR       MRR
0       BM25     1  0.000039  0.000039
1       BM25     3  0.000117  0.000076
2       BM25     5  0.000187  0.000091
3       BM25    10  0.000420  0.000120
4       BM25    15  0.000606  0.000135
5       BM25    25  0.001011  0.000156
6       BM25    50  0.002356  0.000192
7       BM25   100  0.004681  0.000223
8       BM25   150  0.007503  0.000246
9       BM25   200  0.010528  0.000264
10      BM25   300  0.018451  0.000296
11      BM25   400  0.027773  0.000322
12      BM25   500  0.037633  0.000344
13      BM25  1000  0.093871  0.000422


In [26]:
data

{'dpr': {'de': {'translated_IndexEnglishAll':    Retriever    k        HR       MRR
   0   largeDPR    1  0.300796  0.300796
   1   largeDPR    3  0.439959  0.362059
   2   largeDPR    5  0.502232  0.376290
   3   largeDPR   10  0.586524  0.387556
   4   largeDPR   15  0.586524  0.387556
   5   largeDPR   25  0.586524  0.387556
   6   largeDPR   50  0.586524  0.387556
   7   largeDPR  100  0.586524  0.387556
   8   largeDPR  150  0.586524  0.387556
   9   largeDPR  200  0.586524  0.387556
   10  largeDPR  300  0.586524  0.387556
   11  largeDPR  400  0.586524  0.387556,
   'IndexGermanAll':    Retriever     k        HR       MRR
   0   largeDPR     1  0.235225  0.235225
   1   largeDPR     3  0.342329  0.282504
   2   largeDPR     5  0.389765  0.293309
   3   largeDPR    10  0.459314  0.302535
   4   largeDPR    15  0.459314  0.302535
   5   largeDPR    25  0.459314  0.302535
   6   largeDPR    50  0.459314  0.302535
   7   largeDPR   100  0.459314  0.302535
   8   largeDPR   150  0.45

In [32]:
def post_process_bm25_data(bm25_data):
    new_bm25_data = {}

    # Iterate through the retriever folders
    for retriever, retriever_data in bm25_data.items():
        if retriever.startswith('bm25_ce'):
            # Extract the k values from the original retriever name
            k_values = retriever.split('-CE-k')[0].split('-')[1:]

            print(k_values)
            # Create a new column "k_inner" in each DataFrame
            for lang, lang_data in retriever_data.items():
                for index, index_files in lang_data.items():
                    for df_name, df in index_files.items():
                        # Extract the k_inner value from the original retriever name if k_values is not empty
                        if k_values:
                            k_inner_value = k_values.pop(0)
                        
                            # Add a new column "k_inner" to the DataFrame
                            df['k_inner'] = int(k_inner_value)

            new_bm25_data[retriever] = retriever_data

    return new_bm25_data

In [33]:
bm25_data = load_bm25_data(root_folder)

In [34]:
bm25_data_post_processed = post_process_bm25_data(bm25_data)

In [36]:
df_combined_sorted = bm25_data_post_processed['bm25_ce_10']['de']['translated_IndexEnglishAll']


KeyError: 'bm25_ce_10'