In [2]:
import os
import pandas as pd

def parse_npy_log(file_path):
    npy_sizes_df = pd.read_csv(file_path, names=['filepath', 'original_npy'])
    
    # Extract the base filename from the filepath
    npy_sizes_df['filename'] = npy_sizes_df['filepath'].apply(lambda x: x.split('/')[-1].replace('.npy', ''))
    
    # build mapping from filename to original_npy size
    filename_to_size = dict(zip(npy_sizes_df['filename'], npy_sizes_df['original_npy']))
    return filename_to_size


def read_log_file(log_path):
    """ Reads the log file and returns a DataFrame. """
    columns = ['filepath', 'format', 'compression', 'level', 'file_size', 'runtime', 'shape0', 'shape1']
    df = pd.read_csv(log_path, names=columns)
    # print(f"df: {df}")
    return df

def collect_file_stats(log_df, dir_path, setup, stat_type='file_size', map = None):
    """ Collects file statistics based on the log DataFrame. """
    # Prepare a dictionary to hold the data
    data = {}
    # Define the compression settings and formats to check
    compressions = {
        'csv': ['none'],
        'feather': ['uncompressed', 'zstd', 'gzip', 'snappy', 'lz4', 'zlib'],
        'parquet': ['none', 'zstd', 'gzip', 'snappy', 'lz4', 'zlib'],
        'orc': ['uncompressed', 'zstd', 'gzip', 'snappy', 'lz4', 'zlib']
    }
    # Initialize the DataFrame columns
    columns = ['filename', 'original_npy']
    for fmt in compressions:
        for comp in compressions[fmt]:
            columns.append(f"{comp}_{fmt}")
            if comp == 'zstd':
                columns.append(f"{comp}_1_{fmt}")
                
    for item in orignal_npy_sizes_map:
        data[item] = {'filename': item, 'original_npy': orignal_npy_sizes_map[item]}
        # Please also add columns with default values 0: none_csv	uncompressed_feather	zstd_feather	zstd_1_feather	gzip_feather	snappy_feather	lz4_feather	zlib_feather	none_parquet	zstd_parquet	zstd_1_parquet	gzip_parquet	snappy_parquet	lz4_parquet	zlib_parquet	uncompressed_orc	zstd_orc	zstd_1_orc	gzip_orc	snappy_orc	lz4_orc	zlib_orc
        data[item]['none_csv'] = 0
        data[item]['uncompressed_feather'] = 0
        data[item]['zstd_feather'] = 0
        data[item]['zstd_1_feather'] = 0
        data[item]['gzip_feather'] = 0
        data[item]['snappy_feather'] = 0
        data[item]['lz4_feather'] = 0
        data[item]['zlib_feather'] = 0
        data[item]['none_parquet'] = 0
        data[item]['zstd_parquet'] = 0
        data[item]['zstd_1_parquet'] = 0
        data[item]['gzip_parquet'] = 0
        data[item]['snappy_parquet'] = 0
        data[item]['lz4_parquet'] = 0
        data[item]['zlib_parquet'] = 0
        data[item]['uncompressed_orc'] = 0
        data[item]['zstd_orc'] = 0
        data[item]['zstd_1_orc'] = 0
        data[item]['gzip_orc'] = 0
        data[item]['snappy_orc'] = 0
        data[item]['lz4_orc'] = 0
        data[item]['zlib_orc'] = 0
        
    for _, row in log_df.iterrows():
        # Extract the base filename from the filepath
        print("row['filepath']", row['filepath'])
        base_filename_full = row['filepath'].split('/')[-1].strip()
        print("base_filename_full:", base_filename_full)
        # if it start with "vec_" then remove it
        if base_filename_full.startswith('vec_'):
            base_filename = base_filename_full[4:]
        else:
            base_filename = base_filename_full
        # check if the base_filename_full starts with any key in the map, if so, get the corresponding row
        # print(f"looking for {base_filename} ")
        for key in orignal_npy_sizes_map.keys():
            if base_filename.startswith(key):
                base_filename = key
                # print(f"found {base_filename}")
                break
        # print(base_filename)
        
        
        comp = row['compression'].lower() if pd.notna(row['compression']) else 'none'
        comp = comp.strip()
        fmt = row['format'].strip()
        
        key = f"{comp}_{fmt}"
        if row['level'].__contains__('1'): 
            key = f"{comp}_1_{fmt}"
        # print(f"key: {key}")
        # print("data[base_filename]", data[base_filename])
        if stat_type == 'file_size':
            data[base_filename][key] = row['file_size']
        elif stat_type == 'runtime':
            data[base_filename][key] = row['runtime']
        elif stat_type == 'shape':
            data[base_filename][key] = row['shape0'] + row['shape1']
     

    # Create a DataFrame from the collected data
    df = pd.DataFrame.from_dict(data, orient='index', columns=columns)
    return df

def save_stats_to_csv(df, output_path):
    """ Saves the DataFrame to a CSV file. """
    df.to_csv(output_path, index=False)
    print(f"File stats saved to {output_path}")

# Configuration
setups = ['vec_', 'value_' ]
stat_types = ['file_size', 'runtime', 'shape']
for setup in setups:
    for stat_type in stat_types:
        print(f"Processing {setup} {stat_type}...")
        dir_path = '../embeddings/'
        log_path = f'batch_processing_{setup}log-full.csv'
        output_csv_path = f'{dir_path}{setup}file_{stat_type}_all.csv'
        npy_sizes_path = 'batch_npy_size_log_backup.csv'
        orignal_npy_sizes_map = parse_npy_log(npy_sizes_path)
        # print(orignal_npy_sizes_map)

        # Read log file
        log_df = read_log_file(log_path)

        # Collect file stats
        file_stats_df = collect_file_stats(log_df, dir_path, setup, stat_type=stat_type, map = orignal_npy_sizes_map)  # Change 'size' to 'runtime' if needed

        # Save to CSV
        save_stats_to_csv(file_stats_df, output_csv_path)



Processing vec_ file_size...
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_uncompressed_n.feather
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_uncompressed_n.feather
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_zstd_n.feather
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_zstd_n.feather
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_lz4_n.feather
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_lz4_n.feather
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_none_n.parquet
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_none_n.parquet
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_w

In [5]:
setups = ['vec_', 'value_' ]
stat_types = ['file_size', 'runtime', 'shape']
for setup in setups:
    for stat_type in stat_types:
        print(f"Processing {setup} {stat_type}...")
        dir_path = '../embeddings/'
        log_path = f'batch_read_{setup}log.csv'
        output_csv_path = f'{dir_path}read_{setup}file_{stat_type}_all.csv'
        npy_sizes_path = 'batch_npy_size_log_backup.csv'
        orignal_npy_sizes_map = parse_npy_log(npy_sizes_path)
        # print(orignal_npy_sizes_map)

        # Read log file
        log_df = read_log_file(log_path)

        # Collect file stats
        file_stats_df = collect_file_stats(log_df, dir_path, setup, stat_type=stat_type, map = orignal_npy_sizes_map)  # Change 'size' to 'runtime' if needed

        # Save to CSV
        save_stats_to_csv(file_stats_df, output_csv_path)

Processing vec_ file_size...
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_uncompressed_n.feather
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_uncompressed_n.feather
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_zstd_n.feather
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_zstd_n.feather
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_lz4_n.feather
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_lz4_n.feather
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_none_n.parquet
base_filename_full: vec_Eval_Pref_Dataset_with_stella_400M_v5_prompt_response_1_a_embeddings_none_n.parquet
row['filepath'] ../embeddings/vec_Eval_Pref_Dataset_w