In [None]:
!pip install pandas numpy matplotlib plotly

In [30]:
import os
from typing import List, Tuple
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [1]:
# Names of two raw statistics dumps (presumably the rqc-off and rqc-on runs).
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook, and the second name carries a " copy" suffix — confirm it is still needed.
files_to_read = ['rqc_off_stats.txt', 'rqc_on_stats copy.txt']

def read_file(file_name):
    """Return every line of the text file at ``file_name`` as a list of strings.

    The lines are returned exactly as ``readlines`` yields them, i.e. each one
    still carries its line terminator.
    """
    with open(file_name) as handle:
        contents = handle.readlines()
    return contents

def get_rocksdb_compact_write_bytes(lines) -> List[str]:
    """Select the lines that mention the ``rocksdb.compact.write.bytes`` counter.

    Matching is a plain substring test; selected lines are returned unmodified
    and in their original order.
    """
    marker = 'rocksdb.compact.write.bytes'
    selected = []
    for entry in lines:
        if marker in entry:
            selected.append(entry)
    return selected

def get_rocksdb_flush_write_bytes(lines) -> List[str]:
    """Select the lines that mention the ``rocksdb.flush.write.bytes`` counter.

    Matching is a plain substring test; selected lines are returned unmodified
    and in their original order.
    """
    marker = 'rocksdb.flush.write.bytes'
    selected = []
    for entry in lines:
        if marker in entry:
            selected.append(entry)
    return selected

def transform_bytes(lines) -> List[int]:
    """Convert ``'<name> : <value>'`` lines into a list of integers.

    Each line has surrounding spaces/newlines removed and is split on ``' : '``;
    the field right after the first separator is parsed with ``int``.
    """
    values = []
    for raw in lines:
        fields = raw.strip(' \n').split(' : ')
        values.append(int(fields[1]))
    return values

def get_total_bytes(compact_bytes, flush_bytes) -> List[int]:
    """Add the compaction and flush byte counts position by position.

    The result has one entry per element of ``compact_bytes``; ``flush_bytes``
    is indexed at the same positions, so it must be at least as long
    (otherwise an ``IndexError`` is raised) and any extra entries are ignored.
    """
    totals = []
    for position, compact_value in enumerate(compact_bytes):
        totals.append(compact_value + flush_bytes[position])
    return totals

def get_levels_stats(lines) -> List[str]:
    """Collect the level summary lines of a statistics dump.

    A line is kept when it contains ``'Level-'`` or ``'Total:'``. Spaces and
    tabs are stripped from both ends of the line; a trailing newline is not in
    the strip set, so it (and any whitespace directly before it) is preserved.

    Args:
        lines: iterable of text lines, e.g. the result of ``readlines``.

    Returns:
        The matching lines as plain strings, in their original order. (The
        previous ``List[Tuple[str, int, int]]`` annotation did not match what
        is actually returned.)
    """
    return [line.strip(' \t') for line in lines if 'Level-' in line or 'Total:' in line]


In [10]:
# Column names applied to the per-query `rqc_*_stats.csv` files, which are read
# without a header row.
STATS_COLUMNS = [
    'QueryNo', 'Type', 'Key',
    'NumLevelsBefore', 'NumLevelsAfter',
    'NumFilesBefore', 'NumFilesAfter',
    'NumEntriesBefore', 'NumEntriesAfter',
    'TotalTimeTaken',
]

# Sorted so that `results` comes out in the same order on every run, whatever
# order the filesystem lists the directories in (later cells pick element [0]).
experiments = sorted(os.listdir('./'))

results = list()

for experiment in experiments:

    if not os.path.isdir(experiment):
        continue

    # An experiment directory name is a space-separated sequence of
    # "<flag> <value>" pairs: four leading pairs (inserts, updates, range
    # queries, size ratio) followed by either one pair (rqc) or three pairs
    # (utl, ltu, rqc). Only the values at the odd positions are used.
    tokens = experiment.split(' ')
    if len(tokens) not in (10, 14):
        # Directories such as `.ipynb_checkpoints` do not follow the naming
        # scheme; skip them instead of failing on the unpacking below.
        print(f'Skipping directory that is not an experiment run: {experiment!r}')
        continue

    inserts, updates, range_queries, size_ratio = tokens[1:8:2]
    others = tokens[8:]

    if len(others) == 2:
        rqc = others[1]
        # No thresholds in the directory name: keep the numeric 0 placeholders.
        utl_value, ltu_value = 0, 0
    else:
        utl_value, ltu_value, rqc = others[1::2]

    # rqc == '0' marks a run with the feature switched off; its files use the
    # `rqc_off` prefix, every other run uses `rqc_on`.
    if rqc == '0':
        approach, prefix = 'vanilla', 'rqc_off'
    else:
        approach, prefix = 'rqdc', 'rqc_on'

    lines = read_file(f'{experiment}/{prefix}_stats.txt')
    compacted_vs_skipped = pd.read_csv(f'{experiment}/{prefix}_compacted_vs_skipped.csv')
    stats = pd.read_csv(f'{experiment}/{prefix}_stats.csv', names=STATS_COLUMNS)

    compact_write_bytes = get_rocksdb_compact_write_bytes(lines)
    flush_write_bytes = get_rocksdb_flush_write_bytes(lines)
    levels_details = get_levels_stats(lines)

    compact_bytes = transform_bytes(compact_write_bytes)
    flush_bytes = transform_bytes(flush_write_bytes)

    # The literal has a leading space: the 'Type' values are matched exactly as
    # they appear in the CSV (presumably written with ", " separators — TODO confirm).
    range_queries_ = stats[stats['Type'] == ' Range']
    # Re-number from 0 so the timings line up row-by-row with `compacted_vs_skipped`.
    total_time_taken = range_queries_.reset_index(drop=True)['TotalTimeTaken']
    timing_vs_compacted = compacted_vs_skipped.assign(TimeTaken=total_time_taken)

    results.append({
        'inserts': inserts,
        'updates': updates,
        'range_queries': range_queries,
        'size_ratio': size_ratio,
        'rqc': rqc,
        'utl': utl_value,
        'ltu': ltu_value,
        'approach': approach,
        'compact_write_bytes': compact_bytes,
        'flush_write_bytes': flush_bytes,
        'total_write_bytes': get_total_bytes(compact_bytes, flush_bytes),
        'levels_details': levels_details,
        'range_queries_time_vs_compacted_entries_count': timing_vs_compacted,
    })

    # df_plot = pd.DataFrame({
    #     'x': np.arange(len(timing_vs_compacted)),
    #     'Compacted': timing_vs_compacted['Compacted'].values,
    #     'Skipped': timing_vs_compacted['Skipped'].values,
    #     'TimeTaken': timing_vs_compacted['TimeTaken'].values
    # })

    # fig = px.bar(df_plot, x='x', y=['Compacted', 'Skipped'], title='Interactive Bar Plot with Scroll and Zoom',
    #             labels={'value': 'Number Of Entries Written', 'variable': 'Category', 'x': 'Range Query'},
    #             line_shape='linear', render_mode='svg')

    # # Add the time taken as a line on a secondary y-axis
    # fig.add_trace(px.line(df_plot, x='x', y='TimeTaken').data[0])

    # # Set the layout to include a range slider for scrolling
    # fig.update_layout(xaxis=dict(rangeslider=dict(visible=True)), showlegend=True)

    # fig.show()



    # fig, ax1 = plt.subplots(figsize=(25, 10))

    # x = np.arange(len(timing_vs_compacted))
    # plt.bar(x, timing_vs_compacted['Compacted'].values, 0.35, label='Compacted (valid)', color='b')
    # plt.bar(x + 0.35, timing_vs_compacted['Skipped'].values, 0.35, label='Skipped (invalid)', color='green')
    # ax1.set_xlabel('Range Query')
    # ax1.set_ylabel('Number Of Entries Written (each entry is 1K bytes)')

    # ax2 = ax1.twinx()
    # ax2.plot(x, timing_vs_compacted['TimeTaken'].values)
    # ax2.set_ylabel('Time Taken By Query')
    # ax2.tick_params('y')
    # ax2.set_ylim(0, 14)

    # plt.title(experiment)

    # plt.show()
    # break



  stats = pd.read_csv(f'{experiment}/rqc_on_stats.csv', names=['QueryNo', 'Type', 'Key', 'NumLevelsBefore', 'NumLevelsAfter', 'NumFilesBefore', 'NumFilesAfter', 'NumEntriesBefore', 'NumEntriesAfter', 'TotalTimeTaken'])
  stats = pd.read_csv(f'{experiment}/rqc_on_stats.csv', names=['QueryNo', 'Type', 'Key', 'NumLevelsBefore', 'NumLevelsAfter', 'NumFilesBefore', 'NumFilesAfter', 'NumEntriesBefore', 'NumEntriesAfter', 'TotalTimeTaken'])
  stats = pd.read_csv(f'{experiment}/rqc_off_stats.csv', names=['QueryNo', 'Type', 'Key', 'NumLevelsBefore', 'NumLevelsAfter', 'NumFilesBefore', 'NumFilesAfter', 'NumEntriesBefore', 'NumEntriesAfter', 'TotalTimeTaken'])
  stats = pd.read_csv(f'{experiment}/rqc_on_stats.csv', names=['QueryNo', 'Type', 'Key', 'NumLevelsBefore', 'NumLevelsAfter', 'NumFilesBefore', 'NumFilesAfter', 'NumEntriesBefore', 'NumEntriesAfter', 'TotalTimeTaken'])
  stats = pd.read_csv(f'{experiment}/rqc_on_stats.csv', names=['QueryNo', 'Type', 'Key', 'NumLevelsBefore', 'NumLevelsAfter

In [46]:
# size_ratios = ['3', '6']
size_ratios = ['6']
# utl_ltu_values_rqdc = [(0.4, 0.6), (0.4, 0.7), (0.4, 0.8), (0.5, 0.6), (0.5, 0.7), (0.5, 0.8), (0.6, 0.7), (0.6, 0.8), (0.7, 0.8)]
utl_ltu_values_rqdc = [(0.4, 0.8), (0.5, 0.6)]
utl_ltu_value_vanilla = [(0, 0)]

# Both time axes share one fixed range so the RQDC and vanilla lines are drawn
# on the same scale.
TIME_AXIS_RANGE = [0, 14]

# For every size ratio and every (utl, ltu) threshold pair, draw one figure:
# bars for the entries compacted per range query of the rqdc run, plus the
# per-query time of the rqdc run and of the vanilla run as two lines.
for size_ratio in size_ratios:
    rqdc_data = [exp for exp in results if exp['approach'] == 'rqdc' and exp['size_ratio'] == size_ratio]
    vanilla_data = [exp for exp in results if exp['approach'] == 'vanilla' and exp['size_ratio'] == size_ratio]

    if not vanilla_data:
        # Without a baseline run there is nothing to compare against; report it
        # instead of failing with a bare IndexError on `vanilla_data[0]`.
        print(f'No vanilla experiment found for size ratio {size_ratio}; skipping.')
        continue

    # Only the first matching vanilla run is used as the baseline.
    vanilla_run = vanilla_data[0]

    for utl, ltu in utl_ltu_values_rqdc:
        file_name = f"SizeRatio-{size_ratio}-UTL-{utl}-LTU-{ltu}"
        # Thresholds are stored as the strings taken from the directory name,
        # hence the comparison against str(utl) / str(ltu).
        rqdc_data_utl_ltu = [exp for exp in rqdc_data if exp['utl'] == str(utl) and exp['ltu'] == str(ltu)]

        if not rqdc_data_utl_ltu:
            print(f'No rqdc experiment found for size ratio {size_ratio}, utl {utl}, ltu {ltu}; skipping.')
            continue

        # Only the first matching rqdc run is plotted.
        rqdc_run = rqdc_data_utl_ltu[0]

        # Kept for the commented-out "total bytes" plots in this cell.
        rqdc_epochs = range(1, len(rqdc_run['total_write_bytes']) + 1)
        rqdc_total_bytes = rqdc_run['total_write_bytes']
        vanilla_total_bytes = vanilla_run['total_write_bytes']

        # vanilla_levels_info_string = "                 Vanilla\n" + "".join(vanilla_data[0]['levels_details'])
        # info_str = vanilla_levels_info_string + "\n\n                   RQDC\n" + "".join(rqdc_data_utl_ltu[0]['levels_details'])

        # plt.figure(figsize=(10, 6))
        # plt.plot(rqdc_epochs, rqdc_total_bytes, label=f'rqdc - utl: {utl} ltu: {ltu}', marker='o', color='blue')
        # plt.plot(rqdc_epochs, vanilla_total_bytes, label='vanilla', marker='s', color='orange')
        # plt.title(f'Total Write Bytes - Size Ratio {size_ratio}, Threshold UTL {utl}, Threshold LTU {ltu}')
        # plt.xlabel('Epoch')
        # plt.ylabel('Total Bytes Written')
        # plt.legend()
        # plt.grid(True)

        # plt.text(1.02, 0.5, info_str, transform=plt.gca().transAxes, fontsize=10, verticalalignment='center', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        # plt.show()

        timing_vs_compacted_rqdc = rqdc_run['range_queries_time_vs_compacted_entries_count']
        timing_vs_compacted_vanilla = vanilla_run['range_queries_time_vs_compacted_entries_count']

        # One row per range query; both runs must contain the same number of
        # range queries, otherwise the DataFrame constructor raises.
        df_plot = pd.DataFrame({
            'x': np.arange(len(timing_vs_compacted_rqdc)),
            'Compacted': timing_vs_compacted_rqdc['Compacted'].values,
            'TimeTakenRQDC': timing_vs_compacted_rqdc['TimeTaken'].values,
            'TimeTakenVanilla': timing_vs_compacted_vanilla['TimeTaken'].values,
        })

        # df_plot = pd.DataFrame({
        #     'x': np.arange(100),
        #     'Compacted': timing_vs_compacted_rqdc['Compacted'].values[900:1000],
        #     'TimeTakenRQDC': timing_vs_compacted_rqdc['TimeTaken'].values[900:1000],
        #     'TimeTakenVanilla': timing_vs_compacted_vanilla['TimeTaken'].values[900:1000],
        # })

        fig = go.Figure()

        # Bars: entries compacted per range query (primary y-axis).
        fig.add_trace(go.Bar(
            x=df_plot['x'],
            y=df_plot['Compacted'],
            name='Compacted',
            marker_color='blue'
        ))

        # Line: RQDC time per range query (secondary axis y2).
        fig.add_trace(go.Scatter(
            x=df_plot['x'],
            y=df_plot['TimeTakenRQDC'],
            mode='lines',
            name='Time Taken (RQDC)',
            yaxis='y2'
        ))

        # Line: vanilla time per range query (secondary axis y3).
        fig.add_trace(go.Scatter(
            x=df_plot['x'],
            y=df_plot['TimeTakenVanilla'],
            mode='lines',
            name='Time Taken (Vanilla)',
            yaxis='y3'
        ))

        # Single layout update: titles plus both right-hand time axes, which
        # overlay the primary axis.
        # NOTE(review): the title says "Total Write Bytes" while the figure shows
        # compacted entries and query time — confirm the intended wording.
        fig.update_layout(
            xaxis=dict(type='category'),
            xaxis_title='Range Query',
            yaxis_title='Number Of Entries Written',
            title=f'Total Write Bytes - Size Ratio {size_ratio}, Threshold UTL {utl}, Threshold LTU {ltu}, Entry Size - 256',
            showlegend=True,
            yaxis2=dict(
                range=TIME_AXIS_RANGE,
                title='Time Taken',
                overlaying='y',
                side='right'
            ),
            yaxis3=dict(
                range=TIME_AXIS_RANGE,
                # title='Time Taken (Vanilla)',
                overlaying='y',
                side='right',
                anchor='free',
                position=1
            )
        )

        fig.show()
        # fig.write_html(f'{file_name}.html')

        # epochs = np.arange(1, 13)

        # bar_width = 0.35
        # fig, ax = plt.subplots(figsize=(10, 6))

        # bar2 = ax.bar(epochs + bar_width/2, rqdc_total_bytes, bar_width, label='RQDC', color='blue')
        # bar1 = ax.bar(epochs - bar_width/2, vanilla_total_bytes, bar_width, label='Vanilla', color='orange')

        # # Adding labels and title
        # ax.set_xlabel('Epoch')
        # ax.set_ylabel('Total Bytes Written')
        # ax.set_title('Comparison between Vanilla and RQDC by Total Bytes Written'
        #              f'\nSize Ratio {size_ratio}, Threshold UTL {utl}, Threshold LTU {ltu}')
        # ax.set_xticks(epochs)
        # ax.legend()

        # # Show the plot
        # plt.show()


In [12]:
# Number of rqdc experiments matched by the last (utl, ltu) pair that the
# plotting loop processed; relies on that loop's variable still being in scope.
len(rqdc_data_utl_ltu)

1