# Notebook for Analysis of Compression Benchmarks

This notebook generates comprehensive results from the logs collected during compression and decompression. It produces interactive graphs that can be used to understand the behaviour of a compressor and how the compression/decompression speed and the compression ratio vary over events and with respect to input load.

## Installing Required packages

In [None]:
%%capture
# Install third-party packages into the kernel's environment; %%capture hides pip's output.
# NOTE(review): plotly is imported below and fig.write_image() is used for PNG export,
# which needs a static-image backend (e.g. kaleido); neither is installed explicitly
# here - confirm they are available in the target environment.
%pip install chart_studio
%pip install datapane
%pip install pandas

In [None]:
import json
import matplotlib.pyplot as plt
from pprint import pprint
from decimal import Decimal
from operator import truediv
import os
import numpy as np
import statistics
import pandas as pd
import plotly.graph_objects as go
import re
import os

## Analysis helpers

In [None]:
def getCompressionRatioAverage(inputSizeList,outputSizeList):
    """Return the arithmetic mean of the per-event ratios ``output / input``.

    Each ratio pairs ``outputSizeList[i]`` with ``inputSizeList[i]`` and is
    wrapped in ``Decimal`` before averaging.

    Args:
        inputSizeList: indexable sequence of per-event input sizes.
        outputSizeList: iterable of per-event output sizes.

    Returns:
        The mean ratio as computed by ``statistics.mean``.

    Raises:
        IndexError: if ``outputSizeList`` is longer than ``inputSizeList``.
        statistics.StatisticsError: if ``outputSizeList`` is empty.
    """
    perEventRatios = [
        Decimal(outputSize / inputSizeList[position])
        for position, outputSize in enumerate(outputSizeList)
    ]
    return statistics.mean(perEventRatios)

In [None]:
def getCompressionRatioList(inputSizeList,outputSizeList):
    """Return the per-event ratios ``output / input`` as a list of ``Decimal``.

    Element ``i`` of the result is ``Decimal(outputSizeList[i] / inputSizeList[i])``.

    Args:
        inputSizeList: indexable sequence of per-event input sizes.
        outputSizeList: iterable of per-event output sizes.

    Returns:
        A list with one ``Decimal`` ratio per element of ``outputSizeList``.

    Raises:
        IndexError: if ``outputSizeList`` is longer than ``inputSizeList``.
    """
    return [
        Decimal(outputSize / inputSizeList[position])
        for position, outputSize in enumerate(outputSizeList)
    ]

In [None]:
def divide_and_plot_classes(x, y, y1): #inputSizeList, Speed list, Ratio list
    """Group points into classes of ascending ``x`` and average ``y`` / ``y1`` per class.

    The points are ordered by ``x`` and cut into consecutive chunks of
    ``len(x) // 9`` points, with a minimum of one point per chunk. When
    ``len(x)`` is not a multiple of the chunk size, the leftover points form
    an additional, smaller final class, so the number of classes is not fixed.
    Despite its name, this function does not plot anything; it only returns
    the data for the caller to plot.

    Args:
        x: values that define the ordering (callers pass per-event input sizes).
        y: first series to average per class (callers pass speeds).
        y1: second series to average per class (callers pass ratios).

    Returns:
        A tuple ``(means_y, means_y1, class_names)`` where the first two are
        lists of per-class means and ``class_names`` is
        ``['Class 1', 'Class 2', ...]``. All three are empty for empty input.
    """
    num_points = len(x)
    # A chunk size of 0 (fewer than 9 points) would make range() raise
    # ValueError, so small inputs fall back to one point per class.
    points_per_class = max(1, num_points // 9)

    # Reorder both series by ascending 'x'
    sorted_indices = np.argsort(x)
    sorted_y = np.array(y)[sorted_indices]
    sorted_y1 = np.array(y1)[sorted_indices]

    # Average each consecutive chunk of the sorted series
    means_y = []
    means_y1 = []
    for start in range(0, num_points, points_per_class):
        stop = start + points_per_class
        means_y.append(np.mean(sorted_y[start:stop]))
        means_y1.append(np.mean(sorted_y1[start:stop]))

    class_names = ['Class {}'.format(i+1) for i in range(len(means_y))]

    return means_y,means_y1,class_names

In [None]:
import plotly.graph_objects as go

def plot_3d_curve(x_list, y_list, z_list):
    """Display the given coordinates as a blue 3D line with markers.

    Args:
        x_list: x coordinates of the curve.
        y_list: y coordinates of the curve.
        z_list: z coordinates of the curve.

    Returns:
        None. The figure is shown via ``Figure.show()``.
    """
    curve_style = {'color': 'blue', 'width': 2}
    curve = go.Scatter3d(
        x=x_list, y=y_list, z=z_list,
        mode='lines+markers',
        line=curve_style,
    )

    # Axis captions for the 3D scene
    scene_axes = {
        'xaxis': {'title': 'X'},
        'yaxis': {'title': 'Y'},
        'zaxis': {'title': 'Z'},
    }
    figure = go.Figure(data=[curve], layout=go.Layout(scene=scene_axes))
    figure.show()


## Target 

In [None]:
# Folder that holds the compression and decompression JSON logs. Replace the
# placeholder before running: later cells glob "*.json" inside this folder and
# derive the image output directory from this path.
logsFolder = "<Folder containing compression and decompression JSON LOGS>"

In [None]:
# Directory that receives the exported PNG figures. os.path.join inserts the
# separator when logsFolder does not already end with one, so the images land
# in "<logsFolder>/images/" instead of a sibling such as "<logsFolder>images/".
# The trailing separator is kept because later cells append the file name
# directly to this string.
logsFolderImages = os.path.join(logsFolder, "images") + os.sep
# exist_ok avoids the separate existence check and its race with creation.
os.makedirs(logsFolderImages, exist_ok=True)

## Compression Benchmarks

In [None]:
# Discover every JSON log in the folder and prepare the containers that the
# compression analysis fills in.
import glob
import plotly.graph_objects as go
jsonFiles = glob.glob(logsFolder + '/*.json')

# fig/fig1 plot speed/ratio per event; fig2/fig3 plot speed/ratio vs input size
fig, fig1, fig2, fig3 = (go.Figure() for _ in range(4))

# Per-file summary table; 'loc' is the index of the next row to write
columns = ['Compressor', 'File Name', 'Compression Level', 'Average Compression Ratio','Average Compression Speed']
compressionDf = pd.DataFrame(columns=columns)
loc=0

# Accumulators filled by the per-file analysis loop
dataLedger = []
debugCompressionRatioList = []
classData = []

# Keep only the logs whose "Compressor" field does not mark a decompression run
targetFiles = []
for jsonfile in jsonFiles:
    with open(jsonfile) as f:
        jsonParse = json.load(f)
        if ":: Decompression Run " in jsonParse["Compressor"]:
            continue
        print (f'{f.name} ---> {jsonParse["Compressor"]}')
        targetFiles.append(jsonfile)
## Plotting Metrics
# For every compression log: derive the per-event speed and ratio series,
# store the per-file averages in compressionDf and add one trace per file to
# fig (speed per event), fig1 (ratio per event), fig2 (speed vs input size)
# and fig3 (ratio vs input size).
for jsonFile in targetFiles:
    # Only the parsed log is needed afterwards, so the file is closed right away.
    with open(jsonFile) as f:
        compression_log = json.load(f)

    # EVENT_MAX caps how many events are drawn; the averages use all events.
    EVENT_MAX=100
    eventCount = int(compression_log["eventCount"])
    if eventCount > EVENT_MAX:
        print (f"Too many events in file - visualizing results for first {EVENT_MAX} events")
    else:
        EVENT_MAX = eventCount
    fileName = os.path.basename(compression_log["Filename"])
    compressorConfig = compression_log["compressorConfig"]

    # Compression level parsed from the config string; '10' is the existing
    # fallback when the config does not contain "compressionLevel:<digits>".
    pattern = r"compressionLevel:(\d+)"
    levelMatch = re.search(pattern, compressorConfig) if isinstance(compressorConfig, str) else None
    levelCompression = levelMatch.group(1) if levelMatch else '10'

    eventList = compression_log["evdata"] # The event List
    # Filter dictionary: eventHeader field -> required value.
    # Leave empty if all events are to be considered.
    filter_dict = {
        #  "trig": "2"
    }
    if filter_dict:
        print("Filtering Events based on")
        pprint(filter_dict)
        # An event is kept once, and only if it satisfies every filter field.
        eventListAnalysis = [
            event for event in eventList
            if all(event["eventHeader"][field] == wanted for field, wanted in filter_dict.items())
        ]
    else:
        print("No Filter given - Analysis for all events")
        eventListAnalysis = eventList

    # Means are undefined without data; skip the file before any row or trace
    # is created for it.
    if not eventListAnalysis:
        print(f"No events to analyze in {fileName} - skipping")
        continue

    # Per-event metrics over all selected events
    compressionRatioList = [Decimal(event["compressionRatio"]) for event in eventListAnalysis]
    debugCompressionRatioList.append(compressionRatioList)
    timeTakenList = [Decimal(event["timeTaken"]) for event in eventListAnalysis]
    inputSizeList = [Decimal(event["inputSize"]) for event in eventListAnalysis]
    outputSizeList = [Decimal(event["outputSize"]) for event in eventListAnalysis]
    compressionSpeedList  = list(map(truediv,inputSizeList,timeTakenList))

    classData.append([divide_and_plot_classes(inputSizeList,compressionSpeedList,compressionRatioList),compressorConfig,fileName])

    # statistics.stdev needs at least two data points; report NaN otherwise.
    hasSpread = len(eventListAnalysis) > 1
    meanhere = statistics.mean(compressionSpeedList)
    stdevhere = statistics.stdev(compressionSpeedList) if hasSpread else Decimal("NaN")
    meanhereratio = statistics.mean(compressionRatioList)
    stdevhereratio = statistics.stdev(compressionRatioList) if hasSpread else Decimal("NaN")

    # Limit what is drawn to the first EVENT_MAX events
    x_axis = range(len(eventListAnalysis))[0:EVENT_MAX]
    compressionRatioList = compressionRatioList[0:EVENT_MAX]
    compressionSpeedList = compressionSpeedList[0:EVENT_MAX]
    x_axis_1 = inputSizeList[0:EVENT_MAX]
    plottedEvents = eventListAnalysis[0:EVENT_MAX]

    compressionDf.loc[loc,'Compressor'] = compression_log["Compressor"]
    compressionDf.loc[loc,'File Name'] = fileName
    compressionDf.loc[loc,'Compression Level'] = levelCompression
    compressionDf.loc[loc,'Average Compression Speed'] = meanhere
    compressionDf.loc[loc,'Average Compression Ratio'] = meanhereratio

    dataLedger.append((list(x_axis),compressionSpeedList,compressionRatioList))
    loc=loc+1

    # Hover text is built once per metric and only for the plotted events.
    hoverFormat = 'Event Tag: {} <br> Event Trigger: {}  <br> Mean: {} <br> StDev: {} <br> FileName: {}'
    speedHoverText = [
        hoverFormat.format(event["eventHeader"]["tag"],event["eventHeader"]["trig"],meanhere,stdevhere,fileName)
        for event in plottedEvents
    ]
    ratioHoverText = [
        hoverFormat.format(event["eventHeader"]["tag"],event["eventHeader"]["trig"],meanhereratio,stdevhereratio,fileName)
        for event in plottedEvents
    ]

    fig.add_trace(go.Scattergl(
        x=list(x_axis),
        y=compressionSpeedList,
        mode='lines+markers',
        name=compressorConfig,
        hovertemplate =
        '<i>Compression Speed</i>:%{y:.3f}'+
        '<br>'+
        '<b>%{text}</b>',
        text = speedHoverText,
    ))
    fig1.add_trace(go.Scattergl(
        x=list(x_axis),
        y=compressionRatioList,
        mode='lines+markers',
        name=compressorConfig,
        hovertemplate =
        '<i>Compression Ratio</i>:%{y:.3f}'+
        '<br>'+
        '<b>%{text}</b>',
        text = ratioHoverText,
    ))
    fig2.add_trace(go.Scattergl(
        x=list(x_axis_1),
        y=compressionSpeedList,
        mode='markers',
        name=compressorConfig,
        hovertemplate =
        '<i>Compression Speed</i>:%{y:.3f}'+
        '<br><b>Input Size</b>: %{x:.1f}<br>'+
        '<br>'+
        '<b>%{text}</b>',
        text = speedHoverText,
    ))
    fig3.add_trace(go.Scattergl(
        x=list(x_axis_1),
        y=compressionRatioList,
        mode='markers',
        name=compressorConfig,
        hovertemplate =
        '<i>Compression Ratio</i>:%{y:.3f}'+
        '<br><b>Input Size</b>: %{x:.1f}<br>'+
        '<br>'+
        '<b>%{text}</b>',
        text = ratioHoverText,
    ))
        
        
        
# (figure, title, x-axis title, y-axis title, PNG file name) for each compression view
compressionFigureSpecs = [
    (fig, 'Compression Speed in bytes per microsecond', 'Event No', 'Compression Speed', "Compression Speed for events.png"),
    (fig1, 'Compression Ratio', 'Event No', 'Compression Ratio', "Compression Ratio for events.png"),
    (fig2, 'Compression Speed', 'Event Input Size', 'Compression Speed', "Compression Speed vs input size.png"),
    (fig3, 'Compression Ratio', 'Event Input Size', 'Compression Ratio', "Compression Ratio vs input size.png"),
]

# Style every figure first ...
for specFigure, specTitle, specXTitle, specYTitle, _specImage in compressionFigureSpecs:
    specFigure.update_layout(
        font_family="Times New Roman",
        title=specTitle,
        xaxis_title=specXTitle,
        yaxis_title=specYTitle,
        hoverlabel_align = 'right'
    )

# ... then export each one as a PNG into the images folder
for specFigure, _specTitle, _specXTitle, _specYTitle, specImage in compressionFigureSpecs:
    specFigure.write_image(logsFolderImages+specImage,format='png')


### Plots and Visualizations

In [None]:
# Interactive view: compression speed per event index, one trace per log file.
fig.show()

In [None]:
# Interactive view: compression ratio per event index, one trace per log file.
fig1.show()

### Data

In [None]:
# Per-file summary: compressor, file name, compression level and the average
# compression ratio / speed computed in the compression benchmark cell.
compressionDf

### Tradeoff Analysis

In [None]:
import plotly.graph_objects as go

# classfig: avg ratio vs avg speed (scatter); classfig2 / classfig3: avg ratio
# and avg speed per input-size class (bars).
classfig, classfig2, classfig3 = go.Figure(), go.Figure(), go.Figure()
print("Plotting data for Input Size Classes - Class 1 to 10 ")

for data in classData:
    # Each entry is [(speed means, ratio means, class names), config label, file name]
    (avgSpeeds, avgRatios, classLabels), traceLabel, sourceFile = data
    classHoverText = ['{} <br> FileName: {}'.format(label, sourceFile) for label in classLabels]

    classfig.add_trace(go.Scatter(
        x=avgSpeeds,
        y=avgRatios,
        mode='markers',
        name=traceLabel,
        hovertemplate='<i>Avg Compression Ratio</i>:%{y:.3f}<br><b>Avg Compression Speed</b>: %{x:.1f}<br><br><b>%{text}</b>',
        text=classHoverText,
    ))
    classfig2.add_trace(go.Bar(
        x=classLabels,
        y=avgRatios,
        name=traceLabel,
        hovertemplate='<i>Avg Compression Ratio</i>:%{y:.3f}<br><b>%{text}</b>',
        text=classHoverText,
    ))
    classfig3.add_trace(go.Bar(
        x=classLabels,
        y=avgSpeeds,
        name=traceLabel,
        hovertemplate='<i>Avg Compression Speed</i>:%{y:.3f}<br><b>%{text}</b>',
        text=classHoverText,
    ))

    
# Titles and axis labels for the three compression trade-off figures
classfig.update_layout(
    font_family="Times New Roman",
    title='Avg Compression Ratio vs Avg Compression Speed ',
    xaxis_title='Avg Compression Speed',
    yaxis_title='Avg Compression Ratio',
    hoverlabel_align = 'right'
)
classfig2.update_layout(
    font_family="Times New Roman",
    title='Avg Compression Ratio ',
    xaxis_title='Input Size Classes',
    yaxis_title='Avg Compression Ratio',
    hoverlabel_align = 'right'
)
classfig3.update_layout(
    font_family="Times New Roman",
    title='Avg Compression Speed ',
    xaxis_title='Input Size Classes',
    yaxis_title='Avg Compression Speed',
    hoverlabel_align = 'right'
)
# Export as PNG. classfig2 holds the ratio bars and classfig3 the speed bars,
# so each file name matches the figure written to it (the two names were
# previously swapped).
classfig.write_image(logsFolderImages+"Avg Compression Ratio vs Avg Compression Speed.png",format='png')
classfig2.write_image(logsFolderImages+"Compression Ratio class div.png",format='png')
classfig3.write_image(logsFolderImages+"Compression Speed class div.png",format='png')



#### Plots and Visualizations

In [None]:
# Scatter of per-class average compression ratio against average compression speed.
classfig.show()

In [None]:
# Bar chart of the average compression ratio per input-size class.
classfig2.show()

In [None]:
# Bar chart of the average compression speed per input-size class.
classfig3.show()

In [None]:
# Interactive view: compression speed against event input size.
fig2.show()

In [None]:
# Interactive view: compression ratio against event input size.
fig3.show()

## Decompression Metrics

In [None]:
import glob
import plotly.graph_objects as go

# Discover every JSON log again; this time keep only the decompression runs
jsonFiles = glob.glob(logsFolder + '/*.json')

# fig4: decompression speed per event; fig5: decompression speed vs output size
fig4, fig5 = go.Figure(), go.Figure()

targetFiles = []
for jsonfile in jsonFiles:
    with open(jsonfile) as f:
        jsonParse = json.load(f)
        if ":: Decompression Run " not in jsonParse["Compressor"]:
            continue
        print (f'{f.name} ---> {jsonParse["Compressor"]}')
        targetFiles.append(jsonfile)

## Plotting Metrics
# Per-file summary table; 'loc' is the index of the next row to write
columns = ['Compressor', 'File Name', 'Compression Level', 'Average Compression Ratio','Average Decompression Speed']
decompressionDf = pd.DataFrame(columns=columns)
loc=0

# NOTE: classData is re-bound here, so from this point on it holds
# decompression data instead of the compression data collected earlier.
classData = []
debugDecRatio = []

# For every decompression log: derive the per-event decompression speed and
# size ratio, store the per-file averages in decompressionDf and add one trace
# per file to fig4 (speed per event) and fig5 (speed vs output size).
for jsonFile in targetFiles:
    # Only the parsed log is needed afterwards, so the file is closed right away.
    with open(jsonFile) as f:
        compression_log = json.load(f)

    # EVENT_MAX caps how many events are drawn; the averages use all events.
    EVENT_MAX=100
    eventCount = int(compression_log["eventCount"])
    if eventCount > EVENT_MAX:
        print (f"Too many events in file - visualizing results for first {EVENT_MAX} events")
    else:
        EVENT_MAX = eventCount
    fileName = os.path.basename(compression_log["Filename"])
    compressorConfig = compression_log["compressorConfig"]

    # Compression level parsed from the config string; '10' is the existing
    # fallback when the config does not contain "compressionLevel:<digits>".
    pattern = r"compressionLevel:(\d+)"
    levelMatch = re.search(pattern, compressorConfig) if isinstance(compressorConfig, str) else None
    levelCompression = levelMatch.group(1) if levelMatch else '10'

    eventList = compression_log["evdata"] # The event List
    # Filter dictionary: eventHeader field -> required value.
    # Leave empty if all events are to be considered.
    filter_dict = {
        #  "trig": "2"
    }
    if filter_dict:
        print("Filtering Events based on")
        pprint(filter_dict)
        # An event is kept once, and only if it satisfies every filter field.
        eventListAnalysis = [
            event for event in eventList
            if all(event["eventHeader"][field] == wanted for field, wanted in filter_dict.items())
        ]
    else:
        print("No Filter given - Analysis for all events")
        eventListAnalysis = eventList

    # Means are undefined without data; skip the file before any row or trace
    # is created for it.
    if not eventListAnalysis:
        print(f"No events to analyze in {fileName} - skipping")
        continue

    # Per-event metrics over all selected events
    compressionRatioList = [Decimal(event["compressionRatio"]) for event in eventListAnalysis]  # not used further in this cell
    timeTakenList = [Decimal(event["timeTaken"]) for event in eventListAnalysis]
    inputSizeList = [Decimal(event["inputSize"]) for event in eventListAnalysis]
    outputSizeList = [Decimal(event["outputSize"]) for event in eventListAnalysis]
    averageCompRatio = getCompressionRatioAverage(inputSizeList,outputSizeList)
    decompressionSpeedList  = list(map(truediv,outputSizeList,timeTakenList))
    decompressionRatioList = getCompressionRatioList(inputSizeList,outputSizeList)
    debugDecRatio.append(decompressionRatioList)

    classData.append([divide_and_plot_classes(inputSizeList,decompressionSpeedList ,decompressionRatioList),compressorConfig,fileName])

    # statistics.stdev needs at least two data points; report NaN otherwise.
    meanhere = statistics.mean(decompressionSpeedList)
    stdevhere = statistics.stdev(decompressionSpeedList) if len(decompressionSpeedList) > 1 else Decimal("NaN")

    # Limit what is drawn to the first EVENT_MAX events
    x_axis = range(len(eventListAnalysis))[0:EVENT_MAX]
    decompressionSpeedList = decompressionSpeedList[0:EVENT_MAX]
    plottedEvents = eventListAnalysis[0:EVENT_MAX]

    decompressionDf.loc[loc,'Compressor'] = compression_log["Compressor"]
    decompressionDf.loc[loc,'File Name'] = fileName
    decompressionDf.loc[loc,'Compression Level'] = levelCompression
    decompressionDf.loc[loc,'Average Decompression Speed'] = meanhere
    decompressionDf.loc[loc,'Average Compression Ratio'] = averageCompRatio
    loc=loc+1

    # Hover text is built only for the plotted events.
    hoverFormat = 'Event Tag: {} <br> Event Trigger: {}  <br> Mean: {} <br> StDev: {} <br> FileName: {}'
    hoverText = [
        hoverFormat.format(event["eventHeader"]["tag"],event["eventHeader"]["trig"],meanhere,stdevhere,fileName)
        for event in plottedEvents
    ]

    fig4.add_trace(go.Scatter(
        x=list(x_axis),
        y=decompressionSpeedList,
        mode='lines+markers',
        name=compressorConfig,
        # y carries the decompression speed, so the hover label says so.
        hovertemplate =
        '<i>Decompression Speed</i>:%{y:.3f}'+
        '<br>'+
        '<b>%{text}</b>',
        text = [line + ' ' for line in hoverText],
    ))
    fig5.add_trace(go.Scatter(
        x=list(outputSizeList)[0:EVENT_MAX],
        y=decompressionSpeedList,
        mode='markers',
        name=compressorConfig,
        hovertemplate =
        '<i>Decompression Speed</i>:%{y:.3f}'+
        '<br>'+
        '<b>%{text}</b>',
        text = hoverText,
    ))
        
# (figure, x-axis title, PNG file name) for the two decompression views; both
# share the same title and y-axis label.
decompressionFigureSpecs = [
    (fig4, 'Event No', "Decompression Speed for events.png"),
    (fig5, 'Event Output Size', "Decompression Speed vs Output Size.png"),
]

# Style both figures first ...
for specFigure, specXTitle, _specImage in decompressionFigureSpecs:
    specFigure.update_layout(
        font_family="Times New Roman",
        title='Decompression Speed in bytes per microsecond',
        xaxis_title=specXTitle,
        yaxis_title='Decompression Speed',
        hoverlabel_align = 'right'
    )

# ... then export each one as a PNG into the images folder
for specFigure, _specXTitle, specImage in decompressionFigureSpecs:
    specFigure.write_image(logsFolderImages+specImage,format='png')

# Display the per-event decompression speed figure
fig4.show()

### Plots

In [None]:
# Interactive view: decompression speed against event output size.
fig5.show()

### Data

In [None]:
# Per-file summary: compressor, file name, compression level and the average
# compression ratio / decompression speed computed in the decompression cell.
decompressionDf

### Tradeoff Analysis

In [None]:
import plotly.graph_objects as go

# declassfig: avg ratio vs avg decompression speed (scatter); declassfig2 /
# declassfig3: avg ratio and avg decompression speed per input-size class (bars).
declassfig, declassfig2, declassfig3 = go.Figure(), go.Figure(), go.Figure()
print("Plotting data for Input Size Classes - Class 1 to 10 ")

for data in classData:
    # Each entry is [(speed means, ratio means, class names), config label, file name]
    (avgSpeeds, avgRatios, classLabels), traceLabel, sourceFile = data
    classHoverText = ['{} <br> FileName: {}'.format(label, sourceFile) for label in classLabels]

    declassfig.add_trace(go.Scatter(
        x=avgSpeeds,
        y=avgRatios,
        mode='markers',
        name=traceLabel,
        hovertemplate='<i>Avg Compression Ratio</i>:%{y:.3f}<br><b>Avg Decompression Speed</b>: %{x:.1f}<br><br><b>%{text}</b>',
        text=classHoverText,
    ))
    declassfig2.add_trace(go.Bar(
        x=classLabels,
        y=avgRatios,
        name=traceLabel,
        hovertemplate='<i>Avg Compression Ratio</i>:%{y:.3f}<br><b>%{text}</b>',
        text=classHoverText,
    ))
    declassfig3.add_trace(go.Bar(
        x=classLabels,
        y=avgSpeeds,
        name=traceLabel,
        hovertemplate='<i>Avg Decompression Speed</i>:%{y:.3f}<br><b>%{text}</b>',
        text=classHoverText,
    ))

    
# Titles and axis labels for the three decompression trade-off figures
declassfig.update_layout(
    font_family="Times New Roman",
    title='Avg Compression Ratio vs Avg Decompression Speed ',
    xaxis_title='Avg Decompression Speed',
    yaxis_title='Avg Compression Ratio',
    hoverlabel_align = 'right'
)
declassfig2.update_layout(
    font_family="Times New Roman",
    title='Avg Compression Ratio ',
    xaxis_title='Input Size Classes',
    yaxis_title='Avg Compression Ratio',
    hoverlabel_align = 'right'
)
# declassfig3 plots the average decompression speed, so the y-axis is labelled
# accordingly (it previously read 'Avg Compression Speed').
declassfig3.update_layout(
    font_family="Times New Roman",
    title='Avg Decompression Speed ',
    xaxis_title='Input Size Classes',
    yaxis_title='Avg Decompression Speed',
    hoverlabel_align = 'right'
)
# Export the scatter and the speed bars as PNG into the images folder
declassfig.write_image(logsFolderImages+"Avg Compression Ratio vs Avg Decompression Speed.png",format='png')
declassfig3.write_image(logsFolderImages+"Avg Decompression Speed class div.png",format='png')

declassfig.show()

In [None]:
# Bar chart of the average decompression speed per input-size class.
declassfig3.show()