# **Guiding Research Question**

*   **Application question** (*Speed is the priority*): What configuration should I use for my BFS kernel? For similar bandwidth (BW) requirements (reads/writes per second), what other configs might I consider?

*   In particular we are examining the difference between RRAM and SRAM under the following optimization targets:

    * **Read Latency**: The time it takes for the memory system to read data.
    
    * **Read Dynamic Energy**: How much energy is consumed per read request.

    * **ReadEDP**: The product of read latency and read energy. This quantifies a balance between fast and efficient reads.

    * **Area**: How much area on the chip each memory cell takes up.


In [12]:
#import statements
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
from ipywidgets import interact, interactive, fixed, interact_manual, IntRangeSlider, widgets, HBox, VBox, Layout
import ipywidgets as widgets
from math import log10, floor, ceil
import re
import copy
from matplotlib.lines import Line2D
import itertools
from IPython.display import display, Markdown
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, CheckboxGroup, CustomJS, Legend, LegendItem, CDSView, BooleanFilter, Range1d, LinearAxis, Div
import bokeh.models as bmo
from bokeh.layouts import column, row
from bokeh.palettes import d3
import warnings
warnings.filterwarnings('ignore')

Read in the data:

In [13]:
# Input location and the technologies to load. One CSV named
# '<technology>_1BPC-combined.csv' is read from `path` per technology.
path = "CSVFiles/" #path, change if needed
technologies = ['RRAM', 'SRAM'] #change this line if more technologies are needed
df_list = []

for technology in technologies:
    csv_name = technology + '_1BPC-combined.csv'
    temp_df = pd.read_csv(path + csv_name)
    # Keep only rows with a strictly positive area; negative values mark
    # erroneous configurations.
    has_valid_area = temp_df["Area (mm^2)"] > 0
    temp_df = temp_df[has_valid_area]
    df_list.append(temp_df)

# Stack the per-technology frames into one frame
dfs = pd.concat(df_list)

# Remove spaces from the two label columns used for grouping and filtering
for label_name in ('OptimizationTarget', 'MemCellType'):
    dfs[label_name] = dfs[label_name].str.replace(' ', '')

def findCapacity(x):
    """Return the capacity (in MB) named in a benchmark string.

    Every '<digits>MB' token in ``x`` is parsed as a whole number, so a name
    such as '...128MB' is no longer mistaken for 8 MB just because it ends in
    '8MB'.

    Args:
        x: Benchmark name, e.g. 'Facebook--BFS8MB'.

    Returns:
        8, 16, 32 or 64 (as an int) if that size appears in ``x``; when
        several appear, the first one in that order wins. Returns '' when
        none of these sizes is present.
    """
    sizes = {int(digits) for digits in re.findall(r'(\d+)MB', x)}
    # Check in the same priority order as the original if/elif chain.
    for capacity in (8, 16, 32, 64):
        if capacity in sizes:
            return capacity
    return ''
    
# Parse the capacity out of every benchmark name, then keep only the rows
# whose configured capacity equals the capacity named by the benchmark.
benchmark_names = dfs['Benchmark Name'].astype(str)
dfs['Capacity2'] = benchmark_names.map(findCapacity)
capacity_matches = dfs['Capacity2'] == dfs['Capacity (MB)']
dfs = dfs[capacity_matches]

# Key columns used in the rest of the analysis:
#   MemCellType: memory technology of the configuration (RRAM vs SRAM)
#   Capacity (MB): capacity of the configuration
#     (original note listed 1, 2, 4, 8, 16 MB trials; findCapacity only
#      recognizes 8/16/32/64 MB benchmark names -- TODO confirm)
#   OptimizationTarget: ReadEnergy, ReadLatency, Area, ReadEDP
#     (as listed in the original note; exact labels not verified here)

dfs.columns

Index(['MemCellType', 'CellArea (F^2)', 'CellAspectRatio', 'AccessType',
       'AccessCMOSWidth (F)', 'ResistanceOnAtSetVoltage (ohm)',
       'ResistanceOffAtSetVoltage (ohm)', 'ResistanceOnAtResetVoltage (ohm)',
       'ResistanceOffAtResetVoltage (ohm)', 'ResistanceOnAtReadVoltage (ohm)',
       'ResistanceOffAtReadVoltage (ohm)',
       'ResistanceOnAtHalfResetVoltage (ohm)', 'CapacitanceOn (F)',
       'CapacitanceOff (F)', 'ReadMode', 'ReadVoltage (V)', 'ReadPower (uW)',
       'ResetMode', 'ResetVoltage (V)', 'ResetPulse (ns)', 'ResetEnergy (pJ)',
       'SetMode', 'SetVoltage (V)', 'SetPulse (ns)', 'SetEnergy (pJ)',
       'MemoryCellInputFile', 'DesignTarget', 'DeviceRoadmap', 'LocalWireType',
       'LocalWireRepeaterType', 'LocalWireUseLowSwing', 'GlobalWireType',
       'GlobalWireRepeaterType', 'GlobalWireUseLowSwing', 'Routing',
       'InternalSensing', 'Temperature (K)', 'BufferDesignOptimization',
       'ProcessNode', 'OptimizationTarget', 'WordWidth (bit)', 'Capacit

**Q1: Which config should I use for my BFS kernel to make it as fast as possible?**

Since we are looking at just BFS, we filter the data to only include BFS benchmarks. Here those benchmarks are Facebook--BFS8MB and Wikipedia--BFS8MB. Since both of these benchmarks are for 8MB capacity, we are only able to consider configurations with 8MB capacity. After filtering the data, we calculate total latency as a sum of read latency and write latency and then plot total latency vs technology and optimization target. 

In [14]:
# Work on a copy restricted to BFS benchmarks on 8 MB configurations.
df = dfs.copy(deep=True)
is_bfs = df['Benchmark Name'].str.contains('BFS')
df = df[is_bfs]
is_8mb = df['Capacity (MB)'] == 8
df = df[is_8mb]

# Total latency is read latency plus write latency.
df['Total Latency'] = df['Total Read Latency (ms)'].add(df['Total Write Latency (ms)'])

# Values offered as toggle buttons for the bar chart.
targets = df['OptimizationTarget'].unique().tolist()
technologies = df['MemCellType'].unique().tolist()
         
def barGraph(**kwargs):
    """Draw grouped bars of mean total latency per optimization target.

    Intended as the callback of an ipywidgets ``interactive`` panel: each
    keyword is the name of a technology or optimization target and its value
    says whether that toggle is on. Reads the module-level ``df``,
    ``technologies`` and ``targets`` at call time.

    Args:
        **kwargs: Mapping of toggle name -> bool (selected or not).
    """
    selected = [name for name, is_on in kwargs.items() if is_on]
    selected_techs = [name for name in selected if name in technologies]
    selected_opt_targets = sorted(name for name in selected if name in targets)

    temp = df.groupby(by=['MemCellType', 'OptimizationTarget']).agg({'Total Latency': 'mean'}).reset_index()

    barWidth = 0.4
    r = np.arange(len(selected_opt_targets))
    fig, ax = plt.subplots(dpi=300)

    for position, tech in enumerate(selected_techs):
        tech_means = temp[temp['MemCellType'] == tech].set_index('OptimizationTarget')['Total Latency']
        # Align heights with the x positions by target name; a technology
        # with no rows for a target gets NaN instead of shifting the
        # remaining bars or raising on a length mismatch.
        heights = tech_means.reindex(selected_opt_targets).to_numpy()
        ax.bar(r + position * barWidth, heights, width=barWidth, edgecolor='white', label=tech)

    ax.set_xlabel('Optimization Target')
    # Centre each tick under its group of bars for any number of selected
    # technologies (the offset is barWidth/2 for two, 0 for one or none).
    group_offset = barWidth * max(len(selected_techs) - 1, 0) / 2
    ax.set_xticks(r + group_offset)
    ax.set_xticklabels(selected_opt_targets)
    if selected_techs:
        # Only build a legend when there is at least one labelled bar series.
        ax.legend(loc="upper right")
    plt.title("Total Latency for BFS Benchmarks")
    plt.ylabel("Average Total Latency (ms)")
    plt.show()

# One toggle button (initially on) per optimization target and per technology.
targets = {name: widgets.ToggleButton(description=name, value=True) for name in targets}
technologies = {name: widgets.ToggleButton(description=name, value=True) for name in technologies}

# interactive() re-runs barGraph whenever a toggle changes; its last child is
# the output area and the children before it are the controls.
panel = interactive(barGraph, **technologies, **targets)
*toggle_buttons, plot_output = panel.children
display(HBox(toggle_buttons, layout=Layout(flex_flow='row wrap')))
display(plot_output)


HBox(children=(ToggleButton(value=True, description='memristor'), ToggleButton(value=True, description='SRAM')…

Output()

When optimizing for Area, ReadEDP, or ReadLatency, RRAM is faster than SRAM, while SRAM is faster when optimizing for ReadDynamicEnergy. Area is the slowest optimization target by a large margin, followed by ReadDynamicEnergy, ReadEDP, and ReadLatency. It makes sense that ReadLatency and ReadEDP are the fastest optimization targets, since both optimize for latency, in whole or in part.

**Q2: What about similar bandwidth requirements?**

We define similar bandwidth requirements by the ratio of reads to writes. The read and write counts are the primary information we have about each benchmark, and the ratio is more meaningful than the raw counts because we care about the type of work being done rather than the amount: BFS can be run on either a small or a large graph. First we look at the ratios in the BFS benchmarks:

In [15]:
def _mean_read_write_ratio(frame, benchmark_name):
    """Mean of 'Read Accesses' / 'Write Accesses' over one benchmark's rows."""
    rows = frame[frame['Benchmark Name'] == benchmark_name]
    # Rows with zero write accesses are not filtered out before averaging.
    return (rows['Read Accesses'] / rows['Write Accesses']).mean()


# First: the average read/write ratio of the two BFS benchmarks.
benchmark_filter = ['Facebook--BFS8MB','Wikipedia--BFS8MB']
bfs_df = dfs[dfs['Benchmark Name'].isin(benchmark_filter)]
average_ratios = {name: _mean_read_write_ratio(bfs_df, name) for name in benchmark_filter}

# Report the BFS ratios
for benchmark, avg_ratio in average_ratios.items():
    print(f'Average ratio of read accesses to write accesses for {benchmark}: {avg_ratio}')

# Output recorded from an earlier run (kept as a no-op string):
'''
Average ratio of read accesses to write accesses for Facebook--BFS8MB: 47.72727272727274
Average ratio of read accesses to write accesses for Wikipedia--BFS8MB: 18.055555555555557
'''

# Treat a ratio between 18 and 50 as "similar" to BFS (a fairly wide range),
# and compute the same average for every benchmark in the data set.
AllBenchmarks = dfs['Benchmark Name'].unique()
for benchmark in AllBenchmarks:
    average_ratios[benchmark] = _mean_read_write_ratio(dfs, benchmark)

# Report the non-BFS benchmarks whose ratio falls strictly inside that range
for benchmark, avg_ratio in average_ratios.items():
    if benchmark in benchmark_filter:
        continue
    if 18 < avg_ratio < 50:
        print(f'Average ratio of read accesses to write accesses for {benchmark}: {avg_ratio}')


# The benchmarks reported above, so the original calculations can be repeated on them
SimilarBandwithBenchmarks = ['500.perlbench_r16MB_l2','541.leela_r16MB_l2',  '502.gcc_r16MB_l2']
bfs_similarbandwidth = dfs[dfs['Benchmark Name'].isin(SimilarBandwithBenchmarks)].copy()


Average ratio of read accesses to write accesses for Facebook--BFS8MB: 47.72727272727273
Average ratio of read accesses to write accesses for Wikipedia--BFS8MB: 18.055555555555557
Average ratio of read accesses to write accesses for 500.perlbench_r16MB_l2: 29.89689308258811
Average ratio of read accesses to write accesses for 541.leela_r16MB_l2: 18.630374174614822
Average ratio of read accesses to write accesses for 502.gcc_r16MB_l2: 28.179089278818434


Now we plot the read latency for similar benchmarks by technology, optimization target, and capacity, giving the user control over which ratios count as similar enough:

In [16]:
output_notebook()

# Work on a private copy with one row per (benchmark, technology, target).
df = dfs.copy(deep=True)  # was deep='True': a string that only worked by being truthy
df = df.drop_duplicates(subset = ['Benchmark Name', 'MemCellType', 'OptimizationTarget'])
# NOTE(review): the toggle buttons rendered for the bar chart show the
# MemCellType values 'memristor' and 'SRAM', so RRAM rows are matched by
# either label here -- confirm against the CSVs.
df["Color"] = df["MemCellType"].apply(lambda x: '#0e14b5' if x in ('RRAM', 'memristor') else '#d40820')
df['Read/Write'] = df['Read Accesses']/df['Write Accesses']
targets = df['OptimizationTarget'].unique().tolist()
techs = df['MemCellType'].unique().tolist()

# Register 1- and 2-colour palettes under Category10 so the lookup below works
# for one or two technologies. The single-colour entry needs the trailing
# comma: ('#0e14b5') is just a string, not a one-element palette.
d3['Category10'][1] = ('#0e14b5',)
d3['Category10'][2] = ('#0e14b5', '#d40820')

# Colours are assigned to technologies in order of first appearance in df.
palette = d3['Category10'][len(df['MemCellType'].unique())]
color_map = bmo.CategoricalColorMapper(factors=df['MemCellType'].unique(),
                                   palette=palette)

#Create hover tools
TOOLTIPS = [('Benchmark', '@{Benchmark Name}'), ('Read Latency', '@{Total Read Latency (ms)}')]
hvr = HoverTool(tooltips=TOOLTIPS)
hvr.renderers = []

figures = []


def scatterPlot(slider, **kwargs):
    """Show one capacity-vs-read-latency scatter plot per optimization target.

    Intended as the callback of an ipywidgets ``interactive`` panel. Reads the
    module-level ``df``, ``targets``, ``color_map`` and ``TOOLTIPS`` at call
    time.

    Args:
        slider: (min, max) pair bounding the read/write ratio; only rows whose
            ratio lies strictly between the two values are plotted.
        **kwargs: Mapping of technology name -> bool (selected or not).
    """
    figures = []
    min_val, max_val = slider
    if min_val == max_val:
        # Widen a collapsed range by one so the strict bounds below can match.
        min_val -= 1
    selected_techs = [tech for tech, selected in kwargs.items() if selected]
    for target in targets:
        # Exact match: `targets` holds the column's own unique values, so a
        # regex/substring test could only add unintended rows.
        temp = df[df['OptimizationTarget'] == target]
        temp = temp[(temp['Read/Write'] > min_val) & (temp['Read/Write'] < max_val)]
        temp = temp[temp['MemCellType'].isin(selected_techs)]
        # Cap the number of plotted points; the fixed seed keeps redraws stable.
        temp = temp.sample(n=min(1000, len(temp)), random_state=1)
        # (A per-technology linear trend line was sketched here previously but
        # never enabled.)
        source = ColumnDataSource(temp)
        fig = figure(title = target, width=300, height=300, background_fill_color="#fafafa", x_axis_label = 'Capacity (MB)')
        a= fig.scatter('Capacity (MB)', 'Total Read Latency (ms)', size=10, fill_color={'field': 'MemCellType', 'transform': color_map},
              legend_group='MemCellType', source = source)
        # A fresh hover tool per figure: appending to one shared tool made its
        # renderer list grow with every redraw of the panel.
        fig.add_tools(HoverTool(tooltips=TOOLTIPS, renderers=[a]))
        figures.append(fig)
    if figures:
        # Label the y axis once, on the left-most plot, even when only a
        # single target is present.
        figures[0].yaxis.axis_label = "Total Read latency"
        figures[0].yaxis.axis_label_text_font_size = "16pt"
    show(row(figures))

# One toggle button (initially on) per technology found in the data.
technologies = {tech_name: widgets.ToggleButton(description=tech_name, value=True) for tech_name in techs}

# Range slider choosing which read/write ratios count as "similar".
slider = widgets.IntRangeSlider(value=[18, 47], min=1, max=75, step=1, description='Read/Write:')

# interactive() re-runs scatterPlot whenever a control changes; its last child
# is the output area and the children before it are the controls.
panel = interactive(scatterPlot, slider=slider, **technologies)
*controls, plot_output = panel.children
display(HBox(controls, layout=Layout(flex_flow='row wrap')))
display(plot_output)



HBox(children=(IntRangeSlider(value=(18, 47), description='Read/Write:', max=75, min=1), ToggleButton(value=Tr…

Output()

Regardless of how the user defines similar benchmarks, we see the same results as when considering only BFS benchmarks. RRAM is generally faster except when optimizing for read energy, and the fastest optimization target is read latency, closely followed by read EDP. Area is still the slowest optimization target.