# Notebook to process and clean csv's

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import csv

In [3]:
!pwd

/c/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/scripts


# Performance Predictions for mnist, cifar10 and imagenet - Heatmaps
## Read csv > process data > save to another csv

In [23]:
def dataframe_contains(input_df: pd.DataFrame, column: str, value: str)->pd.DataFrame:
    """
    Given a dataframe, this function returns a subset of that dataframe by column
    
    Parameters
    ----------
    input_df : pd.DataFrame
        Inpput dataframe from which the subset will be taken.       
    column : str
        Column name by which the subsetting will be taken.  
    value : str
        String that contains what we need from the column. 
        Eg.:'dog'
            'banana|apple|peach'

    Returns
    -------
    output_df:  pd.DataFrame
        This dataframe will be a subset from the input dataframe according to the column value given.  
        
    """
    output_df = input_df[input_df[column].str.contains(value)]
    return output_df

In [31]:
def performance_predictions(path_csv: str):
    """
    This function 
        -reads the csv file;
        -computes the PERFORMANCE PREDICTIONS - HEATMAPS for the input csv file;
        -separates the dataframe into cifar 10, mnist and imagenet dataframes;
        -saves all three dataframes into separate csv files.
    
    Parameters
    ----------
    path_csv : str
        This is a path to the csv file that will be read by this function 

    Returns
    -------

    """
    ## Reading csv file and converting data to (Neural network, Platform, Value)
    df = pd.read_csv(path_csv)

    #----- Creating a dataframe with 3 columns x, y gop_frame
    cleanedList = [x for x in df.platform if x==x] # to take all the nans out
    x, y = np.meshgrid(df.model, cleanedList) 
    gop_frame, _ = np.meshgrid(df.gop_frame, cleanedList)

    #to crate a 1D array from each variable, creating a dataframe with 3 columns
    source = pd.DataFrame({'x': x.ravel(),     
                           'y': y.ravel(),
                           'gop_frame':gop_frame.ravel()}) #auxilary column

    #---Adding a fourth column: top_second  ---- auxilary column
    tops_second= []    #creating a lsit which will contain all top_second columns from the dataframe
    columns = list(df) # creating a list of dataframe columns 

    for i in columns:   
        if 'top_second' in i:
            tops_second.append(df[i])

    source['top_second'] = pd.concat(tops_second,ignore_index=True)

    #------Adding a fith column: values-----------
    source['values'] = source.top_second * 1000 / source.gop_frame

    #---Drop auxilary columns: gop_frame top_scond----
    source = source.drop(columns=['gop_frame','top_second'])
    source = source.round(0)

    #Separate dataframe into: IMAGENET, MNIST, CIFAR10 dataframes
    df_imagenet = dataframe_contains(input_df=source, column='x', value='GoogleNetv|MobileNetv1|ResNet50|EfficientNet')
    df_cifar10 = dataframe_contains(input_df=source, column='x', value='CNV')
    df_mnist = dataframe_contains(input_df=source, column='x', value='MLP')

    #tasks = ['imagenet', 'cifar-10','mnist']
    #path_imagenet = path + '/performance_prediction_' + tasks[0] + '.csv'

    #Saving above dataframes to csv file
    df_imagenet.to_csv('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/processed_csv/performance_prediction_imagenet.csv', index = False)
    df_cifar10.to_csv('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/processed_csv/performance_prediction_cifar10.csv', index = False)
    df_mnist.to_csv('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/processed_csv/performance_prediction_mnist.csv', index = False)
    source.to_csv('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/processed_csv/performance_prediction_imagenet_mnist_cifar10.csv', index = False)

In [32]:
performance_predictions('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/performance_predictions_imagenet_mnist_cifar.csv')

In [24]:
#hide
## Reading csv file and converting data to (Neural network, Platform, Value)
df = pd.read_csv('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/performance_predictions_imagenet_mnist_cifar.csv')

#----- Creating a dataframe with 3 columns x, y gop_frame
cleanedList = [x for x in df.platform if x==x] # to take all the nans out
x, y = np.meshgrid(df.model, cleanedList) 
gop_frame, _ = np.meshgrid(df.gop_frame, cleanedList)

#to crate a 1D array from each variable, creating a dataframe with 3 columns
source = pd.DataFrame({'x': x.ravel(),     
                       'y': y.ravel(),
                       'gop_frame':gop_frame.ravel()}) #auxilary column

#---Adding a fourth column: top_second  ---- auxilary column
tops_second= []    #creating a lsit which will contain all top_second columns from the dataframe
columns = list(df) # creating a list of dataframe columns 
  
for i in columns:   
    if 'top_second' in i:
        tops_second.append(df[i])
        
source['top_second'] = pd.concat(tops_second,ignore_index=True)

#------Adding a fith column: values-----------
source['values'] = source.top_second * 1000 / source.gop_frame

#---Drop auxilary columns: gop_frame top_scond----
source = source.drop(columns=['gop_frame','top_second'])
source = source.round(0)

#Separate dataframe into: IMAGENET, MNIST, CIFAR10 dataframes
df_imagenet = dataframe_contains(input_df=source, column='x', value='GoogleNetv|MobileNetv1|ResNet50|EfficientNet')
df_cifar10 = dataframe_contains(input_df=source, column='x', value='CNV')
df_MNIST = dataframe_contains(input_df=source, column='x', value='MLP')

#Saving above dataframes to csv file
df_imagenet.to_csv('data/processed_csv/performance_prediction_imagenet.csv', index = False)
df_cifar10.to_csv('data/processed_csv/performance_prediction_cifar10.csv', index = False)
df_MNIST.to_csv('data/processed_csv/performance_prediction_mnist.csv', index = False)
source.to_csv('data/processed_csv/performance_prediction_imagenet_mnist_cifar10.csv', index = False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed_csv/performance_prediction_imagenet.csv'

In [16]:
df_imagenet.head()

Unnamed: 0,x,y,values
0,GoogleNetv1-INT8,Ultra96-INT8,307.0
1,GoogleNetv1-FP16,Ultra96-INT8,
2,GoogleNetv1-FP32,Ultra96-INT8,
3,MobileNetv1-INT8,Ultra96-INT8,842.0
4,ResNet50 100%-INT8,Ultra96-INT8,124.0


# --------------------------------------------------
# --------------------------------------------------
# --------------------Rooflines----------------

In [7]:
## Loading Hardware platforms and Neural networks csv
df = pd.read_csv('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/peakPerfBandHardPlatf.csv', sep=',')
df_topology=pd.read_csv('c:/Users/alinav/Documents/GitHub/Qutibench_Web/_notebooks/data/topology_details.csv',)

In [8]:
## Calculate the Arithmetic intensity (x axis) for each NN based on Fwd ops and Total params
i=0.1
n_bytes=1 
calc_arith = lambda operations, params, n_bytes: operations/(params*n_bytes)

for index, row in df_topology.iterrows():             #nditer is a iterator object    
    arith_intens = calc_arith(row['Fwd Ops'], row['Total Params'], n_bytes)   #calculate the arith intensity with the lambda function
    df_topology.at[index, 'arith_intens'] = arith_intens              #saving it to the dataframe
    
#to duplicate the dataframe so each row with (Platform, arith_intens) will be filled with 100 and then 0s to plot the vertical line later    
df_topology = pd.concat([df_topology, df_topology])
df_topology = pd.concat([df_topology, df_topology])
df_topology = df_topology.drop(columns=['Total Params','Fwd Ops']) #deleting unnecessary columns (Fwd ops and Total params)

In [9]:
## Preparing the NNs dataset to be ploted as vertical lines later
# creating a y list [100,100,100,100....0.0001,0.0001,0.0001...] to plot a vertical line later
df_topology['performance'] = [100] * round((len(df_topology.index))/4)  +  [25] * round((len(df_topology.index))/4) + [75] * round((len(df_topology.index))/4) +[0.000001] * round((len(df_topology.index))/4) 

In [11]:
df_topology.head(50)

Unnamed: 0,Name,arith_intens,performance
0,MobileNet V1,23878,100.0
1,AlexNet,2995,100.0
2,GoogLeNet V1,29988,100.0
3,ResNet-18,39950,100.0
4,ResNet-34,43169,100.0
5,VGG16_BN,7183,100.0
6,ResNet-101,22671,100.0
7,ResNet-34-SSD,51070,100.0
8,ResNet-50,41475,100.0
9,ResNet-152,24587,100.0


In [12]:
## Calculating the rooflines (y axis) for each hardware platform (dataframe = df_topology + df)
#--------------------------------Calculating the values to plot for the roofline model-----------
maxX=160000
x_axis = np.arange(0.1,maxX,1) #to create a list that represents the x axis with numbers between 0 and 1000
dataframe = pd.DataFrame(columns=['Name','arith_intens','performance']) 

for index, row in df.iterrows():             #nditer is a iterator object 
    dataframe = dataframe.append([pd.Series([df.at[index,'Name'],1,row['Bandwidth'] ],dataframe.columns)], ignore_index=True)
    for i in np.nditer(x_axis):
        point = row['Bandwidth'] * i
        if point > row['Peak_Performance']:
            dataframe = dataframe.append([pd.Series([df.at[index,'Name'],i,row['Peak_Performance']],dataframe.columns)], ignore_index=True)
            dataframe = dataframe.append([pd.Series([df.at[index,'Name'],maxX, df.at[index,'Peak_Performance']],dataframe.columns)], ignore_index=True)
            break

In [13]:
## Merging NNs dataset with Hardware Platforms dataset
dataframe = pd.concat([dataframe,df_topology])

In [14]:
dataframe.to_csv('data/processed_csv/rooflines_hardware_neural_networks.csv', index = False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed_csv/rooflines_hardware_neural_networks.csv'

# ---------------Rooflines-------------------------
# -------------------------------------------------------
# -------------------------------------------------------