In [1]:
from result_records import TFRecordLoader

In [2]:
ds = TFRecordLoader('memorization_results.tfrecords')

2021-10-09 16:05:22.405734: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Loading Data 
> consists of 4063300 records

In [3]:
data = []
indicies = []
import numpy as np
from tqdm import tqdm
for i,(res,idx) in tqdm(enumerate(ds)):
    res,idx = res.numpy(),idx.numpy()
    if(not (np.isnan(res) or np.isinf(res))):
        data.append(res)
        indicies.append(idx)
data = np.array(data)
indicies = np.array(indicies)

2021-10-09 16:05:22.491630: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
4063300it [05:08, 13183.97it/s]


# Memorization Metric plots
> Plotting average values of memorization metric over a bucketed range of values

In [4]:
from IPython.display import display
import matplotlib.pyplot as plt
import ipywidgets as widgets
%matplotlib inline

import numpy as np

class Plotter:
    def __init__(self,title,xlabel,ylabel,y,x=None,size=25,default_slider_value=None):
        self.title = title
        self.xlabel = xlabel
        self.ylabel = ylabel
        self.default_slider_value = default_slider_value
        self.y = y
        self.x = x
        if(x is None):
            self.x = [i for i in range(len(data))]
        
        self.size = 25
        self.params = {'legend.fontsize': 'large',
          'figure.figsize': (15,5),
          'axes.labelsize': size,
          'axes.titlesize': size,
          'xtick.labelsize': size*0.75,
          'ytick.labelsize': size*0.75,
          'axes.titlepad': 25,
          'font.family':'sans-serif',
          'font.weight':'bold',
          'text.color':'aqua'
         }
    
    def plot_data(self,scale):
        scale = 2**scale  #Converting log scale to normal scale
        buckets = []
        length = len(self.y)
        bucket_size = length//scale
        index = []

    
        for i in range(0,length,bucket_size):
            buckets.append(self.y[i:i+bucket_size].mean())
            index.append(self.x[min(i+bucket_size-1,len(indicies)-1)])

        plt.plot(index,buckets)
        plt.rcParams.update(self.params)
        plt.title(self.title)
        plt.xlabel(self.xlabel)
        plt.ylabel(self.ylabel)
        plt.show()
    
    def clicked(self,b):
        self.out.clear_output()
        scale = self.slider.value
        with self.out:
            self.plot_data(scale)
    def run(self):
        self.out = widgets.Output()
        button = widgets.Button(description="Plot Value")
        slider_max = int(np.log2(len(self.y)))
        
        if(self.default_slider_value is not None):
            default_slider_value = self.default_slider_value
        else:
            default_slider_value = np.random.choice([i for i in range(1,slider_max)])
        self.slider = widgets.IntSlider(min=1, max=slider_max, 
                                   value=default_slider_value,
                                   description="Scale",
                                   layout=widgets.Layout(width='50%'))

        box_layout = widgets.Layout(
            display='flex',
            flex_flow='column',
            align_items='center',
            width='80%'
        )

        box = widgets.VBox(
            [
                self.out,
                self.slider,
                button
            ],
            layout=box_layout
        )

        

        with self.out:    
            self.plot_data(default_slider_value)
        button.on_click(self.clicked)
        display(box)
    
plotter = Plotter(title="Memorization Metric",
                  xlabel='Index',ylabel='NLL Loss',
                  x=indicies,y=data)
plotter.run()

VBox(children=(Output(), IntSlider(value=12, description='Scale', layout=Layout(width='50%'), max=21, min=1), …

# Correlation

In [8]:
from scipy import signal

correlation = signal.correlate(indicies, data, mode="full")
plotter = Plotter(xlabel='indicies',ylabel='correlation',
                  title='Correlation',x=indicies,y=correlation,default_slider_value=11)
plotter.run()

VBox(children=(Output(), IntSlider(value=11, description='Scale', layout=Layout(width='50%'), max=22, min=1), …

# Statistics

In [11]:
import matplotlib.pyplot as plt

SAMPLE_VALUE = len(data)*25//100
from sklearn.metrics import r2_score
r2 = r2_score(indicies,data)
print(f"R2 Score between indicies and data: {r2:.5f}")
avg_start = data[:SAMPLE_VALUE].mean()
avg_end = data[SAMPLE_VALUE:].mean()

var_start = data[:SAMPLE_VALUE].var()
var_end = data[SAMPLE_VALUE:].var()
print(f"Average NLL Loss changed from {avg_start:.5f} to {avg_end:.5f}")
print(f"Varience of  NLL Loss changed from {var_start:.5f} to {var_end:.5f}")
print("Trend of very slight improvement continues")

R2 Score between indicies and data: -3.00022
Average NLL Loss changed from -10.01421 to -10.00763
Varience of  NLL Loss changed from 35.04649 to 34.87698
Trend of very slight improvement continues
