# Fig 01, 02, 10, 11

In [None]:
from data_import import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from plotting import *
device = 'cuda:0'
from tqdm import tqdm

def load_result_table_best(project="feeds/v4_dvsT"):
    """Collect one row per run of ``project`` into a DataFrame indexed by run name.

    Each row merges the run's summary values (``run.summary._json_dict``) with
    its config entries (config values win on key clashes) and adds the
    columns ``entity``, ``project`` and ``best_val_acc``.

    ``best_val_acc`` is the maximum of the ``val_acc`` history of the run; it
    is set to 0 when that history cannot be fetched or has no ``val_acc``
    column.

    Parameters
    ----------
    project : str
        Project path in the form ``<entity/project-name>``.

    Returns
    -------
    pandas.DataFrame
        One row per run, indexed by ``name``.

    Notes
    -----
    ``api``, ``tqdm`` and ``pd`` are taken from the notebook namespace
    (``api`` is presumably a W&B API client provided by a star import --
    TODO confirm). One history request is issued per run, so this is slow for
    large projects.
    """
    runs = api.runs(project)

    summary_list = []
    for run in tqdm(runs):
        try:
            history = run.history(keys=['val_acc'])
            best = history['val_acc'].max()
        except Exception:
            # Best effort: a run without a usable val_acc history counts as 0.
            # Only Exception is caught so that Ctrl-C (KeyboardInterrupt) can
            # still abort this long-running loop.
            best = 0

        # ._json_dict is used on the summary to omit large files.
        res = {
            **run.summary._json_dict,
            **{k: v for k, v in run.config.items()},
            'name': run.name,
            'entity': run.entity,
            'project': run.project,
            'best_val_acc': best,
        }
        summary_list.append(res)

    df = pd.DataFrame(summary_list)
    return df.set_index('name')

This will take forever because the data loading is inefficient, but we only do it once.

In [None]:
# Download the run table of the T=32 phase-diagram project and keep only the
# runs whose `epoch` value equals 499.
df_orig = (
    load_result_table_best('feeds/phase_diagram_T32')
    .reset_index()
    .loc[lambda runs: runs['epoch'] == 499]
)

In [None]:
# Directory the figures below are saved into (combined with file names via `/`).
# FIGURE_DIR is not defined in this notebook; presumably it comes from one of
# the star imports at the top -- TODO confirm which module provides it.
figure_dir = FIGURE_DIR

In [None]:
# Restrict the runs to a fixed set of hidden sizes `p` and embedding
# dimensions `model_dim`.
allowed_p = [1,2,4,8,16,32,64,128,3, 6, 11, 23, 45, 91]
allowed_model_dim = [1,2,4,8,16,32,64,128,3, 6, 11, 23, 45, 91]

p_is_allowed = df_orig['p'].isin(allowed_p)
model_dim_is_allowed = df_orig['model_dim'].isin(allowed_model_dim)
df_orig = df_orig[p_is_allowed & model_dim_is_allowed]

# Cell output: the model_dim and p values that are actually present.
df_orig.model_dim.unique(), df_orig.p.unique()

In [None]:
# Aggregate the numeric columns over all runs that share the same values in
# CONFIG_COLS: mean, standard deviation and maximum per configuration.
runs_by_config = df_orig.groupby(CONFIG_COLS)
df = runs_by_config.mean(numeric_only=True).reset_index()
df_std = runs_by_config.std(numeric_only=True).reset_index()
df_max = runs_by_config.max(numeric_only=True).reset_index()

In [None]:
import math

def bound1(T,L):
    """Return ``ceil(T*s / (T-1+s))`` with ``s = (2*L-3)**2``.

    Bound used for the lin and lin_sftm models.
    """
    s = (2*L - 3)**2
    ratio = T*s / (T - 1 + s)
    return math.ceil(ratio)

def bound2(T,L):
    """Return ``bound1(T, L) + 1``.

    Bound used for the dot and bos models with p=1.
    """
    base = bound1(T, L)
    return base + 1

def bound3(T,L):
    """Return ``ceil(T*w / (T-1+w))`` with ``w = 2*L-3``.

    Bound used for the dot and bos models with p=T (no ``+ 1`` offset applied).
    """
    w = 2*L - 3
    ratio = T*w / (T - 1 + w)
    return math.ceil(ratio)

In [None]:
# Phase diagram for T=32, L=10. The background colour of each panel is the mean
# `val_acc` per (p, model_dim) cell; white markers flag cells whose maximum
# `best_val_acc` over the grouped runs is >= 100 ('*') or >= 99 ('.').

# Hoisted out of the model loop, where it was re-executed on every iteration.
# Only needed if cells are outlined with patches.Rectangle instead of markers.
import matplotlib.patches as patches

T = 32
L = 10
k = df[(df['T'] == T) & (df.seq_len == L)]
k_std = df_std[(df_std['T'] == T) & (df_std.seq_len == L)]
# Build the row mask from df_max itself. It used to be built from df_std,
# which only selected the right rows because both frames share the same
# row order after groupby(...).reset_index().
k_max = df_max[(df_max['T'] == T) & (df_max.seq_len == L)]

fig, axes = plt.subplots(2,3,figsize=(12,6),sharex=True,sharey=True)

# (row, column) of the panel of each model: row 0 holds the softmax variants,
# row 1 the variants without softmax.
axes_assignment = {
        'linear': (1,2),
        'linear+sftm': (0,2),
        'dot': (1,1),
        'dot+sftm': (0,1),
        'dotBOS' : (1,0),
        'dotBOS+sftm': (0,0),
}

axes[axes_assignment['linear+sftm']].set_title('[lin]\nlinear mixing',fontsize=9)
axes[axes_assignment['dot+sftm']].set_title('[dot]\ndot-product attention',fontsize=9)
axes[axes_assignment['dotBOS+sftm']].set_title('[bos]\ndot-product attention & BOS token',fontsize=9)

axes[1,0].set_ylabel('no softmax',fontsize=9)
axes[0,0].set_ylabel('with softmax',fontsize=9)
axes[1,1].set_xlabel('hidden layer size $p$',fontsize=12)


for name, model_config in MODELS.items():
    ax = axes[axes_assignment[name]]
    a = k[(k.attention_input == model_config['attention_input'])
            & (k.no_softmax == model_config['no_softmax'])
            & (k.dataset_type == model_config['dataset'])]
    a_max = k_max[(k_max.attention_input == model_config['attention_input'])
                & (k_max.no_softmax == model_config['no_softmax'])
                & (k_max.dataset_type == model_config['dataset'])]

    # No aggregated runs for this model at (T, L): leave its panel empty.
    if len(a) == 0:
        continue

    aother_mean = pd.pivot_table(a, index='p', columns='model_dim', values='val_acc')
    aother_max = pd.pivot_table(a_max, index='p', columns='model_dim', values='best_val_acc')

    # Coordinates are shifted by +1 before plotting on the log-log axes.
    smap = ax.pcolor(aother_mean.index+1, aother_mean.columns+1, aother_mean.values.T, cmap='Spectral', vmin=0, vmax=100)

    x = aother_max.index
    y = aother_max.columns
    Z = aother_max.values.T
    # The marker for Z[j, i] is drawn at (x[i+1]+1, y[j+1]+1) and the last
    # row/column are skipped.
    # NOTE(review): this looks tailored to pcolor's 'flat' shading, where cell
    # (i, j) spans coordinates i..i+1 -- confirm against the default shading
    # of the installed matplotlib version.
    for i in range(len(x)-1):
        for j in range(len(y)-1):
            if Z[j, i] >= 100.0:
                ax.scatter(x[i+1]+1,y[j+1]+1,s=20,marker='*',c='white')
            elif Z[j, i] >= 99.0:
                ax.scatter(x[i+1]+1,y[j+1]+1,s=10,marker='.',c='white')

    # White guide lines at p = T and d = T.
    ax.axvline(T,c='white')
    ax.axhline(T,c='white')
    ax.set_xscale('log')
    ax.set_yscale('log')

    # The panels of the linear models get a secondary y-axis whose only tick
    # is labelled 'T'.
    if 'linear' in name:
        ax2 = ax.twinx()
        ax2.set_yscale('log')
        ax2.set_ylim(ax.get_ylim())
        ax2.set_yticks([T])
        ax2.set_yticklabels(['T'])
fig.text(0.06, 0.5, 'embedding dimension $d$', va='center', rotation='vertical',fontsize=12)
# The colorbar uses the mappable of the last panel drawn; all panels share
# cmap, vmin and vmax.
fig.colorbar(smap,ax=axes,location='right',shrink=0.5,label='mean accuracy [%]')
plt.savefig(figure_dir / 'memorizing_counts-phase-diagram-T32-mean-best-marked.png',bbox_inches='tight')

"""
axes[axes_assignment['linear+sftm']].axhline(bound1(T,L),c='white',ls='--',label='bound 1')
axes[axes_assignment['linear']].axhline(bound1(T,L),c='white',ls='--',label='bound 1')
axes[axes_assignment['dot']].axhline(bound3(T,L),c='white',ls='--',label='bound 1')
axes[axes_assignment['dot+sftm']].axhline(bound3(T,L),c='white',ls='--',label='bound 1')
axes[axes_assignment['dotBOS']].axhline(bound3(T,L),c='white',ls='--',label='bound 1')
axes[axes_assignment['dotBOS']].axhline(bound2(T,L),c='white',ls='--',label='bound 1')
axes[axes_assignment['dotBOS+sftm']].axhline(2,c='white',ls='--',label='bound 1')
"""

plt.show()

In [None]:
# Phase diagram of the mean `val_acc` per (p, model_dim) cell for T=32, L=10,
# one panel per model.
T = 32
L = 10
k = df[(df['T'] == T) & (df.seq_len == L)]
k_std = df_std[(df_std['T'] == T) & (df_std.seq_len == L)]

fig, axes = plt.subplots(2, 3, figsize=(12, 6), sharex=True, sharey=True)

# (row, column) of each model's panel: softmax variants in row 0, the
# variants without softmax in row 1.
axes_assignment = {
    'dotBOS+sftm': (0, 0),
    'dot+sftm': (0, 1),
    'linear+sftm': (0, 2),
    'dotBOS': (1, 0),
    'dot': (1, 1),
    'linear': (1, 2),
}

# Column titles are attached to the top-row panels.
column_titles = {
    'linear+sftm': '[lin]\nlinear mixing',
    'dot+sftm': '[dot]\ndot-product attention',
    'dotBOS+sftm': '[bos]\ndot-product attention & BOS token',
}
for titled_model, title_text in column_titles.items():
    axes[axes_assignment[titled_model]].set_title(title_text, fontsize=9)

axes[1, 0].set_ylabel('no softmax', fontsize=9)
axes[0, 0].set_ylabel('with softmax', fontsize=9)
axes[1, 1].set_xlabel('hidden layer size $p$', fontsize=12)

for name, model_config in MODELS.items():
    ax = axes[axes_assignment[name]]

    # Rows of the mean / std tables that belong to this model.
    in_mean_table = (
        (k.attention_input == model_config['attention_input'])
        & (k.no_softmax == model_config['no_softmax'])
        & (k.dataset_type == model_config['dataset'])
    )
    a = k[in_mean_table]
    in_std_table = (
        (k_std.attention_input == model_config['attention_input'])
        & (k_std.no_softmax == model_config['no_softmax'])
        & (k_std.dataset_type == model_config['dataset'])
    )
    a_std = k_std[in_std_table]

    # Nothing to draw for this model.
    if not len(a):
        continue

    aother = pd.pivot_table(a, index='p', columns='model_dim', values='val_acc')
    # Coordinates are shifted by +1 before plotting on the log-log axes.
    smap = ax.pcolor(
        aother.index + 1,
        aother.columns + 1,
        aother.values.T,
        cmap='Spectral',
        vmin=0,
        vmax=100,
    )

    # White guide lines at p = T and d = T.
    ax.axvline(T, c='white')
    ax.axhline(T, c='white')
    ax.set_xscale('log')
    ax.set_yscale('log')

    # Linear-model panels get a secondary y-axis with a single tick labelled 'T'.
    if 'linear' in name:
        ax2 = ax.twinx()
        ax2.set_yscale('log')
        ax2.set_ylim(ax.get_ylim())
        ax2.set_yticks([T])
        ax2.set_yticklabels(['T'])

fig.text(0.06, 0.5, 'embedding dimension $d$', va='center', rotation='vertical', fontsize=12)
# The colorbar is built from the last panel drawn; all panels share cmap and limits.
fig.colorbar(smap, ax=axes, location='right', shrink=0.5, label='mean accuracy [%]')
plt.savefig(figure_dir / 'memorizing_counts-phase-diagram-T32-avg.png', bbox_inches='tight')
plt.show()

In [None]:
# Phase diagram for T=32, L=10 showing, per (p, model_dim) cell, the maximum
# `best_val_acc` over the grouped runs, one panel per model.
T = 32
L = 10
k = df[(df['T'] == T) & (df.seq_len == L)]
k_std = df_std[(df_std['T'] == T) & (df_std.seq_len == L)]
# Build the row mask from df_max itself. It used to be built from df_std,
# which only selected the right rows because both frames share the same
# row order after groupby(...).reset_index().
k_max = df_max[(df_max['T'] == T) & (df_max.seq_len == L)]

fig, axes = plt.subplots(2,3,figsize=(12,6),sharex=True,sharey=True)

# (row, column) of the panel of each model: row 0 holds the softmax variants,
# row 1 the variants without softmax.
axes_assignment = {
        'linear': (1,2),
        'linear+sftm': (0,2),
        'dot': (1,1),
        'dot+sftm': (0,1),
        'dotBOS' : (1,0),
        'dotBOS+sftm': (0,0),
}

axes[axes_assignment['linear+sftm']].set_title('[lin]\nlinear mixing',fontsize=9)
axes[axes_assignment['dot+sftm']].set_title('[dot]\ndot-product attention',fontsize=9)
axes[axes_assignment['dotBOS+sftm']].set_title('[bos]\ndot-product attention & BOS token',fontsize=9)

axes[1,0].set_ylabel('no softmax',fontsize=9)
axes[0,0].set_ylabel('with softmax',fontsize=9)
axes[1,1].set_xlabel('hidden layer size $p$',fontsize=12)


for name, model_config in MODELS.items():
    ax = axes[axes_assignment[name]]
    # `a` (mean table) is only used to decide whether this model has any data.
    a = k[(k.attention_input == model_config['attention_input'])
            & (k.no_softmax == model_config['no_softmax'])
            & (k.dataset_type == model_config['dataset'])]
    a_max = k_max[(k_max.attention_input == model_config['attention_input'])
                & (k_max.no_softmax == model_config['no_softmax'])
                & (k_max.dataset_type == model_config['dataset'])]

    # No aggregated runs for this model at (T, L): leave its panel empty.
    if len(a) == 0:
        continue

    aother_max = pd.pivot_table(a_max, index='p', columns='model_dim', values='best_val_acc')

    # Coordinates are shifted by +1 before plotting on the log-log axes.
    smap = ax.pcolor(aother_max.index+1, aother_max.columns+1, aother_max.values.T, cmap='Spectral', vmin=0, vmax=100)

    # White guide lines at p = T and d = T.
    ax.axvline(T,c='white')
    ax.axhline(T,c='white')
    ax.set_xscale('log')
    ax.set_yscale('log')

    # The panels of the linear models get a secondary y-axis whose only tick
    # is labelled 'T'.
    if 'linear' in name:
        ax2 = ax.twinx()
        ax2.set_yscale('log')
        ax2.set_ylim(ax.get_ylim())
        ax2.set_yticks([T])
        ax2.set_yticklabels(['T'])
fig.text(0.06, 0.5, 'embedding dimension $d$', va='center', rotation='vertical',fontsize=12)
# The colorbar uses the mappable of the last panel drawn; all panels share
# cmap, vmin and vmax.
fig.colorbar(smap,ax=axes,location='right',shrink=0.5,label='best out of 5 accuracy [%]')
plt.savefig(figure_dir / 'memorizing_counts-phase-diagram-T32-best.png',bbox_inches='tight')
plt.show()

In [None]:
# Phase diagram of the standard deviation of `val_acc` per (p, model_dim)
# cell for T=32, L=10, one panel per model.
T = 32
L = 10
k = df[(df['T'] == T) & (df.seq_len == L)]
k_std = df_std[(df_std['T'] == T) & (df_std.seq_len == L)]

fig, axes = plt.subplots(2, 3, figsize=(12, 6), sharex=True, sharey=True)

# (row, column) of each model's panel: softmax variants in row 0, the
# variants without softmax in row 1.
axes_assignment = {
    'dotBOS+sftm': (0, 0),
    'dot+sftm': (0, 1),
    'linear+sftm': (0, 2),
    'dotBOS': (1, 0),
    'dot': (1, 1),
    'linear': (1, 2),
}

# Column titles are attached to the top-row panels.
column_titles = {
    'linear+sftm': '[lin]\nlinear mixing',
    'dot+sftm': '[dot]\ndot-product attention',
    'dotBOS+sftm': '[bos]\ndot-product attention & BOS token',
}
for titled_model, title_text in column_titles.items():
    axes[axes_assignment[titled_model]].set_title(title_text, fontsize=9)

axes[1, 0].set_ylabel('no softmax', fontsize=9)
axes[0, 0].set_ylabel('with softmax', fontsize=9)
axes[1, 1].set_xlabel('hidden layer size $p$', fontsize=12)

for name, model_config in MODELS.items():
    ax = axes[axes_assignment[name]]

    # Rows of the mean / std tables that belong to this model.
    in_mean_table = (
        (k.attention_input == model_config['attention_input'])
        & (k.no_softmax == model_config['no_softmax'])
        & (k.dataset_type == model_config['dataset'])
    )
    a = k[in_mean_table]
    in_std_table = (
        (k_std.attention_input == model_config['attention_input'])
        & (k_std.no_softmax == model_config['no_softmax'])
        & (k_std.dataset_type == model_config['dataset'])
    )
    a_std = k_std[in_std_table]

    # The mean table decides whether there is anything to draw for this model.
    if not len(a):
        continue

    aother = pd.pivot_table(a_std, index='p', columns='model_dim', values='val_acc')
    # Unlike the accuracy diagrams, the coordinates are used without a +1 shift.
    smap = ax.pcolor(
        aother.index,
        aother.columns,
        aother.values.T,
        cmap='Oranges',
        vmin=0,
        vmax=40,
    )

    # White guide lines at p = T and d = T.
    ax.axvline(T, c='white')
    ax.axhline(T, c='white')
    ax.set_xscale('log')
    ax.set_yscale('log')

    # Linear-model panels get a secondary y-axis with a single tick labelled 'T'.
    if 'linear' in name:
        ax2 = ax.twinx()
        ax2.set_yscale('log')
        ax2.set_ylim(ax.get_ylim())
        ax2.set_yticks([T])
        ax2.set_yticklabels(['T'])

fig.text(0.06, 0.5, 'embedding dimension $d$', va='center', rotation='vertical', fontsize=12)
# The colorbar is built from the last panel drawn; all panels share cmap and limits.
fig.colorbar(smap, ax=axes, location='right', shrink=0.5, label='std. of accuracy over 5 runs')
plt.savefig(figure_dir / 'memorizing_counts-phase-diagram-T32-std.png', bbox_inches='tight')
plt.show()

In [None]:
# Accuracy versus parameter count for every individual run at T=32, L=10.
# Left panel: models whose name contains '+' (softmax variants); right panel:
# all other models. Each model also gets a line with the running maximum of
# val_acc over runs sorted by increasing parameter count.
T = 32
L = 10
k = df_orig[(df_orig['T'] == T) & (df_orig.seq_len == L) ]

fig, axes = plt.subplots(1,2,figsize=(7,3),sharex=True,sharey=True)

sftm_marker = '*'
other_marker = '.'
lin_col = 'blue'
dot_col = 'red'
bos_col = 'orange'
for name, model_config in MODELS.items():
    a = k[(k.attention_input == model_config['attention_input'])
            & (k.no_softmax == model_config['no_softmax'])
            & (k.dataset_type == model_config['dataset'])]

    # 'BOS' is tested first because the 'dotBOS' names also contain 'dot'.
    if 'BOS' in name:
        color = bos_col
    elif 'dot' in name:
        color = dot_col
    elif 'linear' in name:
        color = lin_col
    else:
        # Unknown model family: use a neutral colour instead of silently
        # reusing the colour of the previous iteration.
        color = 'gray'

    ax = axes[0] if '+' in name else axes[1]
    ax.scatter(a.params_count,a.val_acc,color=color,alpha=0.1,s=10)
    b = a.sort_values('params_count')
    # Running maximum of the accuracy along increasing parameter count.
    ax.plot(b.params_count,b.val_acc.cummax(),color=color,lw=2)

# The panels share both axes, so scale and limits set on one apply to both.
axes[0].set_xscale('log')

# Empty proxy artists that only provide legend entries. They are all drawn on
# the right panel, together with the legend, so that no entry is lost when the
# last model plotted happens to live in the left panel.
legend_ax = axes[1]
legend_ax.plot([],[],color='black',lw=2,label='running best')
legend_ax.scatter([],[],marker="s",label='lin',color=lin_col)
legend_ax.scatter([],[],marker="s",label='dot',color=dot_col)
legend_ax.scatter([],[],marker="s",label='bos',color=bos_col)
legend_ax.legend()

axes[0].set_xlabel('number of parameters')
axes[0].set_title('with softmax')
axes[1].set_title('no softmax')
axes[1].set_xlabel('number of parameters')
axes[0].set_ylabel('accuracy [%]')
fig.suptitle('T=32')
axes[0].set_ylim(15,105)
plt.savefig(figure_dir / 'memorizing_counts-params-T32.png',bbox_inches='tight')