# Imports

In [1]:
from spider_net.net import *
from spider_net.trainers import *
from spider_net.data_loaders import *
from spider_net.helpers import *
import numpy as np
import pandas as pd
import random
import copy
import spider_net.lrc as lrc
import spider_net.ntk as ntk
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
plt.style.use('material')
np.set_printoptions(suppress=True)
torch.set_printoptions(sci_mode=False)

# Helper Funcs

In [2]:
def kaiming_normal_fanin_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
        if hasattr(m, 'bias') and m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.ones_(m.weight.data)
        nn.init.constant_(m.bias.data, 0.0)


def kaiming_normal_fanout_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if hasattr(m, 'bias') and m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.ones_(m.weight.data)
        nn.init.constant_(m.bias.data, 0.0)


def init_model(model, method='kaiming_norm_fanin'):
    if method == 'kaiming_norm_fanin':
        model.apply(kaiming_normal_fanin_init)
    elif method == 'kaiming_norm_fanout':
        model.apply(kaiming_normal_fanout_init)
        
class bcolors:
    GREEN = '\033[92m'
    RED = '\033[91m'
    ENDC = '\033[0m'
    
def red(x):
    return bcolors.RED + x + bcolors.ENDC
    
def green(x):
    return bcolors.GREEN + x + bcolors.ENDC

# Configuration

In [3]:
hypers = {
    'gpu_space':20,
    'dataset':{'name':'CIFAR10', 'classes':10},
    'batch_size':64,
    'scale': 64,
    'reductions':2,
    'lr_schedule': {'lr_max': .01, 'T': 4},
    'drop_prob':.2,
    'prune_interval':4,
    'prune':True,
    'mut_thresh':.5,
    'n_mutations':3,
    'device':'cuda'
}

data, input_dim = load_data(hypers['batch_size'], hypers['dataset']['name'])
hypers['input_dim'] = input_dim
hypers['mutate'] = True

# Train-Free Metrics

In [4]:
sample_batches = 3
n_ntk_batches = 3
n_repeats = 3

def compare_models(modelA, modelB, modelA_thin, modelB_thin, data, metrics, n_repeats, origB=False):
    LRC = lrc.Linear_Region_Collector([modelA, modelB], input_size=[1000,1,3,3], sample_batch=sample_batches)
    modelA.cuda()
    modelB.cuda()
    ntks, lrcs = [], []
           
    if 'ntk' in metrics:
        for _ in range(n_repeats):     
            init_model(modelA, 'kaiming_norm_fanout')
            for param_ori, param in zip(modelA.parameters(), modelB.parameters()):
                param.data.copy_(param_ori.data)
            ntkA, ntkB = ntk.get_ntk_n(data, [modelA, modelB], train_mode=True, recalbn=0, num_batch=n_ntk_batches)
            norm = (ntkA-ntkB)/ntkB
            ntks.append(norm)

    modelA.zero_grad()
    modelB.zero_grad()
    ######################
    modelA_thin.cuda()
    modelB_thin.cuda()
    modelA_thin.train()
    modelB_thin.train()
    if 'lrc' in metrics:
        for _ in range(n_repeats):
            with torch.no_grad():
                init_model(modelA_thin, 'kaiming_norm_fanin')
                for param_ori, param in zip(modelA_thin.parameters(), modelB_thin.parameters()):
                    param.data.copy_(param_ori.data)
                LRC.reinit([modelA_thin, modelB_thin])
                lrcA, lrcB = LRC.forward_batch_sample()
                norm = (lrcA-lrcB)/float(lrcB)
                lrcs.append(norm)
                LRC.clear()
                

    modelA.cpu()
    modelB.cpu()
    out = []
    
    if 'lrc' in metrics:
        out.append(np.mean(lrcs))
    else:
        out.append(0)
    if 'ntk' in metrics:
        out.append(np.mean(ntks))
    else:
        out.append(0)
    return out

def evaluate_score(score, model, verbose=False):
    n_edges = model.get_n_edges()
    names = list(model.get_growth_factors().keys())
    
    ntks = score[:,1]
    lrcs = score[:,0]
    
    ntk_max, ntk_min = np.ma.masked_invalid(ntks).max(), np.ma.masked_invalid(ntks).min(), 
    lrc_max, lrc_min = np.ma.masked_invalid(lrcs).max(), np.ma.masked_invalid(lrcs).min(), 
    
    if lrc_max == lrc_min:
        lrc_score = 1 - lrcs/lrc_max
    else:
        lrc_score = 1- (lrcs-lrc_min)/(lrc_max-lrc_min)
        
    if ntk_max == ntk_min:
        ntk_score = ntks-ntk_min
    else:
        ntk_score = (ntks-ntk_min)/(ntk_max-ntk_min)                            
    #lrc_rank = rank(lrc_delta, flip=True)
    #ntk_rank = rank(ntk_delta)
    joined_score = lrc_score + ntk_score
    joined_score[np.where(lrcs==-np.inf)] = 2.
    norm_score = joined_score/2
    norm_rank = rank(norm_score)
    norm_rank[np.where(lrcs==-np.inf)] = max(norm_rank)
    
    if verbose:
        for i in range(len(ntks)):
            l = lrcs[i]
            n = ntks[i]
            if l == -np.inf:
                continue
            lrc_fmt = "{:>7.4f}".format(l)
            ntk_fmt = "{:>7.4f}".format(n)
            name_fmt = "{} {}".format(names[i][0], names[i][1])
            print("{:>15} {} {} {:>7.2f} {:>7.2f} {:>7.2f} {:>7.2f} {:>3d}".format(
                name_fmt, 
                red(lrc_fmt) if l<=0 else green(lrc_fmt),
                red(ntk_fmt) if n>0 else green(ntk_fmt),
                lrc_score[i],
                ntk_score[i], 
                joined_score[i], 
                norm_score[i],
                norm_rank[i]))
    
    
    norm_score = norm_score.reshape(n_edges, -1)
    return norm_score

# Mutation Heuristics

In [5]:
def find_best_mutation(model, target_cell, n_candidates):    
    scores = []
    null_score = [-np.inf, np.inf]
    muts = list(model.get_growth_factors().keys())
    muts_idxs = [(i,x) for i,x in enumerate(muts)]
    random.shuffle(muts_idxs)
    scores = [None] * len(muts)
    num_candidates_found = 0
    
    print("\r\tEvaluating {:>2} mutations |".format(len(muts)), end="")
    for idx, (cell, edge) in muts_idxs:
        model_on = copy.deepcopy(model)
        model_on.data = data
    
        if target_cell != cell or num_candidates_found >= n_candidates:
            scores[idx] = null_score
            continue
        new_edges = model_on.cells[int(cell)].split_edge(edge, model_on.device, model_on.data_index, verbose=False)
        new_edges = [(cell, k) for k in new_edges]
        model_on.update_edge_toggles()
        model_off = copy.deepcopy(model_on)
        activation = model_on.edge_toggles.copy()
        for i, k in enumerate(model_off.get_growth_factors().keys()):
            if k==new_edges[0] or k==new_edges[1]:
                activation[i] = 0
        model_off.edge_toggles = activation    
    
        model_off_thin = copy.deepcopy(model_off)
        model_on_thin = copy.deepcopy(model_on)
        
        model_off.upsize(14,3)
        model_on.upsize(14,3)
        model_off_thin.upsize(14,1)
        model_on_thin.upsize(14,1)
        
        score = compare_models(model_on, model_off, model_on_thin, model_off_thin, data[0], n_repeats=n_repeats, metrics=['lrc','ntk'])
        if score[0] <= 0 or score[1] > 0:
            print("{}|".format(red(str(idx))),end="")
            scores[idx] = null_score
        else:
            print("{}|".format(green(str(idx))),end="")
            scores[idx] = score
            num_candidates_found += 1
        del model_on, model_off
    return np.array(scores).squeeze(), muts

def perform_best_mutation(model, test_order, best):
    edge = test_order[best][:2]
    cell, edge_idx = edge
    if model.check_mutation(int(cell), edge_idx):
        print(' Perfoming best found mutation {}'.format(edge))
        model.cells[int(cell)].split_edge(edge_idx, model.device, model.data_index, verbose=False)
        model.update_edge_toggles()
        return model, True
    else:
        print(" Best mutation could not fit in memory.")
        return model, False


def mutate(model, n_candidates=3):
    n_muts = 0
    size_constrained = False
    model.cpu()
    for cell in range(len(model.cells)):
        print("== Finding Cell {} Muts ==".format(cell))
        mut_scores, test_order = find_best_mutation(model, str(cell), n_candidates)
        if np.isinf(mut_scores).all():
            print(" Found no good mutations.")
            continue
        evaluated = evaluate_score(mut_scores, model, verbose=False)
        best = np.argmin(evaluated)
        model, success = perform_best_mutation(model, test_order, best)
        if success:
            n_muts += 1
        else:
            size_constrained = True
    return model, n_muts, size_constrained

# Start

In [6]:
model = Net(hypers)
model.data = load_data(64, hypers['dataset']['name'])[0]
mutate(model)

== Finding Cell 0 Muts ==
	Evaluating  9 mutations |[92m0[0m|[92m1[0m| Perfoming best found mutation ('0', '0->1')
== Finding Cell 1 Muts ==
	Evaluating 11 mutations |[91m6[0m|[91m5[0m|[91m4[0m| Found no good mutations.
== Finding Cell 2 Muts ==
	Evaluating 11 mutations |[91m10[0m|[91m8[0m|[91m9[0m|[91m7[0m| Found no good mutations.


(Net(
   (initializer): SingleConv(
     (op): Sequential(
       (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
       (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     )
   )
   (scalers): ModuleDict()
   (towers): ModuleDict(
     (0): Classifier(
       (op): Sequential(
         (0): AdaptiveAvgPool2d(output_size=1)
         (1): NNView()
         (2): Linear(in_features=64, out_features=10, bias=True)
       )
     )
     (1): Classifier(
       (op): Sequential(
         (0): AdaptiveAvgPool2d(output_size=1)
         (1): NNView()
         (2): Linear(in_features=128, out_features=10, bias=True)
       )
     )
     (2): Classifier(
       (op): Sequential(
         (0): AdaptiveAvgPool2d(output_size=1)
         (1): NNView()
         (2): Linear(in_features=256, out_features=10, bias=True)
       )
     )
   )
   (cells): ModuleList(
     (0): Cell 0 : D:  64 x 32  P:128028
     (1): Cell 1 : D: 128 x 16

## Search

In [7]:
t_start = time.time()
for i in range(30):
    print("=============== Iteration {} =============".format(i))
    full_train(model, hypers)
    mutate(model)
    model.reset_parameters()
print("Search time:", time.time()-t_start)

=== Training Grecian Hrothgar Harlan ===
Starting at 2021-12-09 21:25:49.305018
12/09/2021 09:25 PM
Train Corrects: Top-1: 35.29%, 1m,33s
Last Tower Test  Corrects: Top-1: 44.82%, 4.39s
Deadheaded 0 operations
Deadheaded []
Param Delta: 2,250,283 -> 2,250,283

[31mAdjusting lrs to [0.008535533905932738][0m
12/09/2021 09:27 PM
Train Corrects: Top-1: 47.38%, 1m,32s
Last Tower Test  Corrects: Top-1: 57.72%, 4.09s
Deadheaded 0 operations
Deadheaded []
Param Delta: 2,250,283 -> 2,250,283

[31mAdjusting lrs to [0.005][0m
12/09/2021 09:29 PM
Train Corrects: Top-1: 54.07%, 1m,27s
Last Tower Test  Corrects: Top-1: 63.44%, 4.09s
Deadheaded 0 operations
Deadheaded []
Param Delta: 2,250,283 -> 2,250,283

[31mAdjusting lrs to [0.0014644660940672626][0m
12/09/2021 09:30 PM
Train Corrects: Top-1: 58.42%, 1m,27s
Last Tower Test  Corrects: Top-1: 64.50%, 4.09s
Deadheaded 8 operations
Deadheaded [[0, '0->1', 'Avg_Pool_3x3'], [0, '0->1', 'Dil_Conv_3x3'], [0, '0->1', 'Dil_Conv_5x5'], [0, '0->2', 'Av


[31mAdjusting lrs to [0.008535533905932738][0m
12/09/2021 10:27 PM
Train Corrects: Top-1: 49.43%, 2m,23s
Last Tower Test  Corrects: Top-1: 57.81%, 6.49s
Deadheaded 0 operations
Deadheaded []
Param Delta: 3,361,073 -> 3,361,073

[31mAdjusting lrs to [0.005][0m
12/09/2021 10:30 PM
Train Corrects: Top-1: 55.33%, 2m,22s
Last Tower Test  Corrects: Top-1: 62.85%, 6.49s
Deadheaded 0 operations
Deadheaded []
Param Delta: 3,361,073 -> 3,361,073

[31mAdjusting lrs to [0.0014644660940672626][0m
12/09/2021 10:32 PM
Train Corrects: Top-1: 59.59%, 2m,23s
Last Tower Test  Corrects: Top-1: 66.37%, 6.49s
Deadheaded 9 operations
Deadheaded [[0, '0->2', 'Avg_Pool_3x3'], [0, '0->2', 'Dil_Conv_3x3'], [0, '0->2', 'Dil_Conv_5x5'], [0, '2->3', 'Dil_Conv_3x3'], [0, '2->3', 'Dil_Conv_5x5'], [0, '3->4', 'Dil_Conv_3x3'], [0, '3->5', 'Dil_Conv_3x3'], [0, '0->1', 'Identity'], [2, '0->4', 'Avg_Pool_3x3']]
Param Delta: 3,361,073 -> 3,330,216

[31mAdjusting lrs to [0.0][0m
== Finding Cell 0 Muts ==
	Evaluatin

	Evaluating 49 mutations |[91m40[0m|[91m42[0m|[91m39[0m|[91m46[0m|[91m48[0m|[91m44[0m|[91m43[0m|[91m47[0m|[91m41[0m|[91m45[0m| Found no good mutations.
=== Training Grecian Hrothgar Harlan ===
Starting at 2021-12-10 04:08:20.489358
12/10/2021 04:08 AM
Train Corrects: Top-1: 38.40%, 5m,13s
Last Tower Test  Corrects: Top-1: 49.14%, 13.42s
Deadheaded 0 operations
Deadheaded []
Param Delta: 6,904,245 -> 6,904,245

[31mAdjusting lrs to [0.008535533905932738][0m
12/10/2021 04:13 AM
Train Corrects: Top-1: 50.73%, 5m,12s
Last Tower Test  Corrects: Top-1: 59.90%, 13.41s
Deadheaded 0 operations
Deadheaded []
Param Delta: 6,904,245 -> 6,904,245

[31mAdjusting lrs to [0.005][0m
12/10/2021 04:19 AM
Train Corrects: Top-1: 57.44%, 5m,12s
Last Tower Test  Corrects: Top-1: 65.29%, 13.41s
Deadheaded 0 operations
Deadheaded []
Param Delta: 6,904,245 -> 6,904,245

[31mAdjusting lrs to [0.0014644660940672626][0m
12/10/2021 04:24 AM
Train Corrects: Top-1: 60.78%, 5m,13s
Last Tower 


[31mAdjusting lrs to [0.0][0m
== Finding Cell 0 Muts ==
	Evaluating 57 mutations |[91m10[0m|[91m5[0m|[91m15[0m|[91m16[0m|[91m8[0m|[91m0[0m|[91m11[0m|[91m9[0m|[91m17[0m|[91m7[0m|[91m19[0m|[91m18[0m|[91m12[0m|[91m3[0m|[91m6[0m|[91m1[0m|[91m13[0m|[91m2[0m|[91m4[0m|[91m14[0m| Found no good mutations.
== Finding Cell 1 Muts ==
	Evaluating 57 mutations |[91m45[0m|[91m26[0m|[92m36[0m|[92m21[0m|[92m33[0m| Perfoming best found mutation ('1', '14->15')
== Finding Cell 2 Muts ==
	Evaluating 59 mutations |[92m56[0m|[92m58[0m|[92m53[0m| Perfoming best found mutation ('2', '6->7')
=== Training Grecian Hrothgar Harlan ===
Starting at 2021-12-10 09:14:22.323228
12/10/2021 09:14 AM
Train Corrects: Top-1: 38.22%, 6m,12s
Last Tower Test  Corrects: Top-1: 50.67%, 15.72s
Deadheaded 0 operations
Deadheaded []
Param Delta: 8,584,905 -> 8,584,905

[31mAdjusting lrs to [0.008535533905932738][0m
12/10/2021 09:20 AM
Train Corrects: Top-1: 50.82%, 6m,13


[31mAdjusting lrs to [0.005][0m
12/10/2021 02:51 PM
Train Corrects: Top-1: 57.89%, 7m,16s
Last Tower Test  Corrects: Top-1: 66.43%, 18.26s
Deadheaded 0 operations
Deadheaded []
Param Delta: 10,935,133 -> 10,935,133

[31mAdjusting lrs to [0.0014644660940672626][0m
12/10/2021 02:58 PM
Train Corrects: Top-1: 61.91%, 7m,16s
Last Tower Test  Corrects: Top-1: 67.16%, 18.31s
Deadheaded 6 operations
Deadheaded [[0, '11->12', 'Dil_Conv_5x5'], [0, '6->7', 'Sep_Conv_5x5'], [0, '8->10', 'Dil_Conv_5x5'], [2, '4->5', 'Avg_Pool_3x3'], [2, '7->8', 'Identity'], [2, '7->8', 'Dil_Conv_3x3']]
Param Delta: 10,935,133 -> 10,843,479

[31mAdjusting lrs to [0.0][0m
== Finding Cell 0 Muts ==
	Evaluating 73 mutations |[91m14[0m|[91m8[0m|[91m19[0m|[91m2[0m|[91m20[0m|[91m21[0m|[91m0[0m|[92m6[0m|[91m9[0m|[92m5[0m|[92m7[0m| Perfoming best found mutation ('0', '6->7')
== Finding Cell 1 Muts ==
	Evaluating 75 mutations |[92m25[0m|[91m41[0m|[92m44[0m|[92m46[0m| Perfoming best found 


[31mAdjusting lrs to [0.0][0m
== Finding Cell 0 Muts ==
	Evaluating 99 mutations |[91m19[0m|[91m14[0m|[91m12[0m|[91m24[0m|[92m4[0m|[91m9[0m|[91m27[0m|[91m6[0m|[91m17[0m|[91m25[0m|[92m7[0m|[91m15[0m|[91m11[0m|[91m13[0m|[91m21[0m|[92m23[0m| Best mutation could not fit in memory.
== Finding Cell 1 Muts ==
	Evaluating 99 mutations |[91m50[0m|[91m67[0m|[92m35[0m|[91m31[0m|[91m37[0m|[92m29[0m|[91m52[0m|[92m33[0m| Perfoming best found mutation ('1', '12->17')
== Finding Cell 2 Muts ==
	Evaluating 101 mutations |[92m78[0m|[92m95[0m|[92m82[0m| Best mutation could not fit in memory.
=== Training Grecian Hrothgar Harlan ===
Starting at 2021-12-11 06:49:51.831496
12/11/2021 06:49 AM
Train Corrects: Top-1: 38.41%, 9m,20s
Last Tower Test  Corrects: Top-1: 52.45%, 24.06s
Deadheaded 0 operations
Deadheaded []
Param Delta: 15,392,481 -> 15,392,481

[31mAdjusting lrs to [0.008535533905932738][0m
12/11/2021 06:59 AM
Train Corrects: Top-1: 51.28%,

KeyboardInterrupt: 

In [8]:
print(model)

Epoch 4              :     Dim      :    Params    :   Ops:   
Initializer          :              :    1,856     :          
Cell 0               :   64 x 32    :   595,455    :    196   
 ↳ Aux Tower         :              :     650      :          
Cell 1               :  128 x 16    :  5,046,998   :    343   
 ↳ Aux Tower         :              :    1,290     :          
Cell 2               :  256 x 8     :  9,734,054   :    168   
 ↳ Classifier        :              :    2,570     :          
Total                :              :  15,382,873  :          



In [9]:
hypers['lr_schedule']['T'] = 600
hypers['mutate'] = False
model.reset_parameters()
preds, targets, meta = full_train(model, hypers)

=== Training Grecian Hrothgar Harlan ===
Starting at 2021-12-11 10:41:06.806044
12/11/2021 10:41 AM


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.68 GiB total capacity; 17.78 GiB already allocated; 26.56 MiB free; 20.08 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF