In [13]:
import pickle
import torch
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import time
import matplotlib.pyplot as plt
import numpy
import pandas as pd

In [3]:
router_data = pickle.load(open("../swinmoe_router.pkl.0", "rb"))

In [4]:
layers = [(2,1),(2,3),(2,5),(2,7),(2,9), (2,11), (2,13), (2,15), (2,17), (3,1)]

In [26]:
len(router_data[0]['expert'][(2,1)][0])

18432

In [5]:
def router_data_to_seqs(router_data, layers, batch_size=128):
    seqs = []
    for batch_id, _ in tqdm(enumerate(router_data)):
        seq = [[] for i in range(batch_size)]
        for layer_id in layers:
            layer_img_size = (192//(4*(2**layer_id[0])))**2
            img_layer = [[] for i in range(batch_size)]
            for token_id, _ in enumerate(router_data[batch_id]['expert'][layer_id][0]):
                img_layer[token_id//layer_img_size].append(router_data[batch_id]['expert'][layer_id][0][token_id])
            
            assert len(img_layer[0]) == len(img_layer[-1])

            for i in range(batch_size):
                seq[i].append(img_layer[i])
        seqs.extend(seq)
    return seqs

In [6]:
seqs = router_data_to_seqs(router_data, layers)

500it [06:03,  1.38it/s]


In [89]:
def cmp(pred_y, test_y):
    if type(pred_y) == numpy.ndarray:
        assert(len(pred_y) == len(test_y))
        positive = 0
        for i, _ in enumerate(pred_y):
            if pred_y[i] == test_y[i]:
                positive += 1
        return float(positive) / len(pred_y)
    else:
        return float(pred_y == test_y)

def train(seq_train_X, seq_train_Y, ckpt_path = None):
    if ckpt_path and os.path.isfile(ckpt_path):
        print(f"load checkpoint from {ckpt_path}")
        model = pickle.load(open(ckpt_path, 'rb'))
    else:
        print(f"no checkpoint, build model from scratch")
        model = KNeighborsClassifier(n_jobs=-1)
    
    print("Start Training")
    model.fit(seq_train_X, seq_train_Y)
    
    print(f"Training Done, saving checkpoint to {ckpt_path}")
    pickle.dump(model, open(ckpt_path, "wb"))
    
def eval(seq_test_X, seq_test_Y, ckpt_path = None):
    if ckpt_path and os.path.isfile(ckpt_path):
        print(f"load checkpoint from {ckpt_path}")
        model = pickle.load(open(ckpt_path, 'rb'))
    else:
        raise ValueError
    print("Start Eval")
    score = 0
    pred_Y = model.predict(seq_test_X)
    for pred_id, _ in enumerate(seq_test_X):
        score += cmp(pred_Y[pred_id], seq_test_Y[pred_id])
    print(f" Eval Score: {score/len(seq_test_Y)}")
    return score

In [49]:
def preprocess_data(seqs, lag, train_size=0.5):
    # Note: the first element is token_tensor
    seq_train, seq_test = train_test_split(seqs, train_size = train_size)
    seq_train_X = [ x[:len(x)-lag] for x in seq_train]
    seq_train_Y = [ x[len(x)-lag] for x in seq_train]
    seq_test_X = [ x[:len(x)-lag] for x in seq_test]
    seq_test_Y = [ x[len(x)-lag] for x in seq_test]
    return seq_train_X, seq_train_Y, seq_test_X, seq_test_Y

In [53]:
result = {}
for lag in range(1,2):
    seq_train_X, seq_train_Y, seq_test_X, seq_test_Y = preprocess_data(seqs, lag)    
    train(seq_train_X, seq_train_Y, ckpt_path = f"swinmoe_lag_{lag}.pkl")
    # score = eval(seq_test_X, seq_test_Y, ckpt_path = f"model_mlp_with_input_lag_{lag}.pkl")
    score = eval(seq_test_X, seq_test_Y, ckpt_path = f"swinmoe_lag_{lag}.pkl")
    result[lag] = score

no checkpoint, build model from scratch
Start Training


ValueError: Found array with dim 3. Estimator expected <= 2.

In [81]:
seq_train_X_2 = [  sum(x, [])  for x in seq_train_X]
train(seq_train_X_2, seq_train_Y, ckpt_path = f"swinmoe_lag_{lag}.pkl")

no checkpoint, build model from scratch
Start Training
Training Done, saving checkpoint to swinmoe_lag_1.pkl


In [92]:
seq_test_X_2 = [  sum(x, [])  for x in seq_test_X]
score = eval(seq_test_X_2, seq_test_Y, ckpt_path = f"swinmoe_lag_{lag}.pkl")
score/len(seq_test_X_2)

load checkpoint from swinmoe_lag_1.pkl
Start Eval
 Eval Score: 0.20946636284722336


0.20946636284722336

In [1]:
# for image_id in range(10):
#     for layer_id, layer in enumerate(layers):
#         dim = 12 if layer[0] == 2 else 6
#         x = [[seqs[image_id][layer_id][i*dim+j] for j in range(dim)] for i in range(dim)]
#         plt.figure()
#         cax = plt.imshow(x)
#         cbar = plt.colorbar(cax, extend='both', drawedges = False)
#         plt.title(f'layer{layer}')
#         plt.savefig(f'swinmoe_token/image{image_id}.{layer}.png')

In [7]:
layer_stat = [ [] for i in range(len(layers))]
for image_id in range(len(seqs)):
    for layer_id, layer in enumerate(layers):
        dim = 12 if layer[0] == 2 else 6
        num_active_expert = len(set([int(x) for x in seqs[image_id][layer_id]]))
        layer_stat[layer_id].append(num_active_expert)

In [8]:
# for layer_id, _ in enumerate(layers):
#     plt.figure(figsize=[20,3])
#     plt.hist(layer_stat[layer_id], bins = 16)
#     plt.xticks([i for i in range(16)])
#     plt.show()

In [11]:
layer_expert_stat = [ [[] for j in range(16)] for i in range(len(layers))]
for image_id in range(1000):
    for layer_id, layer in enumerate(layers):
        dim = 12 if layer[0] == 2 else 6
        for i in range(16):
            layer_expert_stat[layer_id][i].append(0)
        for x in seqs[image_id][layer_id]:
            layer_expert_stat[layer_id][x][-1] += 1

In [1]:
# for layer_id, _ in enumerate(layers):
#     xs, ys = [], []
#     for expert_id in range(16):
#         xs.extend([expert_id]*len(layer_expert_stat[layer_id][expert_id]))
#         ys.extend(layer_expert_stat[layer_id][expert_id])
#     plt.figure()
#     plt.violinplot([[y for y in x if y>0] for x in layer_expert_stat[layer_id]])
#     plt.show()

In [39]:
max_capacity = 144 
n_shapes = 64
break_method = 'optimal'

In [40]:
layer_stat = [ [] for i in range(len(layers))]
for layer_id in range(len(layers)):
    agg_expert = []
    for expert_id in range(16):
        agg_expert.extend([min(x, max_capacity) for x in layer_expert_stat[layer_id][expert_id] if x > 0])
    shape_count = pd.value_counts(agg_expert)
    shapes = sorted(shape_count.keys())
    layer_stat[layer_id] = [(s, shape_count[s]) for s in shapes]

In [43]:
def find_optimal_breakpoints(layer_stat, n_shapes):
    max_shape = max([x[0] for x in layer_stat])
    if n_shapes > max_shape:
        n_shapes = max_shape
    dp = [[0 for i in range(n_shapes)] for j in range(max_shape+1)]
    transfer = [[None for i in range(n_shapes)] for j in range(max_shape+1)]
    for cur_shape in tqdm(range(max_shape+1)):
        for shape_id in range(n_shapes):
            min_cost = 1e100
            min_pre_shape = None
            if shape_id == 0:
                min_cost = sum(x[1]*(cur_shape-x[0]) for x in layer_stat if x[0]<=cur_shape)
            else:
                for pre_shape in range(cur_shape):
                    cost = dp[pre_shape][shape_id-1] + \
                        sum([x[1]*(cur_shape - x[0]) for x in layer_stat if x[0] > pre_shape and x[0] <= cur_shape])
                    if cost < min_cost:
                        min_cost = cost
                        min_pre_shape = pre_shape
            dp[cur_shape][shape_id] = min_cost
            transfer[cur_shape][shape_id] = min_pre_shape
    breakpoints = [max_shape]
    cur_shape = max_shape
    cur_id = n_shapes-1
    while cur_id > 0:
        cur_shape = transfer[cur_shape][cur_id]
        cur_id -= 1
        if cur_id >= 0:
            breakpoints.append(cur_shape)
    breakpoints.reverse()
    return breakpoints

In [44]:
import math
print(max_capacity, n_shapes)
breakpoints = [[] for i in range(len(layers))]
if break_method == 'optimal':
    for layer_id in range(len(layers)):
        breakpoints[layer_id] = find_optimal_breakpoints(layer_stat[layer_id], n_shapes)
elif break_method == 'uniform':
    for layer_id in range(len(layers)):
        for i in range(1, n_shapes):
            breakpoints[layer_id].append(math.floor(i*max_capacity/float(n_shapes-1)))
        breakpoints[layer_id].append(max_capacity)
print(breakpoints)

144 64


100%|██████████| 143/143 [00:13<00:00, 10.36it/s]
100%|██████████| 127/127 [00:08<00:00, 15.49it/s]
100%|██████████| 113/113 [00:06<00:00, 16.44it/s]
100%|██████████| 127/127 [00:09<00:00, 13.12it/s]
100%|██████████| 135/135 [00:11<00:00, 11.71it/s]
100%|██████████| 134/134 [00:11<00:00, 11.32it/s]
100%|██████████| 126/126 [00:09<00:00, 12.68it/s]
100%|██████████| 114/114 [00:07<00:00, 15.46it/s]
100%|██████████| 118/118 [00:08<00:00, 14.72it/s]
100%|██████████| 32/32 [00:00<00:00, 282.47it/s]

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 43, 45, 47, 49, 51, 53, 55, 58, 60, 63, 66, 69, 73, 78, 82, 87, 93, 96, 103, 111, 117, 126, 132, 142], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 52, 54, 56, 58, 60, 63, 65, 70, 75, 78, 82, 87, 90, 98, 126], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 68, 72, 76, 79, 83, 89, 97, 112], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 45, 47, 49, 50, 52, 54, 56, 59, 61, 64, 66, 70, 73, 77, 81, 86, 91, 97, 101, 110, 11




In [45]:
import bisect
enable_bmm = False
results = []
for layer_id in range(len(layers)):
    for sentense_id, _ in enumerate(layer_expert_stat[layer_id][0]):
        max_batch_size = max([layer_expert_stat[layer_id][expert_id][sentense_id] for expert_id in range(16)])
        max_batch_size = min(max_capacity, max_batch_size)
        sentense_token = sum([layer_expert_stat[layer_id][i][sentense_id] for i in range(16)])
        dim = 12 if layers[layer_id][0] == 2 else 6
        assert(sentense_token == dim*dim)
        padding_bmm = sum([max(0, max_batch_size - layer_expert_stat[layer_id][expert_id][sentense_id]) for expert_id in range(16)])
        padding_brt = 0
        use_bmm = False
        for expert_id in range(len(layers)):
            expert_token = min(max_capacity, layer_expert_stat[layer_id][expert_id][sentense_id])
            kernel_idx = bisect.bisect_right(breakpoints[layer_id], expert_token)
            if kernel_idx >= len(breakpoints[layer_id]) or (enable_bmm and use_bmm):
                kernel_shape = max_batch_size
                use_bmm = True
            else:
                kernel_shape = breakpoints[layer_id][kernel_idx]
            assert(kernel_shape >= expert_token)
            padding_brt += kernel_shape - expert_token
        if enable_bmm and use_bmm:
            padding_brt = padding_bmm
        results.append((use_bmm, padding_bmm, padding_brt, dim*dim))

In [46]:
len([x for x in results if x[0]])
sum([x[1] for x in results])/sum([x[3] for x in results]), sum([x[2] for x in results])/sum([x[3] for x in results])

(4.947255255255255, 0.07776576576576577)