**Use CUDA, if you have no GPU, set 'cuda' to 'cpu'**

In [None]:
import torch 
torch.set_default_device("cuda")

**Load data from local files.**

In [17]:

import torch.nn as nn  
import torch.nn.functional as F
import cv2
import numpy as np
import os
import pandas as pd
import pickle

def load_data():
    train_data = []
    train_label = []
    test_data = []
    test_label = []
    with open("./cifar-10-batches-py/data_batch_1", 'rb') as fo:
        dict = pickle.load(fo, encoding='latin1')
        train_data.extend(dict["data"])
        train_label.extend(dict["labels"])
    with open("./cifar-10-batches-py/data_batch_2", 'rb') as fo:
        dict = pickle.load(fo, encoding='latin1')
        train_data.extend(dict["data"])
        train_label.extend(dict["labels"])
    with open("./cifar-10-batches-py/data_batch_3", 'rb') as fo:
        dict = pickle.load(fo, encoding='latin1')
        train_data.extend(dict["data"])
        train_label.extend(dict["labels"])
    with open("./cifar-10-batches-py/data_batch_4", 'rb') as fo:
        dict = pickle.load(fo, encoding='latin1')
        train_data.extend(dict["data"])
        train_label.extend(dict["labels"])
    with open("./cifar-10-batches-py/data_batch_5", 'rb') as fo:
        dict = pickle.load(fo, encoding='latin1')
        train_data.extend(dict["data"])
        train_label.extend(dict["labels"])
    with open("./cifar-10-batches-py/test_batch", 'rb') as fo:
        dict = pickle.load(fo, encoding='latin1')
        test_data.extend(dict["data"])
        test_label.extend(dict["labels"])
    return (torch.tensor(train_data)/255.0, 
    F.one_hot(torch.tensor(train_label), num_classes=10), 
    torch.tensor(test_data)/255.0, 
    F.one_hot(torch.tensor(test_label), num_classes=10))
    return (train_data,train_label,test_data,test_label)

train_data,train_label,test_data,test_label=load_data()
 

  return func(*args, **kwargs)


**Change the label to float**

In [18]:
train_label = train_label *1.0
test_label = test_label *1.0

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.]], device='cuda:0')

**Change the shape of data**

**Images are 32x32 RGB images**

In [19]:
train_data = train_data.reshape((50000,32,32,3))
test_data = test_data.reshape((10000,32,32,3))


**Create Expert module, this is the E of MoE**

**Normally, the model structure does not need to be changed.**

**If you want to change something, please ensure that the shapes are suitable.**

In [20]:
class Expert(nn.Module):  
    def __init__(self, output_dim):  
        super(Expert, self).__init__()  
        self.conv1 = nn.Conv2d(32, 64, (2,1))
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(64, 128, (2,1))
        self.fc1 = nn.Linear(1792, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, output_dim)
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
          
    def forward(self, x):  
        x= F.leaky_relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.leaky_relu(self.bn2(self.conv2(x)))
        x = x.view(-1, 1792)
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = self.fc3(x)
        return x

**Create a Sparse Gate module, it is used to select which expert should go to work.**

**In this situation, I created a same model structure with Expert.**

**But I think Sparse Gate should output a sigmoid result in order to make meaning clearly.**

**In the forward(), I made some experts' score to 0 if the calcuated score was lower than given threshold**

In [21]:
class SparseGate(nn.Module):  
    def __init__(self,num_experts, threshold=0.5):  
        super(SparseGate, self).__init__()  
        self.num_experts = num_experts
        self.conv1 = nn.Conv2d(32, 64, (2,1))
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(64, 128, (2,1))
        self.fc1 = nn.Linear(1792, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_experts)
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.threshold = threshold  
          
    def forward(self, x):  
        x= F.leaky_relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.leaky_relu(self.bn2(self.conv2(x)))
        x = x.view(-1, 1792)
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = self.fc3(x)
        scores = torch.sigmoid(x)   
        # get k of top k
        k = self.num_experts * self.threshold
        topk,_ = torch.topk(scores, int(k))
        mask = []
        for index in range(self.num_experts):
            #if scores[0][index] > topk.min() or scores[0][index] == topk.min():
            if scores[0][index] > self.threshold:
                mask.append(scores[0][index])
            else:
                mask.append(0.0)
        mask =torch.tensor(mask)
        # mask all zero scores
        return mask

**This module is a fake module.**

**It is useful even be fake, I use it to replace the Real Expert with 0 score from Sparse Gate module.**

**It will never joint to the calculation.**

In [22]:
class ExpertMask(nn.Module): 
    def __init__(self, output_dim):  
        super(ExpertMask, self).__init__()
        self.output_dim = output_dim
        self.fl = nn.Flatten()
        self.mask = nn.Linear(1024*3, output_dim)
    def forward(self, x):  
        v = self.mask(self.fl(x))*0.0
        return v

**This is the main dispatcher module of MoE.**

**It calls Sparse Gate to calculate experts's score. **

**It calls each activated expert to work.**

**It calculates the final result.**

**This is a multiple tags task. So the result should be calculated with Sigmoid.**

In [58]:
class MoE(nn.Module):  
    def __init__(self,  output_dim, num_experts, threshold=0.5):  
        super(MoE, self).__init__()  
        self.experts = nn.ModuleList([Expert( output_dim) for _ in range(num_experts)])  
        self.gate = SparseGate(num_experts, threshold)  
        self.num_experts = num_experts
        self.output_dim =output_dim
          
    def human_setup_compute(self, x, human_set_gate_factors = None):
        if human_set_gate_factors is not None:
            expert_scores = human_set_gate_factors
        activated_experts  = []
        for expert_index in range(self.num_experts):
            if expert_scores[expert_index] > torch.tensor(0.0):
                activated_experts.append(self.experts[expert_index])
            else:
                activated_experts.append(ExpertMask(self.output_dim))
        expert_outputs = torch.stack([expert(x) if isinstance(expert, Expert) else torch.zeros((x.shape[0],self.output_dim)) for expert in activated_experts], dim=1) 
        
        gate_outputs = expert_scores.unsqueeze(-1) 
        
        final_output = torch.sum(gate_outputs * expert_outputs, dim=1) 
        final_output = F.sigmoid(final_output)
        #print(final_output)
        return final_output  

    def forward(self, x):  
        expert_scores = self.gate(x)  
        activated_experts  = []
        for expert_index in range(self.num_experts):
            if expert_scores[expert_index] > torch.tensor(0.0):
                activated_experts.append(self.experts[expert_index])
            else:
                activated_experts.append(ExpertMask(self.output_dim))
        expert_outputs = torch.stack([expert(x) if isinstance(expert, Expert) else torch.zeros((x.shape[0],self.output_dim)) for expert in activated_experts], dim=1)
        gate_outputs = expert_scores.unsqueeze(-1) 
        
        final_output = torch.sum(gate_outputs * expert_outputs, dim=1) 
        final_output = F.sigmoid(final_output)
        #print(final_output)
        return final_output  

**I defined a loss function.**

**But it is useless now. I changed to BCELoss in the training work.**

In [24]:
class OneHotLoss(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x,y):
        return torch.sum(torch.abs(x-y))


**You can set number of experts, it will be perfect if you set a lot of experts, but the whole model will be heavy with more parameters.**

**batch_size, you know, it can improve the training, faster and more accurate**

**threshold means the activation condition. if the sparse gate give a score less than it, the target expert will be replaced by ExpertMask, that fake Expert module**

**Learning Rate is set to 0.001, if you changed the modules before, but training convergence is too slow, you can change it.**

In [81]:
def train(train_data: torch.Tensor, train_label: torch.Tensor, test_data: torch.Tensor, test_label: torch.Tensor):
    output_dim = 10  
    num_experts =20
    threshold = 0.5
    batch_size = 100
    #model = Expert(output_dim)
    model = MoE(output_dim, num_experts, threshold)  
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_function = nn.BCELoss()
    loss_sum = 0
    for epoch in range(10):
        for step in range(0, 50000, batch_size):
            optimizer.zero_grad()
            step_result = model(train_data[step:step+batch_size])
            loss = loss_function(step_result, train_label[step:step+batch_size])
            loss_sum = loss_sum+ loss.item()
            loss.backward() 
            optimizer.step() 
            #print([x.grad for x in optimizer.param_groups[0]["params"]])
            # for params in model.parameters():
            #     print(f"G: {params.grad}")
        print(f'E:{epoch} {loss_sum/500}')
        loss_sum = 0
    return model
                 


**Start to train**

In [82]:
trained_model = train(train_data, train_label, test_data, test_label)

E:0 0.2285371019244194
E:1 0.18469052982330322
E:2 0.16142360980808734
E:3 0.1393040187060833
E:4 0.1167462460398674
E:5 0.09433456543833017
E:6 0.07594361340999603
E:7 0.06724514252319932
E:8 0.060273028373718264
E:9 0.05009632034227252


**Set the index of testing picture.**

In [83]:
index =78

**try the model**

**First line: Sparse Gate result.**

**Second line: Prediction tags of the model**

**Third line: Actual tags**

In [84]:
with torch.no_grad():
    print(trained_model.gate(test_data[index].reshape((1,32,32,3))))
    print(trained_model(test_data[index].reshape((1,32,32,3))))
    print(test_label[index])

tensor([0.0000, 0.0000, 0.5315, 0.0000, 0.5111, 0.0000, 0.0000, 0.0000, 0.5145,
        0.5742, 0.5114, 0.0000, 0.0000, 0.0000, 0.0000, 0.5085, 0.5149, 0.5038,
        0.0000, 0.5587], device='cuda:0')
tensor([[8.6667e-04, 3.6607e-01, 1.9934e-02, 7.4488e-01, 2.2469e-04, 4.7849e-05,
         3.3677e-03, 3.7516e-05, 2.6871e-02, 6.3244e-01]], device='cuda:0')
tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], device='cuda:0')


**You can skip the Sparse Gate module and set the scores manually.**

**You may see that:**


**1. The accuracy is not decrease even make some more expert's score to zero.**

**2. If you set some expert's score to higher, the accuracy will be higher or lower.**

**So, you can see that, The Sparse Gate Module should be trained when the expert modules being frozen.**

**This is the MoE training process: a) Whole model. b) Sparse Gate when freeze experts. c)Experts when freeze Sparse Gate**

**Maybe the whole training will include a lot of iterations(a group of a b c)**

In [96]:
with torch.no_grad():
    ll = trained_model.human_setup_compute(test_data[index:index+1], 
    torch.tensor([0.0000, 0.0000, 0.5315, 0.0000, 0.5111, 0.0000, 0.0000, 0.0000, 0.5145,
        0.5742, 0.5114, 0.0000, 0.0000, 0.0000, 0.0000, 0.5085, 0.9149, 0.9038,
        0.0000, 0.2587]))
    print(ll)

tensor([[4.5552e-04, 3.7660e-01, 4.0578e-02, 9.6198e-01, 3.8759e-04, 2.2229e-05,
         5.4764e-03, 3.8834e-05, 2.1863e-02, 5.6670e-01]], device='cuda:0')
