Implemnting a basic how mixture of expert usually works 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

torch.cuda.set_device(2)  # use GPU 2
device = torch.device('cuda:2')

In [2]:
import sys
import os
from subprocess import call
print('_____Python, Pytorch, Cuda info____')
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA RUNTIME API VERSION')
#os.system('nvcc --version')
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('_____nvidia-smi GPU details____')
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('_____Device assignments____')
print('Number CUDA Devices:', torch.cuda.device_count())
print ('Current cuda device: ', torch.cuda.current_device(), ' **May not correspond to nvidia-smi ID above, check visibility parameter')
print("Device name: ", torch.cuda.get_device_name(torch.cuda.current_device()))

_____Python, Pytorch, Cuda info____
__Python VERSION: 3.10.12 (main, Jan 17 2025, 14:35:34) [GCC 11.4.0]
__pyTorch VERSION: 2.6.0+cu124
__CUDA RUNTIME API VERSION
__CUDNN VERSION: 90100
_____nvidia-smi GPU details____
index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
0, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 29739 MiB, 10704 MiB
1, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 40127 MiB, 316 MiB
2, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 1633 MiB, 38810 MiB
3, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 3897 MiB, 36546 MiB
4, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 4987 MiB, 35456 MiB
5, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 4987 MiB, 35456 MiB
6, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 4731 MiB, 35712 MiB
7, NVIDIA A100-PCIE-40GB, 570.133.20, 40960 MiB, 30951 MiB, 9492 MiB
_____Device assignments____
Number CUDA Devices: 8
Current cuda device:  2  **May not correspond to nvidia-smi ID above, check visibility parame

In [10]:
# implementing MOE for just a simple neural network

# Define Expert 
'''Expert as the name specifies, it is the expert model and produce the output of it's weights in the form of probabilies '''
class Expert(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Expert, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        return torch.softmax(self.layer2(x), dim=1)

In [4]:
'''Gating network, what they do is simple predict weight for each expert, determining their contribution to the expert'''

class Gating(nn.Module):
    def __init__(self, input_dim, num_expert, dropout=0.1):
        super(Gating, self).__init__()

        self.layer1 = nn.Linear(input_dim, 128)
        self.dropout1 = nn.Dropout(dropout)

        self.layer2 = nn.Linear(128, 256)
        self.relu1 = nn.GELU()
        self.dropout2 = nn.Dropout(dropout)

        self.layer3 = nn.Linear(256, 512)
        self.relu2 = nn.GELU()
        self.dropout3 = nn.Dropout(dropout)

        self.layer4 = nn.Linear(512, num_expert)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.dropout1(x)

        x = self.layer2(x)
        x = self.relu1(x)
        x = self.dropout2(x)

        x = self.layer3(x)
        x = self.relu2(x)
        x = self.dropout3(x)

        return torch.softmax(self.layer4(x), dim=1)
        

In [19]:
# defining MOE 
'''so it contains multiple trained expert using gating mechnanism, so by adjusting weights and aggregating output, it generates a unified prediction'''

class Mixture_of_Expert(nn.Module):
    def __init__(self, trained_expert):
        super(Mixture_of_Expert, self).__init__()
        self.expert = nn.ModuleList(trained_expert)

        #freeze the expert when MoE is training so that we don't wanna change our prediction
        for experts in self.expert:
            for param in experts.parameters():
                param.requires_grad=False

        num_expert = len(trained_expert)

        # let's say all expert have the same input dim
        input_dim = trained_expert[0].layer1.in_features
        self.gating = Gating(input_dim, num_expert)

    def forward(self, x):
        weights = self.gating(x)
        outputs = torch.stack([experts(x) for experts in self.expert], dim=2) # expert output

        # adjust weights output tensor
        weights = weights.unsqueeze(1).expand_as(outputs)

        # multiply and sum
        return torch.sum(outputs*weights, dim=2)



In [6]:
# Parameters
num_samples = 5000
input_dim = 4

# Generate balanced labels and features
labels_per_class = num_samples // 3
remainder = num_samples % 3
y_data = torch.cat([
    torch.full((labels_per_class,), 0),
    torch.full((labels_per_class,), 1), 
    torch.full((labels_per_class + remainder,), 2)
]).long()
x_data = torch.randn(num_samples, input_dim)

# Add class-specific bias
bias_map = {0: (0, 1.0), 1: (1, -1.0), 2: (0, -1.0)}  # (feature_idx, bias_value)
for label, (feat_idx, bias) in bias_map.items():
    mask = y_data == label
    x_data[mask, feat_idx] += bias

# Shuffle data
perm = torch.randperm(num_samples)
x_data, y_data = x_data[perm], y_data[perm]

# Split data: 40% experts, 32% MoE train, 8% test
split1, split2 = num_samples // 2, int(num_samples * 0.9)

# Expert data (each expert sees 2 out of 3 classes)
expert_masks = [
    (y_data[:split1] == 0) | (y_data[:split1] == 1),  # Expert 1: classes 0,1
    (y_data[:split1] == 1) | (y_data[:split1] == 2),  # Expert 2: classes 1,2  
    (y_data[:split1] == 0) | (y_data[:split1] == 2),  # Expert 3: classes 0,2
]

expert_data = []
min_samples = min(mask.sum() for mask in expert_masks)
for i, mask in enumerate(expert_masks):
    expert_data.append((
        x_data[:split1][mask][:min_samples],
        y_data[:split1][mask][:min_samples]
    ))

# MoE training and test data
x_train_moe, y_train_moe = x_data[split1:split2], y_data[split1:split2]
x_test, y_test = x_data[split2:], y_data[split2:]

# Unpack expert data
(x_expert1, y_expert1), (x_expert2, y_expert2), (x_expert3, y_expert3) = expert_data

print(f"MoE train: {x_train_moe.shape}, Test: {x_test.shape}")
print(f"Experts: {x_expert1.shape}, {x_expert2.shape}, {x_expert3.shape}")


MoE train: torch.Size([2000, 4]), Test: torch.Size([500, 4])
Experts: torch.Size([1639, 4]), torch.Size([1639, 4]), torch.Size([1639, 4])


In [13]:
# Training parameters
output_dim = 3
hidden_dim = 32
epochs = 500
learning_rate = 0.001

# Initialize experts and optimizers
experts = [Expert(input_dim, hidden_dim, output_dim) for _ in range(3)]
optimizers = [optim.Adam(expert.parameters(), lr=learning_rate) for expert in experts]
criterion = nn.CrossEntropyLoss()

# Expert training data (assuming from previous code)
expert_datasets = [(x_expert1, y_expert1), (x_expert2, y_expert2), (x_expert3, y_expert3)]

# Single training loop for all experts
for epoch in tqdm(range(epochs), desc="Training Experts"):
    for i, (expert, optimizer, (x_data, y_data)) in enumerate(zip(experts, optimizers, expert_datasets)):
        optimizer.zero_grad()
        outputs = expert(x_data)
        loss = criterion(outputs, y_data)
        loss.backward()
        optimizer.step()
        
        # Optional: print loss every 50 epochs
        if epoch % 50 == 0:
            print(f"Epoch {epoch}, Expert {i+1} Loss: {loss.item():.4f}")

print("Training completed!")

Training Experts:   6%|▌         | 29/500 [00:00<00:01, 283.27it/s]

Epoch 0, Expert 1 Loss: 1.0991
Epoch 0, Expert 2 Loss: 1.1377
Epoch 0, Expert 3 Loss: 1.1105
Epoch 50, Expert 1 Loss: 0.9584
Epoch 50, Expert 2 Loss: 0.9745
Epoch 50, Expert 3 Loss: 0.9648


Training Experts:  30%|██▉       | 149/500 [00:00<00:01, 291.53it/s]

Epoch 100, Expert 1 Loss: 0.8680
Epoch 100, Expert 2 Loss: 0.8770
Epoch 100, Expert 3 Loss: 0.8364
Epoch 150, Expert 1 Loss: 0.8198
Epoch 150, Expert 2 Loss: 0.8281
Epoch 150, Expert 3 Loss: 0.7658


Training Experts:  48%|████▊     | 239/500 [00:00<00:00, 289.98it/s]

Epoch 200, Expert 1 Loss: 0.7980
Epoch 200, Expert 2 Loss: 0.8070
Epoch 200, Expert 3 Loss: 0.7357
Epoch 250, Expert 1 Loss: 0.7879
Epoch 250, Expert 2 Loss: 0.7971
Epoch 250, Expert 3 Loss: 0.7220


Training Experts:  65%|██████▌   | 326/500 [00:01<00:00, 275.22it/s]

Epoch 300, Expert 1 Loss: 0.7824
Epoch 300, Expert 2 Loss: 0.7916
Epoch 300, Expert 3 Loss: 0.7147


Training Experts:  76%|███████▌  | 381/500 [00:01<00:00, 259.70it/s]

Epoch 350, Expert 1 Loss: 0.7789
Epoch 350, Expert 2 Loss: 0.7880
Epoch 350, Expert 3 Loss: 0.7102


Training Experts:  87%|████████▋ | 434/500 [00:01<00:00, 253.64it/s]

Epoch 400, Expert 1 Loss: 0.7765
Epoch 400, Expert 2 Loss: 0.7855
Epoch 400, Expert 3 Loss: 0.7072


Training Experts: 100%|██████████| 500/500 [00:01<00:00, 270.62it/s]

Epoch 450, Expert 1 Loss: 0.7750
Epoch 450, Expert 2 Loss: 0.7839
Epoch 450, Expert 3 Loss: 0.7049
Training completed!





In [20]:
moe_model = Mixture_of_Expert(experts)  # Pass the trained experts list
optimizer_moe = optim.Adam(moe_model.parameters(), lr=learning_rate)

print("Training MoE model...")
for epoch in tqdm(range(epochs), desc="Training MoE"):
    optimizer_moe.zero_grad()
    outputs_moe = moe_model(x_train_moe)
    loss_moe = criterion(outputs_moe, y_train_moe)
    loss_moe.backward()
    optimizer_moe.step()
    
    # Optional: print loss every 50 epochs
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, MoE Loss: {loss_moe.item():.4f}")

print("MoE training completed!")

Training MoE model...


Training MoE:   1%|          | 6/500 [00:00<00:08, 58.87it/s]

Epoch 0, MoE Loss: 0.9517


Training MoE:  12%|█▏        | 60/500 [00:00<00:06, 70.15it/s]

Epoch 50, MoE Loss: 0.8781


Training MoE:  23%|██▎       | 115/500 [00:01<00:05, 73.58it/s]

Epoch 100, MoE Loss: 0.8706


Training MoE:  33%|███▎      | 163/500 [00:02<00:04, 70.76it/s]

Epoch 150, MoE Loss: 0.8633


Training MoE:  42%|████▏     | 211/500 [00:02<00:04, 70.25it/s]

Epoch 200, MoE Loss: 0.8587


Training MoE:  52%|█████▏    | 258/500 [00:03<00:03, 69.71it/s]

Epoch 250, MoE Loss: 0.8594


Training MoE:  63%|██████▎   | 314/500 [00:04<00:02, 71.00it/s]

Epoch 300, MoE Loss: 0.8568


Training MoE:  72%|███████▏  | 362/500 [00:05<00:01, 70.87it/s]

Epoch 350, MoE Loss: 0.8559


Training MoE:  82%|████████▏ | 410/500 [00:05<00:01, 70.60it/s]

Epoch 400, MoE Loss: 0.8541


Training MoE:  92%|█████████▏| 460/500 [00:06<00:00, 77.83it/s]

Epoch 450, MoE Loss: 0.8542


Training MoE: 100%|██████████| 500/500 [00:06<00:00, 71.43it/s]

MoE training completed!





In [22]:
def evaluate(model, x, y):
    model.eval()  # Set to evaluation mode
    with torch.no_grad():
        outputs = model(x)
        _, predicted = torch.max(outputs, 1)
        accuracy = (predicted == y).float().mean().item()
    return accuracy

# Evaluate all models
models = {'Expert 1': experts[0], 'Expert 2': experts[1], 'Expert 3': experts[2], 'Mixture_of_Expert': moe_model}
results = {name: evaluate(model, x_test, y_test) for name, model in models.items()}

# Print results
for name, accuracy in results.items():
    print(f"{name} Accuracy: {accuracy:.4f}")

Expert 1 Accuracy: 0.4720
Expert 2 Accuracy: 0.4920
Expert 3 Accuracy: 0.5740
Mixture_of_Expert Accuracy: 0.6420
