In [3]:
!pip install numpy 

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy
  Using cached numpy-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Installing collected packages: numpy
Successfully installed numpy-2.2.2


In [5]:
!pip install scikit-learn  

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 KB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting scipy>=1.6.0
  Downloading scipy-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (40.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed 

In [15]:
!pip install matplotlib seaborn 

Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting contourpy>=1.0.1
  Downloading contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [16]:
import numpy as np 
from sklearn.datasets import load_wine 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


In [8]:
class Expert:
    def __init__(self, input_dim, hidden_dim, output_dim):
        self.W1 = np.random.randn(input_dim, hidden_dim) * np.sqrt(2.0/input_dim)
        self.b1 = np.zeros(hidden_dim)
        self.W2 = np.random.randn(hidden_dim, output_dim) * np.sqrt(2.0/hidden_dim)
        self.b2 = np.zeros(output_dim)
    
    def relu(self, x):
        return np.maximum(0, x)
    
    def forward(self, x):
        hidden = np.dot(x, self.W1) + self.b1
        hidden = self.relu(hidden)
        output = np.dot(hidden, self.W2) + self.b2
        return output
    
    def get_gradients(self, x, grad_output):
        hidden_pre = np.dot(x, self.W1) + self.b1
        hidden = self.relu(hidden_pre)
        
        grad_W2 = np.dot(hidden.T, grad_output)
        grad_b2 = np.sum(grad_output, axis=0)
        
        grad_hidden = np.dot(grad_output, self.W2.T)
        grad_hidden[hidden_pre <= 0] = 0 
        
        grad_W1 = np.dot(x.T, grad_hidden)
        grad_b1 = np.sum(grad_hidden, axis=0)
        
        return {'W1': grad_W1, 'b1': grad_b1, 'W2': grad_W2, 'b2': grad_b2}
    
    def update_params(self, grads, learning_rate):
        self.W1 -= learning_rate * grads['W1']
        self.b1 -= learning_rate * grads['b1']
        self.W2 -= learning_rate * grads['W2']
        self.b2 -= learning_rate * grads['b2']


In [9]:
class Gating:
    def __init__(self, input_dim, num_experts):
        self.W = np.random.randn(input_dim, num_experts) * np.sqrt(2.0/input_dim)
        self.b = np.zeros(num_experts)
    
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    
    def forward(self, x):
        logits = np.dot(x, self.W) + self.b
        return self.softmax(logits)
    
    def get_gradients(self, x, grad_output):
        grad_W = np.dot(x.T, grad_output)
        grad_b = np.sum(grad_output, axis=0)
        return {'W': grad_W, 'b': grad_b}
    
    def update_params(self, grads, learning_rate):
        self.W -= learning_rate * grads['W']
        self.b -= learning_rate * grads['b']

In [10]:
class MixtureOfExperts:
    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        self.experts = [Expert(input_dim, hidden_dim, output_dim) 
                       for _ in range(num_experts)]
        self.gating = Gating(input_dim, num_experts)
        self.num_experts = num_experts
    
    def forward(self, x):
        expert_outputs = np.stack([expert.forward(x) for expert in self.experts])
        expert_outputs = np.transpose(expert_outputs, (1, 0, 2))
        gating_weights = self.gating.forward(x)
        final_output = np.sum(expert_outputs * gating_weights[..., np.newaxis], axis=1)
        return final_output, gating_weights, expert_outputs
    
    def train_step(self, x, y, learning_rate):
        output, gating_weights, expert_outputs = self.forward(x)
        loss = np.mean((output - y) ** 2)
        batch_size = x.shape[0]
        grad_output = 2 * (output - y) / batch_size
        expert_grads = []
        for i in range(self.num_experts):
            expert_grad = grad_output * gating_weights[:, i:i+1]
            expert_grads.append(self.experts[i].get_gradients(x, expert_grad))
        
        grad_gating = np.sum(grad_output[..., np.newaxis] * expert_outputs, axis=2)
        gating_grads = self.gating.get_gradients(x, grad_gating)
        
        for i, expert in enumerate(self.experts):
            expert.update_params(expert_grads[i], learning_rate)
        self.gating.update_params(gating_grads, learning_rate)
        
        return loss

In [11]:
def load_wine_quality():
    wine = load_wine()
    X = wine.data
    y = wine.target.reshape(-1, 1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

In [12]:
def train_moe(model, X_train, y_train, epochs=100, learning_rate=0.01, batch_size=32):
    n_samples = X_train.shape[0]
    losses = []
    
    for epoch in range(epochs):
        indices = np.random.permutation(n_samples)
        X_shuffled = X_train[indices]
        y_shuffled = y_train[indices]
        
        epoch_losses = []
        for i in range(0, n_samples, batch_size):
            X_batch = X_shuffled[i:i + batch_size]
            y_batch = y_shuffled[i:i + batch_size]
            
            loss = model.train_step(X_batch, y_batch, learning_rate)
            epoch_losses.append(loss)
        
        avg_loss = np.mean(epoch_losses)
        losses.append(avg_loss)
        
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")
    
    return losses


In [13]:
if __name__ == "__main__":
    X_train, X_test, y_train, y_test = load_wine_quality()
    input_dim = X_train.shape[1]
    hidden_dim = 32
    output_dim = 1
    num_experts = 3
    model = MixtureOfExperts(input_dim, hidden_dim, output_dim, num_experts)
    losses = train_moe(model, X_train, y_train, epochs=100, learning_rate=0.01)
    test_output, _, _ = model.forward(X_test)
    test_mse = np.mean((test_output - y_test) ** 2)
    print(f"Test MSE: {test_mse:.4f}")

Epoch 0, Loss: 1.4366
Epoch 10, Loss: 0.2098
Epoch 20, Loss: 0.1408
Epoch 30, Loss: 0.0993
Epoch 40, Loss: 0.0888
Epoch 50, Loss: 0.0720
Epoch 60, Loss: 0.0645
Epoch 70, Loss: 0.0557
Epoch 80, Loss: 0.0614
Epoch 90, Loss: 0.0491
Test MSE: 0.0706
