In [None]:
import numpy as np
import matplotlib.pylab as plt
from sklearn.preprocessing import StandardScaler
from numba import njit, jit
from numba.experimental import jitclass
import pickle
# Module-level StandardScaler instance; none of the cells visible here reference it.
stand_scaler = StandardScaler()

In [None]:
max_seed = 424242  # not referenced by the cells visible here

d_theta = 100  # dimension of the shared variable theta
d_w = 50  # dimension of each client's personal variable w
n_data = 10000  # number of rows in every generated client matrix
lmb = 0  # not referenced by the cells visible here
M = 32  # number of clients

# Single seeded generator; data generation and client sampling below all draw from it,
# so the order in which cells run determines the generated data.
rng = np.random.default_rng(42)
noise_scale = 0.  # overpersonalization (passed to the generators, which do not use it)

In [None]:
# Ground-truth shared parameter; generate_A_b and generate_A_B_c read it as a global.
theta = rng.normal(size=d_theta)

def generate_A_b(n, d, noise_scale):
    """Draw a random design matrix and the targets it produces under the global `theta`.

    Args:
        n: number of rows of the design matrix.
        d: number of columns; also the divisor applied to every entry.
        noise_scale: accepted for interface symmetry but not used, so the
            returned targets are exactly ``A @ theta``.

    Returns:
        Tuple ``(A, b)`` where ``A`` has shape ``(n, d)`` and ``b = A @ theta``.

    Draws from the module-level ``rng`` and reads the module-level ``theta``.
    """
    design = rng.uniform(size=(n, d))
    design = design / d
    targets = design @ theta
    return design, targets


def generate_A_B_c(n, d1, d2, noise_scale):
    """Draw the shared/personal design matrices of one client and its targets.

    Args:
        n: number of rows of both matrices.
        d1: number of columns of the shared block ``A`` (multiplied by the
            global ``theta``).
        d2: number of columns of the personal block ``B``.
        noise_scale: accepted but not used; the targets carry no added noise.

    Returns:
        Tuple ``(A, B, y)`` with ``y = A @ theta + B @ x2`` where ``x2`` is a
        freshly drawn Gaussian vector of length ``d2``.

    Draws from the module-level ``rng`` and reads the module-level ``theta``.
    """
    shared_block = rng.uniform(size=(n, d1)) / d1
    personal_block = rng.uniform(size=(n, d2)) / d2
    # A length-d1 Gaussian vector is drawn and discarded. The draw is kept so the
    # random stream (and hence every later sample) is unchanged.
    rng.normal(size=d1)
    personal_weights = rng.normal(size=d2)
    targets = shared_block @ theta + personal_block @ personal_weights
    return shared_block, personal_block, targets

$$\phi_m(\theta) = \frac{1}{2}\|H_m\theta - b_m\|^2$$
$$\nabla \phi_m(\theta) = H_m^\top(H_m\theta - b_m)$$

In [None]:
class Phi_m:
    """Least-squares term phi_m(theta) = 0.5 * ||H_m theta - b_m||^2 of one client.

    Attributes:
        d: dimension of theta.
        n: number of rows of ``Hm``.
        noise: the ``noise_scale`` value passed at construction.
        Hm, bm: data matrix and target vector returned by ``generate_A_b``.
    """

    def __init__(self, d_theta, n_data, noise_scale):
        self.d = d_theta
        self.n = n_data
        self.noise = noise_scale
        self.Hm, self.bm = generate_A_b(n_data, d_theta, noise_scale)

    def _residual(self, theta):
        """Return H_m theta - b_m."""
        return self.Hm @ theta - self.bm

    def func(self, theta):
        """Return 0.5 * ||H_m theta - b_m||^2."""
        residual = self._residual(theta)
        # np.sum over axis 0 reduces the same axis the builtin sum() iterated over,
        # but runs vectorised instead of looping over elements in Python.
        return 0.5 * np.sum(residual ** 2, axis=0)

    def grad(self, theta):
        """Return the gradient H_m^T (H_m theta - b_m)."""
        return self.Hm.T @ self._residual(theta)

$$
\begin{aligned}
f_m(\theta, w) &= \phi_m(\theta) + \frac{1}{2}\|A_m\theta+B_mw-y_m\|^2\\
\nabla_1f_m(\theta, w) &= \nabla \phi_m(\theta) + A_m^\top(A_m\theta+B_mw-y_m)\\
\nabla_2f_m(\theta, w) &= B_m^\top(A_m\theta+B_mw-y_m)\\
w_m^*(\theta) &= (B_m^\top B_m)^{-1}(B_m^\top y_m - B_m^\top A_m\theta)
\end{aligned}
$$

In [None]:
class F_m:
    """Objective of client m: f_m(theta, w) = phi_m(theta) + 0.5 * ||A_m theta + B_m w - y_m||^2.

    ``theta`` is the shared variable and ``w`` the client's personal variable.

    Attributes:
        phi_m: the client's ``Phi_m`` term (depends on theta only).
        d_theta, d_w: dimensions of theta and w.
        m: client index supplied by the caller.
        n: number of data rows.
        noise: the ``noise_scale`` value passed at construction.
        Am, Bm, ym: matrices and targets returned by ``generate_A_B_c``.
    """

    def __init__(self, d_theta, d_w, n_data, noise_scale, m):
        # Phi_m is built first, so its data is drawn from the generator before Am/Bm/ym.
        self.phi_m = Phi_m(d_theta, n_data, noise_scale)
        self.d_theta = d_theta
        self.d_w = d_w
        self.m = m
        self.n = n_data
        self.noise = noise_scale
        self.Am, self.Bm, self.ym = generate_A_B_c(n_data, d_theta, d_w, noise_scale)

    def _residual(self, theta, w):
        """Return A_m theta + B_m w - y_m."""
        return self.Am @ theta + self.Bm @ w - self.ym

    def func(self, theta, w):
        """Return f_m(theta, w)."""
        residual = self._residual(theta, w)
        # Vectorised reduction over axis 0 (the axis the builtin sum() iterated over).
        return self.phi_m.func(theta) + 0.5 * np.sum(residual ** 2, axis=0)

    def grad_theta(self, theta, w):
        """Return the partial gradient of f_m with respect to theta."""
        return self.phi_m.grad(theta) + self.Am.T @ self._residual(theta, w)

    def grad_w(self, theta, w):
        """Return the partial gradient of f_m with respect to w."""
        return self.Bm.T @ self._residual(theta, w)

    def opt_w(self, theta):
        """Return the minimiser of f_m(theta, .) by solving the normal equations.

        Solves (B_m^T B_m) w = B_m^T (y_m - A_m theta) with ``np.linalg.solve``,
        which raises ``numpy.linalg.LinAlgError`` when B_m^T B_m is singular.
        """
        gram = self.Bm.T @ self.Bm
        rhs = self.Bm.T @ (self.ym - self.Am @ theta)
        return np.linalg.solve(gram, rhs)

    def operator(self, theta):
        """Return grad_theta evaluated at (theta, opt_w(theta))."""
        w_star = self.opt_w(theta)
        return self.grad_theta(theta, w_star)

    def operator_norm_(self, theta):
        """Return the sum of squared entries of ``operator(theta)``.

        This is the squared Euclidean norm; no square root is taken.
        """
        return np.sum(self.operator(theta) ** 2, axis=0)

In [None]:
class F:
    """Aggregate operator: the sum of the clients' operators divided by ``M``."""

    def __init__(self, clients, d_theta, M):
        self.clients, self.M, self.d_theta = clients, M, d_theta

    def operator(self, theta):
        """Return (1 / M) * sum of ``client.operator(theta)`` over ``self.clients``."""
        shape = (self.d_theta,)
        accumulator = np.zeros(shape)
        for client in self.clients:
            # Accumulate in place into the float buffer.
            np.add(accumulator, client.operator(theta), out=accumulator)
        return accumulator / self.M

    def operator_norm(self, theta):
        """Return the Euclidean norm of ``operator(theta)``."""
        averaged = self.operator(theta)
        return np.linalg.norm(averaged)

In [None]:
def compute_L(clients, M, n_data):
    """Return the largest per-client smoothness estimate over all clients.

    For each client two quantities are computed:
      * ``L_hat``: the Frobenius norm (``np.linalg.norm`` default for matrices)
        of ``Am^T (I - Bm pinv(Bm)) Am``;
      * ``L_phi``: the largest eigenvalue of ``Hm^T Hm``.
    The result is the maximum of these values across every client.

    Args:
        clients: iterable of objects exposing ``Am``, ``Bm`` and ``phi_m.Hm``.
        M: unused; kept for interface compatibility.
        n_data: unused; kept for interface compatibility (the n_data x n_data
            identity is no longer formed).

    Returns:
        The maximum estimate, or ``0`` when ``clients`` is empty.
    """
    L = 0
    for m in clients:
        Am, Bm, Hm = m.Am, m.Bm, m.phi_m.Hm
        # Am^T (I - Bm pinv(Bm)) Am expanded so that only small products are
        # formed, never an n_data x n_data matrix.
        reduced = Am.T @ Am - (Am.T @ Bm) @ (np.linalg.pinv(Bm) @ Am)
        L_hat = np.linalg.norm(reduced)
        # Hm^T Hm is symmetric: eigvalsh returns real eigenvalues in ascending order.
        L_phi = np.linalg.eigvalsh(Hm.T @ Hm)[-1]
        L = max(L, L_hat, L_phi)
    # Return the running maximum, not the value of the last client visited.
    return L

def compute_L_mu(clients):
    """Print, for every client, the extreme eigenvalues of ``Bm^T Bm``.

    Each printed line reads ``client <m> <mu> <L>`` where ``mu`` is the smallest
    and ``L`` the largest eigenvalue of the client's Gram matrix.

    Args:
        clients: iterable of objects exposing ``Bm`` and the index ``m``.

    Returns:
        None; the values are only printed.
    """
    for client in clients:
        gram = client.Bm.T @ client.Bm
        # The Gram matrix is symmetric, so eigvalsh applies: it returns real
        # eigenvalues sorted in ascending order.
        eigs = np.linalg.eigvalsh(gram)
        mu, L = eigs[0], eigs[-1]
        print(f'client {client.m}', mu, L)

In [None]:
# Build the M client objectives; each constructor call draws that client's data from `rng`.
clients = [F_m(d_theta, d_w, n_data, noise_scale, m) for m in range(M)]

In [None]:
# Zero starting points for the shared variable (theta) and the personal variable (w).
theta0 = np.zeros((d_theta,))
w0 = np.zeros((d_w,))

In [None]:
# Norm of the averaged operator at the starting point theta0 (displayed as the cell output).
full_op = F(clients, d_theta, M)
full_op.operator_norm(theta0)

In [None]:
def FFGG(theta0, batch_size, lrout, lrin, T, tau, clients, M, d_theta, d_w, n_data,
         exact_comp=True, history_path=None):
    """Run the outer gradient loop on theta with exact or approximate personal solves.

    Every round samples ``batch_size`` clients without replacement (using the
    module-level ``rng``), evaluates for each one the theta-gradient at a
    personal variable ``w``, and moves ``theta`` by ``lrout`` times the batch
    average of those gradients.

    Args:
        theta0: starting point; it is copied, not modified.
        batch_size: number of clients sampled per round.
        lrout: step size of the theta update.
        lrin: step size of the inner gradient steps on ``w`` (inexact mode only).
        T: number of outer rounds.
        tau: number of inner gradient steps on ``w`` (inexact mode only).
        clients: sequence of client objects exposing ``operator``, ``grad_w``
            and ``grad_theta``.
        M: divisor used by the aggregate operator ``F``; also part of the
            default history file name.
        d_theta: dimension of theta.
        d_w: dimension of the personal variable.
        n_data: unused; kept for interface compatibility.
        exact_comp: if true use ``client.operator`` (closed-form ``w``);
            otherwise run ``tau`` gradient steps from a fresh Gaussian ``w``.
        history_path: file the history dict is pickled to. ``None`` keeps the
            original ``comparison_cl:...pkl`` name built from the
            hyper-parameters.

    Returns:
        ``(theta, history)`` where ``history`` has the keys ``'F_norm'``,
        ``'F_norm_min'`` and ``'iter'``. A record is appended (and the dict is
        re-pickled) whenever ``t % 10 == 0``.
    """
    theta = theta0.copy()

    full_op = F(clients, d_theta, M)

    # The full operator norm touches every client, so evaluate it once and reuse it.
    initial_norm = full_op.operator_norm(theta)
    history = {'F_norm': [initial_norm], 'F_norm_min': [initial_norm]}
    history['iter'] = [0]
    print('iteration 0', history['F_norm'][-1])

    if history_path is None:
        history_path = f'comparison_cl:{M}_lrout:{lrout}_lrin:{lrin}_dt:{d_theta}_dw:{d_w}_T:{T}_tau:{tau}.pkl'

    for t in range(T):
        batch = rng.choice(clients, size=batch_size, replace=False)
        g_theta = np.zeros((d_theta,))
        if exact_comp:
            for m in batch:
                g_theta += m.operator(theta)
        else:
            for m in batch:
                # Approximate the personal minimiser with tau gradient steps
                # from a random start.
                w = rng.normal(size=d_w)
                for _ in range(tau):
                    w -= lrin * m.grad_w(theta, w)
                g_theta += m.grad_theta(theta, w)

        theta -= lrout / batch_size * g_theta

        if t % 10 == 0:
            history['F_norm'].append(full_op.operator_norm(theta))
            history['F_norm_min'].append(min(history['F_norm_min'][-1], history['F_norm'][-1]))
            history['iter'].append(t + 1)
            print(f'iteration {t+1}', history['F_norm'][-1])

            # Checkpoint the history so a partial run can still be inspected.
            with open(history_path, 'wb') as fp:
                pickle.dump(history, fp)

    return theta, history

In [None]:
# Bucketing
def bucketing(grad_s, bucket_size=2, rng=None):
    """Randomly group the gradients into buckets and return each bucket's mean.

    The rows are shuffled along the first axis, split into consecutive groups
    of ``bucket_size`` (the last group may be smaller), and every group is
    averaged.

    Args:
        grad_s: array-like of shape ``(n, ...)``; it is copied and left untouched.
        bucket_size: number of gradients per bucket; must be at least 1.
        rng: optional ``numpy.random.Generator`` used for the shuffle. When
            ``None`` the global ``np.random`` state is used.

    Returns:
        Array with ``ceil(n / bucket_size)`` rows of bucket means.

    Raises:
        ValueError: if ``bucket_size`` is smaller than 1.
    """
    if bucket_size < 1:
        raise ValueError(f'bucket_size must be >= 1, got {bucket_size}')
    # Shuffle a private copy so the caller's data is not reordered as a side effect.
    shuffled = np.array(grad_s)
    if rng is None:
        np.random.shuffle(shuffled)
    else:
        rng.shuffle(shuffled)
    bucket_means = [
        np.mean(shuffled[start:start + bucket_size], axis=0)
        for start in range(0, len(shuffled), bucket_size)
    ]
    return np.array(bucket_means)

# Coordinate Wise Median with bucketing
def cm(grad_s, bucket_size=2):
    """Return the coordinate-wise median of the bucket-averaged gradients.

    Args:
        grad_s: sequence of equally shaped gradient vectors.
        bucket_size: number of gradients averaged per bucket before the
            median is taken.

    Returns:
        Array holding, per coordinate, the median over the bucket means.
    """
    # np.array makes a fresh array, so the caller's list is never reordered.
    bucket_means = bucketing(np.array(grad_s), bucket_size)
    return np.median(bucket_means, axis=0)

def FFGG_w_Byzantine(theta0, batch_size, lrout, lrin, T, tau, clients, M, d_theta, d_w, n_data,
                     exact_comp=True, history_path=None):
    """Run the outer loop with gradient-flipping Byzantine updates and robust aggregation.

    Every round collects one theta-gradient per sampled client, appends
    ``batch_size // 3`` negated operators as Byzantine updates, aggregates all
    of them with ``cm`` (bucketing followed by a coordinate-wise median) and
    moves ``theta`` by ``lrout`` times the aggregate.

    The Byzantine clients are drawn from ``clients`` independently of the
    honest batch, so one client may contribute both an honest and a flipped
    update in the same round.

    Args:
        theta0: starting point; it is copied, not modified.
        batch_size: number of honest clients sampled per round.
        lrout: step size of the theta update.
        lrin: step size of the inner gradient steps on ``w`` (inexact mode only).
        T: number of outer rounds.
        tau: number of inner gradient steps on ``w`` (inexact mode only).
        clients: sequence of client objects exposing ``operator``, ``grad_w``
            and ``grad_theta``.
        M: divisor used by the aggregate operator ``F``; also part of the
            default history file name.
        d_theta: dimension of theta.
        d_w: dimension of the personal variable.
        n_data: unused; kept for interface compatibility.
        exact_comp: if true use ``client.operator`` for the honest updates;
            otherwise run ``tau`` gradient steps from a fresh Gaussian ``w``.
        history_path: file the history dict is pickled to. ``None`` keeps the
            original ``comparison_cl:...pkl`` name. That default is the same
            name ``FFGG`` writes for identical hyper-parameters, so pass an
            explicit path to keep both histories.

    Returns:
        ``(theta, history)`` where ``history`` has the keys ``'F_norm'``,
        ``'F_norm_min'`` and ``'iter'``. A record is appended (and the dict is
        re-pickled) whenever ``t % 10 == 0``.
    """
    theta = theta0.copy()

    full_op = F(clients, d_theta, M)

    # The full operator norm touches every client, so evaluate it once and reuse it.
    initial_norm = full_op.operator_norm(theta)
    history = {'F_norm': [initial_norm], 'F_norm_min': [initial_norm]}
    history['iter'] = [0]
    print('iteration 0', history['F_norm'][-1])

    if history_path is None:
        history_path = f'comparison_cl:{M}_lrout:{lrout}_lrin:{lrin}_dt:{d_theta}_dw:{d_w}_T:{T}_tau:{tau}.pkl'

    n_byzantine = batch_size // 3

    for t in range(T):
        batch = rng.choice(clients, size=batch_size, replace=False)
        if exact_comp:
            grad_s = [m.operator(theta) for m in batch]
        else:
            grad_s = []
            for m in batch:
                # Approximate the personal minimiser with tau gradient steps
                # from a random start.
                w = rng.normal(size=d_w)
                for _ in range(tau):
                    w -= lrin * m.grad_w(theta, w)
                grad_s.append(m.grad_theta(theta, w))
        # add Byzantine as negative gradient (Gradient Flipping Attack)
        for m in rng.choice(clients, size=n_byzantine, replace=False):
            grad_s.append(-m.operator(theta))
        # Robust aggregation: coordinate-wise median with bucketing. A plain
        # np.mean(grad_s, axis=0) would be the non-robust alternative.
        g_theta = cm(grad_s)
        theta -= lrout * g_theta

        if t % 10 == 0:
            history['F_norm'].append(full_op.operator_norm(theta))
            history['F_norm_min'].append(min(history['F_norm_min'][-1], history['F_norm'][-1]))
            history['iter'].append(t + 1)
            print(f'iteration {t+1}', history['F_norm'][-1])
            # The second figure is the norm of the per-coordinate standard deviation.
            print(f"Mean and Std of the updates: {np.linalg.norm(np.mean(grad_s, axis=0))} / {np.linalg.norm(np.std(grad_s, axis=0))}")

            # Checkpoint the history so a partial run can still be inspected.
            with open(history_path, 'wb') as fp:
                pickle.dump(history, fp)

    return theta, history

In [None]:
# Smoothness estimate from compute_L, shown together with its reciprocal 1/L.
L = compute_L(clients, M, n_data)
L, 1/L

In [None]:
# Print, per client, the smallest and largest eigenvalue of Bm^T Bm.
compute_L_mu(clients)

In [None]:
batch_size = M  # sample all M clients every round (sampling is without replacement)
tau = 200  # inner gradient steps on w; only used when exact_comp=False
lrout = 1/L  # outer step size for theta, from the compute_L estimate
lrin = 1/50  # inner step size for w; only used when exact_comp=False
T = 1000  # number of outer rounds

In [None]:
# Baseline run: FFGG with the closed-form personal solve and no Byzantine updates.
theta_ffgg_full, record_ffgg_full = FFGG(theta0, batch_size, lrout, lrin, 
                               T, tau, clients, M, d_theta, d_w, n_data, exact_comp=True)

In [None]:
# Same settings with gradient-flipping Byzantine updates and bucketed coordinate-wise median.
# NOTE(review): with identical hyper-parameters this run pickles its history under the same
# file name as the FFGG run — confirm that overwriting the baseline file is intended.
theta_ffgg_byz, record_ffgg_byz = FFGG_w_Byzantine(theta0, batch_size, lrout, lrin, 
                               T, tau, clients, M, d_theta, d_w, n_data, exact_comp=True)