In [1]:
import socket
import struct
import time
import scipy.io
import numpy as np
from functools import partial
from concurrent.futures import ProcessPoolExecutor
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import DirichletPartitioner
from torchvision.transforms import ToTensor
from flwr_datasets.visualization import plot_label_distributions
from numba import njit, jit
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from cython_decoder import cython_sc_decoding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
host = '127.0.0.1'
port = 5000
server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_socket.bind((host, port))
num_nodes = 5

In [None]:
num_nodes = 20
server_socket.listen((num_nodes+1)*2)
node_s = []
node_r = []

try:
    while True:
        client_socket, addr = server_socket.accept()
        server_socket.settimeout(10)
        data = client_socket.recv(1024).decode()
        if data == "Server-R":
            server_s = client_socket
        elif data == "Server-S":
            server_r = client_socket
        elif data == "Node-R":
            node_s.append(client_socket)
        elif data == "Node-S":
            node_r.append(client_socket)
        client_socket.sendall(struct.pack('I',len(b"start"))+b"start")
except socket.timeout:
    print('Timeout')
    server_socket.settimeout(None)

for tmp_socket in node_r:
    tmp_socket.recv(1024)
server_r.recv(65536)

In [None]:
for tmp_socket in node_r:
    tmp_socket.close()
for tmp_socket in node_s:
    tmp_socket.close()
server_s.close()
server_r.close()

In [53]:
fds = FederatedDataset(
    dataset="cifar10",
    partitioners={
        "train": DirichletPartitioner(
            num_partitions=50,
            partition_by="label",
            alpha=0.1,
            seed=42,
            min_partition_size=0,
        ),
    },
)

In [54]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.Resize(32),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

def train_transforms(batch):
  transforms = transform_train
  batch["img"] = [transforms(img) for img in batch["img"]]
  return batch

def test_transforms(batch):
    transforms = transform_test
    batch["img"] = [transforms(img) for img in batch["img"]]
    return batch

train_loader=[]
test_loader=[]
for i in range(50):
    partition_train_test = fds.load_partition(i, "train").train_test_split(0.1)
    partition_train = partition_train_test["train"].with_transform(train_transforms)
    partition_test = partition_train_test["test"].with_transform(test_transforms)
    # centralized_dataset = fds.load_split("test").with_transform(test_transforms)
    train_loader.append(DataLoader(partition_train, batch_size=256, shuffle=True, num_workers=16))
    test_loader.append(DataLoader(partition_test, batch_size=128, shuffle=False, num_workers=16))

In [56]:
from models.vit_small import ViT
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = []
optimizer = []
scheduler = []
criterion = []
scaler = []
for i in range(num_nodes):
    net.append(ViT(
        image_size = 32,
        patch_size = 4,
        num_classes = 10,
        dim = 32,
        depth = 6,
        heads = 8,
        mlp_dim = 32,
        dropout=0.1,
        emb_dropout=0.1
    ).to(device))


    optimizer.append(optim.Adam(net[i].parameters(), lr=0.001))
    scheduler.append(torch.optim.lr_scheduler.CosineAnnealingLR(optimizer[i], 5))
    criterion.append(nn.CrossEntropyLoss())
    scaler.append(torch.cuda.amp.GradScaler(enabled=True))

server_net = ViT(
    image_size = 32,
    patch_size = 4,
    num_classes = 10,
    dim = 32,
    depth = 6,
    heads = 8,
    mlp_dim = 32,
    dropout=0.1,
    emb_dropout=0.1
).to(device)

In [57]:
def train_model(model: nn.Module, 
                train_loader: DataLoader, 
                criterion: nn.Module, 
                device: torch.device, 
                scaler: torch.cuda.amp.GradScaler, 
                optimizer: torch.optim.Optimizer,
                epoch: int,
                nodes: int):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs = batch["img"].to(device)
        labels = batch["label"].to(device)
        with torch.cuda.amp.autocast(enabled=True):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()
        total_samples += labels.size(0)
        _, preds = torch.max(outputs, 1)
        total_correct += (preds == labels).sum().item()
    print(f"Nodes: {nodes}, Epoch: {epoch},Train Loss: {total_loss / total_samples:.4f}, Train Accuracy: {total_correct / total_samples:.4f}")

def evaluate_model(model: nn.Module, 
                   test_loader: DataLoader, 
                   criterion: nn.Module, 
                   device: torch.device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch["img"].to(device)
            labels = batch["label"].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            total_samples += labels.size(0)
            _, preds = torch.max(outputs, 1)
            total_correct += (preds == labels).sum().item()
    print(f"Validation Loss: {total_loss / total_samples:.4f}, Validation Accuracy: {total_correct / total_samples:.4f}\n\t")

In [71]:
for cli in range(0,1):
    start_time = time.time()
    for i in range(100):
        train_model(net[cli], train_loader[cli], criterion[cli], device, scaler[cli], optimizer[cli], i, cli)
        # evaluate_model(net[cli], test_loader[cli], criterion[cli], device)
        scheduler[cli].step()
    print(f"Time taken: {time.time()-start_time}")
    scheduler[cli] = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer[cli], 5)
    

Nodes: 0, Epoch: 0,Train Loss: 0.0060, Train Accuracy: 0.4276
Nodes: 0, Epoch: 1,Train Loss: 0.0060, Train Accuracy: 0.4252
Nodes: 0, Epoch: 2,Train Loss: 0.0060, Train Accuracy: 0.4336
Nodes: 0, Epoch: 3,Train Loss: 0.0060, Train Accuracy: 0.4216
Nodes: 0, Epoch: 4,Train Loss: 0.0060, Train Accuracy: 0.4222
Nodes: 0, Epoch: 5,Train Loss: 0.0059, Train Accuracy: 0.4216
Nodes: 0, Epoch: 6,Train Loss: 0.0060, Train Accuracy: 0.4192
Nodes: 0, Epoch: 7,Train Loss: 0.0060, Train Accuracy: 0.4216
Nodes: 0, Epoch: 8,Train Loss: 0.0059, Train Accuracy: 0.4192
Nodes: 0, Epoch: 9,Train Loss: 0.0058, Train Accuracy: 0.4312
Nodes: 0, Epoch: 10,Train Loss: 0.0057, Train Accuracy: 0.4360
Nodes: 0, Epoch: 11,Train Loss: 0.0057, Train Accuracy: 0.4312
Nodes: 0, Epoch: 12,Train Loss: 0.0057, Train Accuracy: 0.4498
Nodes: 0, Epoch: 13,Train Loss: 0.0056, Train Accuracy: 0.4547
Nodes: 0, Epoch: 14,Train Loss: 0.0055, Train Accuracy: 0.4703
Nodes: 0, Epoch: 15,Train Loss: 0.0054, Train Accuracy: 0.4661
No

In [8]:
weights_dict = {name: param.cpu().detach().numpy() for name, param in net[0].state_dict().items()}

In [9]:
list(weights_dict.keys())

['pos_embedding',
 'cls_token',
 'to_patch_embedding.to_patch_tokens.1.weight',
 'to_patch_embedding.to_patch_tokens.1.bias',
 'to_patch_embedding.to_patch_tokens.2.weight',
 'to_patch_embedding.to_patch_tokens.2.bias',
 'transformer.layers.0.0.norm.weight',
 'transformer.layers.0.0.norm.bias',
 'transformer.layers.0.0.fn.temperature',
 'transformer.layers.0.0.fn.to_qkv.weight',
 'transformer.layers.0.0.fn.to_out.0.weight',
 'transformer.layers.0.0.fn.to_out.0.bias',
 'transformer.layers.0.1.norm.weight',
 'transformer.layers.0.1.norm.bias',
 'transformer.layers.0.1.fn.net.0.weight',
 'transformer.layers.0.1.fn.net.0.bias',
 'transformer.layers.0.1.fn.net.3.weight',
 'transformer.layers.0.1.fn.net.3.bias',
 'transformer.layers.1.0.norm.weight',
 'transformer.layers.1.0.norm.bias',
 'transformer.layers.1.0.fn.temperature',
 'transformer.layers.1.0.fn.to_qkv.weight',
 'transformer.layers.1.0.fn.to_out.0.weight',
 'transformer.layers.1.0.fn.to_out.0.bias',
 'transformer.layers.1.1.norm.we

In [10]:
def numpy_to_bit(array: np.ndarray) -> np.ndarray:
    tmp_byte=np.frombuffer(array.tobytes(),dtype=np.uint8)
    bit_stream = ''.join(format(byte, '08b') for byte in tmp_byte)
    bit_array = np.array([int(bit) for bit in bit_stream], dtype=np.int8)
    return bit_array

def bit_to_numpy(bit_array: np.ndarray) -> np.ndarray:
    bit_stream = ''.join(str(int(bit)) for bit in bit_array)
    byte_array_back = np.array([int(bit_stream[i:i+8], 2) for i in range(0, len(bit_stream), 8)], dtype=np.uint8)
    int_array_back = np.frombuffer(byte_array_back.tobytes(), dtype=np.float32)
    return int_array_back

'''
Encoding and decoding function
'''
@jit(nopython=True)
def encode(u: np.ndarray) -> np.ndarray:
    N = u.shape[0]  # Get the length of u
    n = int(np.log2(N))  # Calculate the log base 2 of N

    if n == 1:
        x = np.array([(u[0] + u[1]) % 2, u[1]],dtype=np.int8)
        return x
    else:
        x1 = encode(np.mod(u[:N//2] + u[N//2:], 2))
        x2 = encode(u[N//2:])
        x = np.concatenate((x1, x2))
        return x

@jit(nopython=True)
def rvsl(y: np.ndarray) -> np.ndarray:
    N = y.shape[0]
    if N == 2:
        return y
    else:
        return np.concatenate((rvsl(y[0:N:2]), rvsl(y[1:N:2])))

def data_process(array):
    bit_array = numpy_to_bit(array)
    array_len = len(bit_array)
    current_array = []
    for i in range(0, array_len, 512):
        sub_array = bit_array[i:i+512]
        if len(sub_array) < 512:
            padding = np.ones((512 - len(sub_array)), dtype=bit_array.dtype)
            sub_array = np.concatenate((sub_array, padding))
        current_array.append(sub_array)
    current_idx = i // 512 + 1
    return current_array, current_idx, array_len

def numpy_array_to_udp_packet(bit_array):
    # Convert the numpy array of bits to a string of bits
    bit_string = ''.join(str(int(bit)) for bit in bit_array)
    # Convert the bit string to bytes
    byte_array = bytearray(int(bit_string[i:i+8], 2) for i in range(0, len(bit_string), 8))
    return byte_array

def udp_packet_to_numpy_array(packet):
    # Convert the byte array back to a bit string
    bit_string = ''.join(format(byte, '08b') for byte in packet)
    # Convert the bit string to a numpy array of floats
    bit_array = np.array([int(bit) for bit in bit_string], dtype=np.int8)
    return bit_array

def data_generate(bit_array:np.ndarray, data_idx:np.ndarray) -> np.ndarray:
    u=np.zeros(1024,dtype=np.int8)
    u[data_idx] = bit_array
    x = encode(u)
    x = rvsl(x)
    return x

def codeword_generate(array_dict, data_idx):
    split_bit = []
    codeword_idx = []
    bit_array_len = []
    codeword_idx.append(0)
    array_process = partial(data_process)
    with ProcessPoolExecutor() as executor:
        for current_array, current_idx, array_len in executor.map(array_process, [array_dict[name] for name in array_dict]):
            split_bit.extend(current_array)
            codeword_idx.append(codeword_idx[-1] + current_idx)
            bit_array_len.append(array_len)
    executor.shutdown(wait=True)
    del executor, array_process
    time1 = time.time()
    encode_partial = partial(data_generate, data_idx=data_idx)
    with ProcessPoolExecutor() as executor:
        codeword = list(executor.map(encode_partial, split_bit))
    executor.shutdown(wait=True)
    time2 = time.time()
    print(f'Encode time: {time2-time1}')

    del executor, encode_partial
    return np.array(codeword,dtype=np.int8), codeword_idx, bit_array_len

def packet_diffusion(codeword, block_len, packet_idx):
    codeword_len = codeword[0].shape[0]
    udp_packet = []
    for idx, i in enumerate(range(0, codeword_len, block_len)):
        # tmp_packet = np.concatenate([tmp_codeword[i:i+block_len] for tmp_codeword in codeword])
        tmp_packet = codeword[:, packet_idx[i:i+block_len]].flatten()
        tmp_udp_packet = struct.pack("I",idx) + numpy_array_to_udp_packet(tmp_packet)
        udp_packet.append(tmp_udp_packet)
    return udp_packet

def encoder_udp(array_dict, data_idx, block_len, packet_idx):
    codeword, codeword_idx, bit_array_len = codeword_generate(array_dict, data_idx)
    udp_packet = packet_diffusion(codeword, block_len, packet_idx)
    return udp_packet, codeword_idx, bit_array_len



def packet_aggregation(udp_packet, packet_idx, block_len, data_idx, freeze_idx, codeword_idx, bit_array_len):
    sort_idx = [struct.unpack("I", tmp_packet[:4])[0] for tmp_packet in udp_packet]
    packet_data_del = np.array([udp_packet_to_numpy_array(tmp_packet[4:]) for _, tmp_packet in sorted(zip(sort_idx, udp_packet))])
    packet_data = np.ones((int(1024/block_len), len(packet_data_del[0])))*0.5
    for i, tmp_idx in enumerate(sorted(sort_idx)):
        packet_data[tmp_idx] = packet_data_del[i]
    
    restore_codeword = []
    inverse_packet_idx = np.argsort(packet_idx)
    for i in range(0, packet_data.shape[1],block_len):
        tmp_codeword = packet_data[:,i:i+block_len].flatten()
        restore_codeword.append(tmp_codeword[inverse_packet_idx])

    decode_partial = partial(decoding, freeze_idx=freeze_idx, data_idx=data_idx)
    with ProcessPoolExecutor() as executor:
        decoding_data = np.array(list(executor.map(decode_partial, restore_codeword)),dtype=np.int8)
    del executor, decode_partial

    restore_array = []
    for i, array_len in enumerate(bit_array_len):
        tmp_array = np.concatenate(decoding_data[codeword_idx[i]:codeword_idx[i+1]])[:array_len]
        restore_array.append(bit_to_numpy(tmp_array))
    return restore_array


def decoding(bit_array, freeze_idx, data_idx):
    # Prepare the necessary arrays and values
    bit_array = 1-2*bit_array
    lr0 = np.exp(-(bit_array - 1)**2)
    lr1 = np.exp(-(bit_array + 1)**2)
    lr0_post = lr0 / (lr0 + lr1)
    lr1_post = lr1 / (lr0 + lr1)
    delete_num = 1024 - len(bit_array)
    hd_dec = np.zeros(1024, dtype=np.float64)
    frozen_val = np.zeros(len(freeze_idx), dtype=np.float64)
    pro_prun = np.zeros((1, 2 * 1024 + 1), dtype=np.float64)

    # Call the optimized Cython function
    i_scen_sum, hd_dec_result = cython_sc_decoding(
        lr0_post, lr1_post, freeze_idx.astype(np.float64),
        hd_dec, 1024, 10, 512, frozen_val, delete_num, 0, pro_prun
    )

    # Extract the output for data_idx from hd_dec_result
    data_out = hd_dec_result[data_idx]
    return data_out


In [118]:
random_array = np.random.randint(2, size=(1024,))
bool_array = np.array(random_array, dtype=bool)

In [210]:
f=np.array([[1,0],[1,1]], dtype=bool)
n = int(np.log2(1024))
g_n = f
for _ in range(n-1):
    f = np.kron(f, g_n)
range_n = rvsl(np.arange(1024))
f = f[:,range_n]

In [207]:
np.matmul(random_array, f) % 2

array([1, 0, 1, ..., 1, 0, 1])

In [200]:
np.all(rvsl(encode(random_array)) == np.dot(random_array, f) % 2)

True

In [11]:
# Export all weights to numpy arrays
weights_dict = {name: param.cpu().detach().numpy() for name, param in net[0].state_dict().items()}
N = 1024
n = 10
rate = 0.5
K = round(N*rate)
c_1024 = np.load('c_1024.npy')
coding_list = scipy.io.loadmat("1024-3db-d=2-mean.mat")["count_number"]
coding_index = np.argsort(coding_list[:,1])
info_idx = coding_index[:K]
freeze_idx = coding_index[K:]

# sort the final index
info_ni = np.sort(info_idx)
freeze_ni = np.sort(freeze_idx)

udp_packet, codeword_idx, bit_array_len = encoder_udp(weights_dict, info_ni, 8, c_1024)

Encode time: 7.239225625991821


In [64]:
def get_gn(length:int) -> np.ndarray:
    n = int(np.log2(length))
    f=np.array([[1,0],[1,1]], dtype=bool)
    g_n = f
    for _ in range(n-1):
        g_n = np.kron(g_n, f)
    g_order = rvsl(np.arange(length))
    g_n = g_n[:, g_order]
    return g_n

def matrix_process(array, data_idx):
    bit_array = np.frombuffer(array.tobytes(), dtype=np.uint8)
    array_len = bit_array.shape[0] * 8
    if bit_array.shape[0] % 64 != 0:
        padding = 255 * np.ones((64 - bit_array.shape[0] % 64), dtype=bit_array.dtype)
        bit_array = np.concatenate((bit_array, padding))
    bit_array = np.unpackbits(bit_array)
    bit_array = bit_array.reshape(-1, 512)
    current_array = np.zeros((bit_array.shape[0], 1024), dtype=np.uint8)
    current_array[:, data_idx] = bit_array
    current_idx = bit_array.shape[0]
    return current_array, current_idx, array_len

def matrix_packet_diffusion(codeword):
    if codeword.shape[0] % 8 != 0:
        padding = np.ones((8 - codeword.shape[0] % 8, 1024), dtype=codeword.dtype)
        udp_numpy = np.concatenate((codeword, padding)).T
    udp_numpy = np.packbits(udp_numpy.flatten()).reshape(1024,-1)
    udp_packet = [struct.pack("I", 0) + struct.pack("I", idx) + udp_numpy[idx].tobytes() for idx in range(udp_numpy.shape[0])]
    return udp_packet
    

def matrix_encode(array_dict, data_idx):
    split_bit = []
    codeword_idx = []
    bit_array_len = []
    codeword_idx.append(0)

    for name in array_dict:
        current_array, current_idx, array_len = matrix_process(array_dict[name], data_idx)
        split_bit.extend(current_array)
        codeword_idx.append(codeword_idx[-1] + current_idx)
        bit_array_len.append(array_len)
    split_bit = torch.tensor(np.array(split_bit), dtype=torch.float32).to("cuda:0")
    
    f=np.array([[1,0],[1,1]], dtype=np.int8)
    n = int(np.log2(1024))
    g_n = f
    
    for _ in range(n-1):
        f = np.kron(g_n,f)
    g_order = rvsl(np.arange(1024))
    f = f[:,g_order]
    f = torch.tensor(f, dtype=torch.float32).to("cuda:0")
    
    time1 = time.time()
    # codeword = np.matmul(split_bit, f) % 2
    tmp_codeword = torch.matmul(split_bit, f) % 2
    time2 = time.time()
    codeword = np.array(tmp_codeword.detach().cpu().numpy(), dtype=np.int8)
    print(f'Encode time: {time2-time1}')
    print(codeword.shape)
    return codeword, codeword_idx, bit_array_len

def matrix_udp(array_dict, data_idx):
    codeword, codeword_idx, bit_array_len = matrix_encode(array_dict, data_idx)
    udp_packet = matrix_packet_diffusion(codeword)
    return udp_packet, codeword_idx, bit_array_len, codeword.shape[0]

def matrix_udp_numpy(packet, codeword_num):
    bit_array = np.frombuffer(packet, dtype=np.uint8)
    bit_array = np.unpackbits(bit_array)
    bit_array = bit_array[:codeword_num]
    return bit_array

def matrix_packet_aggregation(udp_packet, codeword_num, data_idx, freeze_idx, codeword_idx, bit_array_len):
    # sort_idx = [struct.unpack("I", tmp_packet[:4])[0] for tmp_packet in udp_packet]
    # packet_data_del = np.array([matrix_udp_numpy(tmp_packet[4:], codeword_num) for _, tmp_packet in sorted(zip(sort_idx, udp_packet))])
    udp_idx = []
    packet_del = []
    for tmp_packet in udp_packet:
        udp_idx.append(struct.unpack("I", tmp_packet[:4])[0])
        packet_del.append(matrix_udp_numpy(tmp_packet[4:], codeword_num))
    packet_del = np.array(packet_del)
    packet_data = np.ones((1024, codeword_num)) * 0.5
    packet_data[udp_idx] = packet_del

    restore_codeword = packet_data.T
    decode_partial = partial(decoding, freeze_idx=freeze_idx, data_idx=data_idx)
    with ProcessPoolExecutor() as executor:
        decoding_data = np.array(list(executor.map(decode_partial, restore_codeword)),dtype=np.int8)
    del executor, decode_partial
    restore_array = []
    for i, array_len in enumerate(bit_array_len):
        tmp_array = np.concatenate(decoding_data[codeword_idx[i]:codeword_idx[i+1]])[:array_len]
        bit_array = np.packbits(tmp_array)
        restore_array.append(np.frombuffer(bit_array.tobytes(), dtype=np.float32))
    return restore_array


# tmp_a, tmp_b, tmp_c = matrix_encode(weights_dict, info_ni)
weights_dict = {name: param.cpu().detach().numpy() for name, param in net[0].state_dict().items()}
# para_list = ["mlp_head.0.weight", "mlp_head.0.bias", "mlp_head.1.weight", "mlp_head.1.bias"]
# para_dict = {name: weights_dict[name] for name in para_list}
# bn_dict = {name: weights_dict[name] for name in weights_dict if "norm" not in name}

tmp_a, tmp_b, tmp_c, tmp_d = matrix_udp(weights_dict, info_ni)
tmp_restore = matrix_packet_aggregation(tmp_a, tmp_d, info_ni, freeze_ni, tmp_b, tmp_c)

Encode time: 0.00016832351684570312
(26103, 1024)


In [51]:
len(tmp_a[0])

3265

In [52]:
total_size = 0
transfer_size = 0
ferp_size = 0
for name in bn_dict:
    total_size += bn_dict[name].nbytes
    transfer_size += bn_dict[name].nbytes + 4 + 4
for i in tmp_a:
    ferp_size += len(i)

print(f'Total Size: {total_size/1024} KB')
print(f'Transfer Size: {transfer_size/1024} KB')
print(f'Ferp Size: {ferp_size/1024} KB')

Total Size: 1628.0625 KB
Transfer Size: 1628.515625 KB
Ferp Size: 3265.0 KB


In [34]:
weights_dict['cls_token'].nbytes

128

In [466]:
a_i = np.ones((1024,1000)) * 0.5
b_i = np.zeros((3, 1000))
b_idx = [3,6,9]
a_i[b_idx] = b_i

In [468]:
a_i[b_idx]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [465]:
len(tmp_a[0]*8)

26136

In [460]:
tmp_d, tmp_e, tmp_f = codeword_generate(weights_dict, info_ni)

Encode time: 6.217371940612793


In [397]:
tmp_c == tmp_f

True

In [448]:
if tmp_a.shape[0] % 8 != 0:
    padding = np.ones((8 - tmp_a.shape[0] % 8, 1024), dtype=tmp_a.dtype)
    udp_numpy = np.concatenate((tmp_a, padding)).T
udp_numpy = np.packbits(udp_numpy.flatten()).reshape(1024,-1)
tmp_array = [udp_numpy[i].tobytes() for i in range(udp_numpy.shape[0])]

In [469]:
tmp_array[0]

b'\xa8S\xdc\xc0q5\xd8Z\xd9\xea\x12z\x820"\xd4\xd5\xe0\x95s\xfd(\x7f\xb3\'\xa5w\xd4K\xa4\xa7\xfd\x07bN\xa9\xfe\x0e\xa2\xf6G\xb1\x9b~N\xdf&D\xc3\xde\x86\xdf\xbcJ\xed@]WNV\x1dO\xd5\x9d\xaa\x13\xe5nzQQ\x91V\xbaq\x9b\xcd\xa7U\xbe\xc6\x16\x1f^\xaeg\xaf\xcb\xa2t,\x08\x15e\xab\xa1\x80H\x84u\x80\xe6\xde\xe8\xed\xc4\xa9\x94\x97\xa6\x8c\xf6\xc0\x98\xe5_\xaf\xf3\xe3\xb3x\xb7\xc8\x14\xc13\xf8e\xa2x\xfc\x8be\xa0G\xb48\x9aCN^r\xcc\x13\x8f\xdd\x8c\xbbp/\xc7R\xc4<0q\x95\x00*\xbf+\x8dJ]\xaf\x13<cJ\x04jP\xca5f\xdb\xb6\x96\x17-n2QVlU9\xee\x85X\xb7\x16\xb3G\xbf\xfb\x7f\xfdF9\x8b\xd2w\xdd\x1d\xe6\x08:G\x11\xe1\xbe\x8f~\xca\x84rO\xc1\xc4\x96\xf7\xab+;.\x8czM~{I{\xec\xb0\xe6\xb4_\xca#\x046\xbd\x9a\xa7_\x193\xfd\x04k\x10P\\\x80:\xc1=bN\x0by\xc8\xc2\xfcA\xa3\xce\xc0\xd5\x1f\x81\x82#\x95\x839&\xad\x83P\x89p-V\x88\x8d\xce@\x02\x81\xb2{}=\xf4\xd0\x7f\xbc\xdc\x8ce\xad\xb9vL\x80g\xa4Q\nu\xe0\x7f\xcd\x95t\x1a\xf6q\xb5\xfc\x08\xe5~\xb9p*Q\xb2\x00m\x92\x82\xd3\xd6\\\xcc\t\xa3q0\xd1\x17\x85Tip\xebz\xd0I\xe4\x8e\x0bJ3\xa

In [436]:
(np.frombuffer(tmp_array[0], dtype=np.uint8) == udp_numpy[0]).all()

True

In [471]:
np.frombuffer(tmp_array[0], dtype=np.uint8)

array([168,  83, 220, ..., 132, 176, 215], dtype=uint8)

In [372]:
tmp_a = weights_dict['transformer.layers.0.0.fn.temperature'].tobytes()
bit_array = np.frombuffer(tmp_a, dtype=np.uint8)
if bit_array.shape[0] % 64 != 0:
    padding = np.ones((64 - bit_array.shape[0] % 64), dtype=bit_array.dtype) * 255
    bit_array = np.concatenate((bit_array, padding))
bit_array = np.unpackbits(bit_array)

In [340]:
# Convert bytes to numpy array of bool
bit_array = np.frombuffer(tmp_a, dtype=np.uint8)
bit_array = torch.from_numpy(np.unpackbits(bit_array))
# bool_array = bit_array.astype(bool)

In [347]:
bit_array.view(-1,512).shape

torch.Size([130, 512])

In [256]:
for _ in range(500):
    bit_array = numpy_to_bit(weights_dict['pos_embedding'])
    array_len = len(bit_array)

    # Calculate the number of rows in the resulting matrix
    num_rows = (array_len + 511) // 512  # Round up to cover all elements

    # Create a matrix of size (num_rows, 512) initialized with ones (for padding)
    matrix = np.ones((num_rows, 512), dtype=bit_array.dtype)

    # Fill in the matrix with slices of bit_array
    matrix[:array_len // 512, :] = bit_array[:array_len].reshape(-1, 512)

    # For the last partial row, copy the remaining elements
    remaining_elements = array_len % 512
    if remaining_elements > 0:
        matrix[array_len // 512, :remaining_elements] = bit_array[-remaining_elements:]

In [None]:
recv_packet = {i:[] for i in range(num_nodes)}
for tmp_id in range(num_nodes):
    for i in range(len(udp_packet)):
        tmp_packet = struct.pack('I',0) + udp_packet[i]
        node_s[0].sendall(struct.pack('I',len(tmp_packet))+tmp_packet)
        if (i+1) % 16 == 0:
            try:
                while True:
                    server_r.settimeout(3)
                    data = server_r.recv(len(tmp_packet))
                    server_r.settimeout(0.5)
                    node_id = struct.unpack('I',data[:4])[0]
                    recv_packet[node_id].append(data[4:])
                    # recv_packet.append(data)
            except socket.timeout:
                # print('Timeout')
                # print(len(recv_packet[0]))
                server_r.settimeout(None)

In [477]:
restored_array = packet_aggregation(udp_packet, c_1024, 8, info_ni, freeze_ni, codeword_idx, bit_array_len)

In [483]:
len(tmp_restore)

82

In [33]:
restored_dict = {}
# restored_array = [torch.tensor(arr).to(device) for arr in restored_array]
for i, name in enumerate(weights_dict):
    restored_dict[name] = torch.tensor(restored_array[i].reshape(weights_dict[name].shape)).to(device)

In [484]:
for i in range(len(restored_array)):
    assert np.array_equal(restored_array[i], list(weights_dict.values())[i].flatten())
# np.array_equal(restored_array[1], list(weights_dict.values())[1].flatten())

In [485]:
for i in range(len(restored_array)):
    assert np.array_equal(restored_array[i], tmp_restore[i])