In [None]:
# %%

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import SGD
from tqdm import tqdm
import json
from torch.utils.data import TensorDataset
import random

from CharRNN import CharRNN

import wandb
# Authenticate this session with Weights & Biases up front, so credential
# problems surface before any data loading or training starts.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrickymoron[0m ([33mriccardo-moroni[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# %% 

def data_pruning(json_train_path, json_test_path, crop_amount=2000, n_clients=100, seed=None):
    """
    Reduce the size of a LEAF dataset by cropping and sub-sampling its clients.

    Only clients with strictly more than 'crop_amount' training samples are
    eligible. Every eligible client keeps 'crop_amount' contiguous training
    samples starting at a pseudo-random offset; 'n_clients' of the eligible
    clients are then drawn without replacement.

    Args:
        json_train_path: path of the LEAF training JSON file.
        json_test_path: path of the LEAF test JSON file.
        crop_amount: number of contiguous training samples kept per client.
        n_clients: number of clients to draw among the eligible ones.
        seed: optional seed for the client draw. When None (the default) the
            module-level `random` generator is used, so the selection changes
            from run to run.

    Returns:
        - 4 dictionaries (X_train, Y_train, X_test, Y_test), keyed by client id.

    Raises:
        ValueError: if fewer than 'n_clients' clients are eligible.
        KeyError: if an eligible training client is missing from the test file.
    """
    with open(json_train_path) as train_json_data:
        train_dict = json.load(train_json_data)
    with open(json_test_path) as test_json_data:
        test_dict = json.load(test_json_data)

    X_train_cropped, Y_train_cropped, X_test_cropped, Y_test_cropped = {}, {}, {}, {}

    # The i-th eligible client uses seed i for its crop offset.
    rand_seed = 0
    for k, train_user in train_dict['user_data'].items():
        # Eligibility is decided on the length actually used to draw the
        # offset, rather than on a positional lookup into 'num_samples' that
        # silently relies on both containers sharing the same ordering.
        n_available = len(train_user['x'])
        if n_available <= crop_amount:
            continue

        # A private RandomState yields the same offset as seeding the global
        # NumPy generator would, without clobbering the global RNG state.
        start = np.random.RandomState(rand_seed).randint(n_available - crop_amount)
        stop = start + crop_amount

        X_train_cropped[k] = train_user['x'][start:stop]
        Y_train_cropped[k] = train_user['y'][start:stop]

        # NOTE(review): the test split is sliced with the offset drawn for the
        # *training* split. If a client's test list is shorter than that
        # offset the slice is truncated or empty -- confirm this is intended.
        test_user = test_dict['user_data'][k]
        X_test_cropped[k] = test_user['x'][start:stop]
        Y_test_cropped[k] = test_user['y'][start:stop]

        rand_seed += 1

    eligible_users = list(X_train_cropped.keys())
    if n_clients > len(eligible_users):
        raise ValueError(
            f"Requested {n_clients} clients but only {len(eligible_users)} have "
            f"more than {crop_amount} training samples"
        )

    sampler = random if seed is None else random.Random(seed)
    users_selected = sampler.sample(eligible_users, n_clients)

    X_train = {key: X_train_cropped[key] for key in users_selected}
    Y_train = {key: Y_train_cropped[key] for key in users_selected}
    X_test = {key: X_test_cropped[key] for key in users_selected}
    Y_test = {key: Y_test_cropped[key] for key in users_selected}

    return X_train, Y_train, X_test, Y_test


def concat_dict_values(my_dict):
    """
    Flatten the values of 'my_dict' into a single list, in dictionary order.

    Values that are lists contribute their elements one by one (one level
    deep only); any other value is added as a single element.
    """
    return [
        item
        for value in my_dict.values()
        for item in (value if isinstance(value, list) else (value,))
    ]

# Location of the LEAF Shakespeare train / test JSON files.
json_train_path = '../../datasets/shakespeare/train/all_data_niid_0_keep_0_train_9.json'
json_test_path = '../../datasets/shakespeare/test/all_data_niid_0_keep_0_test_9.json'

# Per-client dictionaries, cropped and sub-sampled with the default settings.
X_train_pruned, Y_train_pruned, X_test_pruned, Y_test_pruned = data_pruning(
    json_train_path=json_train_path,
    json_test_path=json_test_path,
)

# Centralized setting: merge every client's samples into one flat list per split.
X_train, Y_train, X_test, Y_test = map(
    concat_dict_values,
    (X_train_pruned, Y_train_pruned, X_test_pruned, Y_test_pruned),
)

In [None]:
# %%

# The vocabulary is built from the training inputs only: every distinct
# character of the space-joined training text, in sorted order.
train_sentence = ' '.join(X_train)

# '<OOV>' takes the last index and stands in for characters unseen in training.
vocab_train = [*sorted(set(train_sentence)), '<OOV>']

# Character -> integer id lookup (the id is the position in 'vocab_train').
char_to_idx = dict(zip(vocab_train, range(len(vocab_train))))

In [None]:
# %%

# create a ./tensors/ folder in which to save the (encoded) tensors 

def tokenize_encode(my_list, char_to_idx):
    """
    Encode every string of 'my_list' as a list of integer character ids.

    Args:
        my_list: iterable of strings to encode.
        char_to_idx: mapping from character to integer id. Its '<OOV>' entry
            is used for characters missing from the mapping; if there is no
            such entry, the last index (len(char_to_idx) - 1) is used.

    Returns:
        A list with one list of ids per input string (an empty string gives
        an empty list).
    """
    # The fallback id is taken from the mapping that was passed in, so the
    # function does not depend on any module-level vocabulary.
    oov_token = char_to_idx.get('<OOV>', len(char_to_idx) - 1)

    return [
        [char_to_idx.get(char, oov_token) for char in sentence]
        for sentence in tqdm(my_list)
    ]

# Encode the four splits in a fixed order (train inputs, train targets,
# test inputs, test targets), one NumPy array per split.
X_train_enc, Y_train_enc, X_test_enc, Y_test_enc = (
    np.array(tokenize_encode(split, char_to_idx))
    for split in (X_train, Y_train, X_test, Y_test)
)

# Targets are collapsed to 1-D arrays.
Y_train_enc, Y_test_enc = Y_train_enc.flatten(), Y_test_enc.flatten()

# Integer (long) tensors for both the inputs and the targets.
X_train_tensor, Y_train_tensor, X_test_tensor, Y_test_tensor = (
    torch.tensor(encoded, dtype=torch.long)
    for encoded in (X_train_enc, Y_train_enc, X_test_enc, Y_test_enc)
)

100%|██████████| 200000/200000 [00:01<00:00, 137621.36it/s]
100%|██████████| 200000/200000 [00:00<00:00, 853065.19it/s]
100%|██████████| 3584/3584 [00:00<00:00, 187025.80it/s]
100%|██████████| 3584/3584 [00:00<00:00, 677103.98it/s]


In [None]:
# %%

# training parameters
train_params = dict(batch_size=100, lr=1e-1, epochs=10, momentum=0.9)

# Pair every encoded input with its target.
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

# Training batches are reshuffled; test batches keep the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=train_params['batch_size'],
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=train_params['batch_size'],
    shuffle=False,
)

In [None]:
# %%

# model parameters
model_params = {
    'vocab_size' : len(vocab_train),
    'embed_dim' : 8,
    'lstm_units' : 256,
}

# Run configuration logged to wandb: training + model hyper-parameters, plus
# the optimizer's weight decay so the logged config matches what SGD uses.
all_params = {**train_params, **model_params, 'weight_decay': 4e-4}
wandb.init(
    project='fl',
    name='centralized_shakespeare',
    config=all_params,
)

# A single .cuda() call places the model on the GPU.
model = CharRNN(**model_params).cuda()
print(model)

criterion = nn.CrossEntropyLoss().cuda()
optimizer = SGD(
    model.parameters(),
    lr=train_params['lr'],
    momentum=train_params['momentum'],
    weight_decay=all_params['weight_decay'],
)

def train(model):
    """
    Train 'model' for train_params['epochs'] epochs and evaluate it after each one.

    Relies on the module-level 'train_loader', 'optimizer', 'criterion' and
    'train_params'. After every epoch the weights are written to the same
    checkpoint file (so it always holds the latest epoch), the model is
    evaluated with test(), and the metrics are logged to wandb.

    Args:
        model: the network to optimise; batches are moved to the device its
            parameters live on.

    Returns:
        (accuracies, losses): two lists with one test accuracy / test loss
        entry per epoch.
    """
    import os  # local import: only needed to create the checkpoint folder

    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device

    # Make sure the target folder exists up front, so a missing directory
    # cannot make torch.save fail after a full epoch of training.
    checkpoint_path = f"./saved_models/{train_params['epochs']}epochs_weights.pt"
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    accuracies, losses = [], []
    for t in range(train_params['epochs']):
        model.train()
        for inputs, targets in tqdm(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # Named separately from the test loss collected below.
            train_loss = criterion(outputs, targets)
            train_loss.backward()
            optimizer.step()

        torch.save(model.state_dict(), checkpoint_path)

        # test (after each single epoch)
        acc, loss = test(model)
        wandb.log({'acc': acc, 'loss': loss, 'epoch': t})
        accuracies.append(acc)
        losses.append(loss)

    return accuracies, losses

def test(model):
    """
    Evaluate 'model' on the module-level 'test_loader'.

    Args:
        model: the network to evaluate; it is switched to eval mode and the
            batches are moved to the device its parameters live on.

    Returns:
        (test_accuracy, test_loss): accuracy in percent and the average
        per-sample loss over the whole test set.

    Raises:
        ZeroDivisionError: if 'test_loader' yields no samples.
    """
    model.eval()
    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device

    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        progress_bar = tqdm(test_loader, total=len(test_loader), desc="Testing...")
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            batch_size = targets.size(0)

            # Weight each batch by its size so a smaller final batch does not
            # skew the average. This assumes 'criterion' returns the batch
            # mean, as the default nn.CrossEntropyLoss() does.
            loss_sum += criterion(outputs, targets).item() * batch_size

            _, predicted = outputs.max(1)
            total += batch_size
            correct += predicted.eq(targets).sum().item()

    test_loss = loss_sum / total
    test_accuracy = 100. * correct / total
    print(f'Test Loss: {test_loss:.6f} Acc: {test_accuracy:.2f}%')
    return test_accuracy, test_loss

CharRNN(
  (embedding): Embedding(69, 8)
  (stacked_lstm): LSTM(8, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=69, bias=True)
)


In [None]:
# %%

# Run the full training loop; the returned lists hold one test accuracy and
# one test loss value per epoch.
accuracies, losses = train(model)

100%|██████████| 2000/2000 [00:32<00:00, 60.71it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 149.37it/s]


Test Loss: 2.208349 Acc: 38.62%


100%|██████████| 2000/2000 [00:33<00:00, 59.90it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 160.38it/s]


Test Loss: 2.006232 Acc: 43.00%


100%|██████████| 2000/2000 [00:33<00:00, 59.07it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 112.89it/s]


Test Loss: 1.925065 Acc: 44.08%


100%|██████████| 2000/2000 [00:33<00:00, 59.26it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 167.53it/s]


Test Loss: 1.906087 Acc: 45.06%


100%|██████████| 2000/2000 [00:33<00:00, 59.44it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 143.67it/s]


Test Loss: 1.866817 Acc: 46.07%


100%|██████████| 2000/2000 [00:34<00:00, 58.40it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 113.67it/s]


Test Loss: 1.845212 Acc: 46.65%


100%|██████████| 2000/2000 [00:34<00:00, 58.43it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 154.36it/s]


Test Loss: 1.821997 Acc: 47.35%


100%|██████████| 2000/2000 [00:34<00:00, 58.77it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 149.93it/s]


Test Loss: 1.785317 Acc: 47.80%


100%|██████████| 2000/2000 [00:33<00:00, 58.89it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 150.61it/s]


Test Loss: 1.802861 Acc: 47.41%


100%|██████████| 2000/2000 [00:32<00:00, 60.80it/s]
Testing...: 100%|██████████| 36/36 [00:00<00:00, 125.20it/s]

Test Loss: 1.789084 Acc: 47.49%



