In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import pickle
import pandas as pd
import numpy as np

use_cuda = torch.cuda.is_available()
# use_mps = torch.backends.mps.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
with open('word2index.pickle', 'rb') as handle:
    word2index = pickle.load(handle)

In [3]:
train_df=pd.read_pickle('../data/abstracts_data/20k_abstracts/processed_train.pickle')
test_df=pd.read_pickle('../data/abstracts_data/20k_abstracts/processed_test.pickle')

In [4]:
def label_map(label):
    if label == "RESULTS":
        return 0
    elif label == "METHODS":
        return 1
    elif label == "CONCLUSIONS":
        return 2
    elif label == "BACKGROUND":
        return 3
    else: #positive
        return 4

seq_length = 194
def encode_and_pad(tweet, length):
    sos = [word2index["<SOS>"]]
    eos = [word2index["<EOS>"]]
    pad = [word2index["<PAD>"]]

    if len(tweet) < length - 2: # -2 for SOS and EOS
        n_pads = length - 2 - len(tweet)
        encoded = [word2index[w] for w in tweet]
        return sos + encoded + eos + pad * n_pads 
    else: # tweet is longer than possible; truncating
        encoded = [word2index[w] for w in tweet]
        truncated = encoded[:length - 2]
        return sos + truncated + eos

In [5]:
train_set = [(label, tokens) for label, tokens in zip(train_df['target'], train_df['text_tokens'])]
test_set = [(label, tokens) for label, tokens in zip(test_df['target'], test_df['text_tokens'])]

In [6]:
# train_set

In [6]:
train_encoded = [(encode_and_pad(tweet, seq_length), label_map(label)) for label, tweet in train_set]
test_encoded = [(encode_and_pad(tweet, seq_length), label_map(label)) for label, tweet in test_set]

In [7]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        label_list.append(_label)
#         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(_text)
        offsets.append(_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [17]:
batch_size = 50

train_x = np.array([tweet for tweet, label in train_encoded])
train_y = np.array([label for tweet, label in train_encoded])
test_x = np.array([tweet for tweet, label in test_encoded])
test_y = np.array([label for tweet, label in test_encoded])

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_ds = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))


# train_dl = DataLoader(train_ds, shuffle=False, batch_size=batch_size, drop_last=True, collate_fn=collate_batch)
# test_dl = DataLoader(test_ds, shuffle=False, batch_size=batch_size, drop_last=True, collate_fn=collate_batch)

train_dl = DataLoader(train_ds, shuffle=False, batch_size=batch_size, drop_last=True)
test_dl = DataLoader(test_ds, shuffle=False, batch_size=batch_size, drop_last=True)

for text, label in train_dl:
    print(text)
    break

tensor([[  1,   3,   4,  ...,   0,   0,   0],
        [  1,  31,  32,  ...,   0,   0,   0],
        [  1,  44,  45,  ...,   0,   0,   0],
        ...,
        [  1, 395, 396,  ...,   0,   0,   0],
        [  1, 432, 369,  ...,   0,   0,   0],
        [  1, 434, 357,  ...,   0,   0,   0]])


## Test Crypten

In [22]:
# !pip install --user crypten

In [9]:
from tqdm.notebook import tqdm
import crypten.mpc as mpc
import crypten.communicator as comm
import crypten
import time
crypten.init()
torch.set_num_threads(1)

In [18]:
# # @mpc.run_multiprocess(world_size=2)
# def get_time_elapsed_crypten(device, test_loader):
#     plaintext_model = torch.load('./models/pt_text_classification_model_'+f'{torch.cuda.get_device_name(0)}.pth').to(device)
#     dummy_input = text.to(device)

#     private_model = crypten.nn.from_pytorch(plaintext_model, dummy_input)
#     private_model.encrypt(src=0)
#     private_model.eval()
    
#     test_loss = 0
#     correct = 0
#     with torch.no_grad():
#         t0 = time.perf_counter()
#         for data, target in test_loader:
#             target = target
#             data_enc = crypten.cryptensor(data)
# #             print(data_enc)
#             output, hidden = private_model(data_enc)
#             output = output_enc.get_plain_text()
# #             test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
#             preds = torch.argmax(F.softmax(output, dim=1),1)
#             correct += pred.eq(target.view_as(pred)).sum().item()
#         time_elapsed = time.perf_counter() - t0

#     test_loss /= len(test_loader.dataset)

# #     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
# #         test_loss, correct, len(test_loader.dataset),
# #         100. * correct / len(test_loader.dataset)))
#     print('Time Elapsed:{}'.format(time_elapsed))
#     return time_elapsed

# time=get_time_elapsed_crypten(device, test_dl)



ValueError: Cannot perform Gather operation using encrypted indices.

In [28]:
plaintext_model = torch.load('./models/pt_text_classification_model_'+f'{torch.cuda.get_device_name(0)}.pth').to('cpu')
dummy_input = torch.zeros((1,194), dtype=torch.long)

private_model = crypten.nn.from_pytorch(plaintext_model, dummy_input)
private_model.encrypt(src=0)
private_model.eval()

test_dl = DataLoader(test_ds, shuffle=False, batch_size=1, drop_last=True)

test_loss = 0
correct = 0
with torch.no_grad():
    t0 = time.perf_counter()
    for data, target in test_dl:
        target = target
        print(data.shape)
        data_enc = crypten.cryptensor(data)
        print(data_enc.shape)
        output, hidden = private_model(data_enc.flatten())
        output = output_enc.get_plain_text()
#             test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
        preds = torch.argmax(F.softmax(output, dim=1),1)
        correct += pred.eq(target.view_as(pred)).sum().item()
    time_elapsed = time.perf_counter() - t0

test_loss /= len(test_loader.dataset)



torch.Size([1, 194])


TypeError: CrypTensor class cannot be instantiated directly.

In [23]:
print("Model successfully encrypted:", private_model.encrypted)

Model successfully encrypted: True


In [24]:
list(private_model.named_modules())

[('', Graph encrypted module),
 ('embedding.weight', Parameter encrypted module),
 ('fc.weight', Parameter encrypted module),
 ('fc.bias', Parameter encrypted module),
 ('4', Gather encrypted module),
 ('5', Constant encrypted module),
 ('6', Gather encrypted module),
 ('output', Gemm encrypted module)]

In [25]:
plaintext_model.embedding.state_dict()['weight'].shape

torch.Size([91942, 64])

In [26]:
plaintext_model

TextClassificationModelv2(
  (embedding): Embedding(91942, 64)
  (fc): Linear(in_features=64, out_features=5, bias=True)
)