### Imports and training Configurations

In [1]:
import torch
import syft as sy
import argparse
import numpy as np
import pandas as pd

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torchvision import transforms
from torchvision import datasets, transforms

import time
print("torch version:", torch.__version__)
print("syft version:", sy.__version__)

torch version: 1.3.0
syft version: 0.2.0a2


In [2]:
def getSamples(filename):
    data = pd.read_csv(filename, sep='\t')
    return data.to_numpy()[:,1:].transpose()

dim = 12634
data1 = getSamples("GSE2034-Normal-train.txt")
data2 = getSamples("GSE2034-Tumor-train.txt")

data1Label = np.zeros(len(data1)).reshape((-1, 1))
data2Label = np.ones(len(data2)).reshape((-1, 1))
x = np.concatenate((data1, data2))
y = np.concatenate((data1Label, data2Label))

# shuffle the data
idx = np.random.permutation(len(x))
x,y = x[idx], y[idx]

z = np.concatenate((x, y), axis = 1)

# We follow an 80/20 partitioning for the training and testing sets
n_train_items = 181
n_test_items = 46

# partition the data into training data and test data
x_train = x[:n_train_items]
y_train = y[:n_train_items]

x_train = x_train.reshape((-1,1,dim))
y_train = y_train.reshape((-1,1))

x_test = x[n_train_items:]
y_test = y[n_train_items:]    

x_test = x_test.reshape((-1,1,dim))
y_test = y_test.reshape((-1,1))

### Splitting 80 20 for Train and Test

In [3]:
df_normal = pd.read_csv("GSE2034-Normal-train.txt", sep='\t')

df_train_normal = df_normal.iloc[:, 1:].sample(frac=0.8,random_state=200, axis=1)

df_test_normal = df_normal.drop(df_train_normal.columns, axis=1)
df_test_normal.to_csv("GSE2034-Normal_test.txt", sep='\t')

df_train_normal = pd.concat((df_test_normal['Hybridization REF'], df_train_normal), axis=1)
df_train_normal.to_csv("GSE2034-Normal_train.txt", sep='\t')

In [4]:
df_tumor = pd.read_csv("GSE2034-Tumor-train.txt", sep='\t')

df_train_tumor = df_tumor.iloc[:, 1:].sample(frac=0.8,random_state=200, axis=1)

df_test_tumor = df_tumor.drop(df_train_tumor.columns, axis=1)
df_test_tumor.to_csv("GSE2034-Tumor_test.txt", sep='\t')

df_train_tumor = pd.concat((df_test_tumor['Hybridization REF'], df_train_tumor), axis=1)
df_train_tumor.to_csv("GSE2034-Tumor_train.txt", sep='\t')

### Dataloader

In [5]:
from torch.utils.data import Dataset, DataLoader

class GSE2034Dataset(Dataset):
    """GSE2034 dataset."""

    def __init__(self, normal_csv_file, tumor_csv_file):
        """
        Args:
            normal_csv_file (string): Path to the csv file with normal.
            tumor_csv_file (string): Path to the csv file with tumor.
                """
        df_normal = pd.read_csv(normal_csv_file, sep='\t')
        df_normal.index = df_normal['Hybridization REF']
        del df_normal['Hybridization REF']
        df_normal = df_normal.T
        df_normal['y'] = 0
        
        df_tumor = pd.read_csv(tumor_csv_file, sep='\t')
        df_tumor.index = df_tumor['Hybridization REF']
        del df_tumor['Hybridization REF']
        df_tumor = df_tumor.T
        df_tumor['y'] = 1

        self.df = pd.concat([df_normal, df_tumor])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return (torch.tensor(self.df.iloc[idx].iloc[:-1].values)
                .float()
                .unsqueeze(0), #add channel dimension
                torch.tensor(self.df.iloc[idx].y))

In [6]:
# Load the training data
trainset = GSE2034Dataset("GSE2034-Normal_train.txt",
                                    "GSE2034-Tumor_train.txt")
trainloader = torch.utils.data.DataLoader(trainset, batch_size=20, shuffle=True)

In [7]:
# Load the test data
testset = GSE2034Dataset("GSE2034-Normal_test.txt",
                                    "GSE2034-Tumor_test.txt")
testloader = torch.utils.data.DataLoader(testset, batch_size=20, shuffle=True)

In [19]:
epochs = 30
steps = 0

train_losses, test_losses = [], []
for e in range(epochs):
    running_loss = 0
    for genes, labels in trainloader:
        
        optimizer.zero_grad()
        
        output = model(genes.cuda())
        loss = criterion(output, labels.cuda())
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
    else:
        test_loss = 0
        accuracy = 0
        
        # Turn off gradients for validation, saves memory and computations
        with torch.no_grad():
            for genes, labels in testloader:
                output = model(genes.cuda())
                test_loss += criterion(output, labels.cuda())
                
#                 ps = torch.exp(log_ps)
#                 top_p, top_class = ps.topk(1, dim=1)
#                 equals = top_class == labels.view(*top_class.shape)
#                 accuracy += torch.mean(equals.type(torch.FloatTensor))
                
        train_losses.append(running_loss/len(trainloader))
        test_losses.append(test_loss/len(testloader))

        print("Epoch: {}/{}.. ".format(e+1, epochs),
              "Training Loss: {:.3f}.. ".format(running_loss/len(trainloader)),
              "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)),
#               "Test Accuracy: {:.3f}".format(accuracy/len(testloader))
             )

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch: 1/30..  Training Loss: 0.655..  Test Loss: 1.291.. 
Epoch: 2/30..  Training Loss: 0.531..  Test Loss: 1.236.. 
Epoch: 3/30..  Training Loss: 0.363..  Test Loss: 1.117.. 
Epoch: 4/30..  Training Loss: 0.259..  Test Loss: 1.006.. 
Epoch: 5/30..  Training Loss: 0.194..  Test Loss: 1.133.. 
Epoch: 6/30..  Training Loss: 0.170..  Test Loss: 1.004.. 
Epoch: 7/30..  Training Loss: 0.150..  Test Loss: 1.100.. 
Epoch: 8/30..  Training Loss: 0.144..  Test Loss: 1.075.. 
Epoch: 9/30..  Training Loss: 0.142..  Test Loss: 1.136.. 
Epoch: 10/30..  Training Loss: 0.141..  Test Loss: 1.742.. 
Epoch: 11/30..  Training Loss: 0.140..  Test Loss: 1.061.. 
Epoch: 12/30..  Training Loss: 0.140..  Test Loss: 1.120.. 
Epoch: 13/30..  Training Loss: 0.140..  Test Loss: 1.730.. 
Epoch: 14/30..  Training Loss: 0.140..  Test Loss: 1.121.. 
Epoch: 15/30..  Training Loss: 0.139..  Test Loss: 1.063.. 
Epoch: 16/30..  Training Loss: 0.139..  Test Loss: 1.143.. 
Epoch: 17/30..  Training Loss: 0.139..  Test Loss

In [None]:
gse2034dataset = GSE2034Dataset("GSE2034-Normal-train.txt",
                                    "GSE2034-Tumor-train.txt")


In [None]:
len(gse2034dataset)

In [None]:
dataloader = DataLoader(gse2034dataset, batch_size=30,
                        shuffle=True)

In [None]:
for i_batch, (x,y) in enumerate(dataloader):
    print(i_batch, x.to(device), y)

In [None]:
import torch.optim as optim
criterion = nn.BCELoss() # Binary Cross Entropy 
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
for i_batch, (x,y) in enumerate(dataloader):
#     print(i_batch, sample_batched)    
    # zero the parameter gradients  
    optimizer.zero_grad()
    print(x.shape)

    # forward + backward + optimize
    outputs = net(x.cuda())
    loss = criterion(outputs, y.cuda())
    loss.backward()
    optimizer.step()
    
    print(i_batch, "% Trained", "loss = ", loss.item())

In [None]:
# get the data into (batch, channel = 1, length=dim)
data_torch = torch.from_numpy(x).view([-1, 1, dim]).float()
label_torch = torch.from_numpy(y).view([-1,1,1]).float()

### Deep Learning Model applied to GSE2034

In [14]:
dim = 12634

class Res1d(nn.Module):
    # the conv layers
    def __init__(self, inSize, outSize, kernel=(3,), strides=1,):
        super(Res1d, self).__init__()
        
        # Left , kernel size 3
        # hard-coded to do the padding correctly
        if inSize in (16,64,128,512) and strides > 1:
            pding = 0
        else:
            pding = 1
            
        self.l = nn.Sequential(
            nn.Conv1d(inSize, outSize, kernel, stride=strides, padding=pding, bias=False),
            nn.InstanceNorm1d(outSize)
        )    
        
        # Right, kernel size 1
        if inSize != outSize or strides > 1:
            if strides > 1:
                self.r1 = nn.AvgPool1d(strides)
            else:
                self.r1 = nn.Identity()
                
        self.r = nn.Sequential(
            self.r1,
            nn.Conv1d(inSize, outSize, 1, bias=False),
            nn.InstanceNorm1d(outSize)
        )
    
        self.relu = nn.ReLU()
        
    def forward(self, x):          
        x = self.l(x) + self.r(x)
        return self.relu(x)
    


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        
        self.l = nn.Sequential(
            nn.Linear(dim, 64),
            nn.ReLU(),
            nn.Flatten()
        )

        self.r = nn.Sequential(
            Res1d(1, 4, 3),

            Res1d(4, 8, 3),
            Res1d(8, 8, 3, strides=2),

            Res1d(8, 16, 3),
            Res1d(16, 16, 3, strides=2),

            Res1d(16, 32, 3),
            Res1d(32, 32, 3, strides=2),

            Res1d(32, 64, 3),
            Res1d(64, 64, 3, strides=2),

            Res1d(64, 128, 3),
            Res1d(128, 128, 3, strides=2),

            Res1d(128, 256, 3),
            Res1d(256, 256, 3, strides=2),

            Res1d(256, 512, 3),
            Res1d(512, 512, 3, strides=2),

            Res1d(512, 1024, 3),
            Res1d(1024, 1024, 3, strides=2),

            nn.Flatten()
        )
        
        # size is by experiment and hardcode
        self.last = nn.Sequential(
            nn.Linear(50240,32),
            nn.ReLU(),
            nn.Linear(32,1),
            nn.Sigmoid(),
            nn.Flatten()
        )
    
    def forward(self, x):
        # shape is (batch, channel, time)
        l = self.l(x)
#         l.squeeze_(-2)
        r = self.r(x)
#         r.unsqueeze_(-2) # add channel dimension
        y = torch.cat((l,r),dim=-1)
        y = self.last(y)
        return y


In [15]:
device = torch.device("cuda")
model = Model().to(device)

from torchsummary import summary
summary(model,(1,12634))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 1, 64]         808,640
              ReLU-2                [-1, 1, 64]               0
           Flatten-3                   [-1, 64]               0
            Conv1d-4             [-1, 4, 12634]              12
    InstanceNorm1d-5             [-1, 4, 12634]               0
          Identity-6             [-1, 1, 12634]               0
          Identity-7             [-1, 1, 12634]               0
            Conv1d-8             [-1, 4, 12634]               4
    InstanceNorm1d-9             [-1, 4, 12634]               0
             ReLU-10             [-1, 4, 12634]               0
            Res1d-11             [-1, 4, 12634]               0
           Conv1d-12             [-1, 8, 12634]              96
   InstanceNorm1d-13             [-1, 8, 12634]               0
         Identity-14             [-1, 4

In [16]:
import torch.optim as optim
criterion = nn.BCELoss() # Binary Cross Entropy 
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

### Training on plain data

In [None]:
test_inputs = torch.from_numpy(x_test).view([-1, 1, dim]).float().cuda()
test_labels = torch.from_numpy(y_test).view([-1, 1]).float().cuda()


In [None]:
print(test_inputs.shape)

In [None]:
test_inputs = torch.from_numpy(x_test).view([-1, 1, dim]).float().cuda()
test_labels = torch.from_numpy(y_test).view([-1, 1]).float().cuda()
for batch in range(100):  # loop over the dataset multiple times
    # get the inputs; data is a list of [inputs, labels]
    indices = np.random.choice(len(x_train), size=(30))
    inputs = x_train[indices]
    labels = y_train[indices]
    
    inputs = torch.from_numpy(inputs).float().cuda()
    labels = torch.from_numpy(labels).float().cuda()
    
    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = net(inputs).view([-1,1]).cuda()
    loss = criterion(outputs, labels).cuda()
    loss.backward()
    optimizer.step()
    
    print(batch, "% Trained", "loss = ", loss.item())

print('Finished Training')

In [None]:
net

In [None]:
input_data = data_torch.cuda()
net(input_data)

### Setup of the secure environment

In [None]:
hook = sy.TorchHook(torch)

def connect_to_workers(n_workers):
    return [
        sy.VirtualWorker(hook, id=f"worker{i+1}")
        for i in range(n_workers)
    ]

def connect_to_crypto_provider():
    return sy.VirtualWorker(hook, id="crypto_provider")

workers = connect_to_workers(n_workers=2)
crypto_provider = connect_to_crypto_provider()

def get_private_data_loaders(precision_fractional, workers, crypto_provider):
    
    def secret_share(tensor): #Transforms to fixed precision and secret share a tensor
        return (
            tensor
            .fix_precision(precision_fractional=precision_fractional)
            .share(*workers, crypto_provider=crypto_provider, requires_grad=True)
        )
    
    private_train_loader = [
        (secret_share(torch.Tensor(x_train[i*5:i*5+5])), secret_share(torch.Tensor(y_train[i*5:i*5+5])))
        for i in range (n_train_items)
        if i < n_train_items / 5
    ]
    
    private_test_loader = [
        (secret_share(torch.Tensor(x_test[i*5:i*5+5])), secret_share(torch.Tensor(y_test[i*5:i*5+5])))
        for i in range (n_train_items)
        if i < n_train_items / 5
    ]
    return private_train_loader, private_test_loader
    
    
private_train_loader, private_test_loader = get_private_data_loaders(
    precision_fractional=3,
    workers=workers,
    crypto_provider=crypto_provider
)

### Model for encrypted data

In [None]:
class Res1d(nn.Module):
    # the conv layers
    def __init__(self, inSize, outSize, kernel=(3,), strides=1,):
        super(Res1d, self).__init__()
        self.inSize = inSize
        self.outSize = outSize
        # hard-coded to do the padding correctly
        if inSize in (16,64,128,512) and strides is 2:
            pding = 0
        else:
            pding = 1
        self.l1 = nn.Conv1d(inSize, outSize, kernel, stride=strides, padding=pding, bias=False)
        self.l2 = nn.Identity()
        
        if strides > 1 or inSize != outSize:
            if strides > 1:
                self.r1 = nn.Identity()
                self.r2 = nn.AvgPool1d(strides)
            else:
                self.r1 = None
                self.r2 = None
            self.r3 = nn.Conv1d(inSize, outSize, 1, bias=False)
            self.r4 = nn.Identity()
            
    def forward(self, x):
        l = x
        l = self.l1(l)
        l = self.l2(l)
        
        if self.r1 is not None:
            r = self.r1(x)
            r = self.r2(r)
            r = self.r3(r)
            r = self.r4(r)
        else:
            r = self.r3(x)
            r = self.r4(r)
            
        x = l + r
        print("forwarding: ", self.inSize, self.outSize)
        return F.relu(x)
    
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.l1 = nn.Linear(dim, 64)
        self.l2 = F.relu
        
        self.r1 = Res1d(1, 4, 3)
        
        self.r2 = Res1d(4, 8, 3)
        self.r3 = Res1d(8, 8, 3, strides=2)
        
        self.r4 = Res1d(8, 16, 3)
        self.r5 = Res1d(16, 16, 3, strides=2)
        
        self.r6 = Res1d(16, 32, 3)
        self.r7 = Res1d(32, 32, 3, strides=2)
        
        self.r8 = Res1d(32, 64, 3)
        self.r9 = Res1d(64, 64, 3, strides=2)
        
        self.r10 = Res1d(64, 128, 3)
        self.r11 = Res1d(128, 128, 3, strides=2)
        
        self.r12 = Res1d(128, 256, 3)
        self.r13 = Res1d(256, 256, 3, strides=2)
        
        self.r14 = Res1d(256, 512, 3)
        self.r15 = Res1d(512, 512, 3, strides=2)
        
        self.r16 = Res1d(512, 1024, 3)
        self.r17 = Res1d(1024, 1024, 3, strides=2)
        
        # size is by experiment and hardcode
        self.lastLinear = nn.Linear(50240,32)
        self.lastRelu = F.relu
        self.lastAgg = nn.Linear(32,1)
        self.lastSigmoid = nn.Sigmoid()
    
    def forward(self, x):           
        # shape is (batch, channel, time)
        l = x
        l = x.view(x.shape[0],-1)
        l = self.l1(l)
        l = self.l2(l)

        # conv layers should operate on time
        r = x
        r = self.r1(r)
        r = self.r4(self.r3(self.r2(r)))
        r = self.r8(self.r7(self.r6(self.r5(r))))
        r = self.r12(self.r11(self.r10(self.r9(r))))
        r = self.r16(self.r15(self.r14(self.r13(r))))
        r = self.r17(r)
        
        # flatten l
        r = r.view(x.shape[0],-1)
        l = l.view(x.shape[0],-1)
        y = torch.cat((l,r),dim=1)
        y = self.lastLinear(y)
        y = self.lastRelu(y)
        y = self.lastAgg(y)
        y = self.lastSigmoid(y)
        return y

### Private Training of the model

In [None]:
def train(model, private_train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(private_train_loader): # <-- now it is a private dataset
        print('training...')
        start_time = time.time()
        optimizer.zero_grad()
        
        output = model(data)
        
        batch_size = output.shape[0]
        loss = ((output - target)**2).sum().refresh()/batch_size
        
        loss.backward()
        
        optimizer.step()

        
#Although the aim of this project was to provide private training, we prepared code for testing as well
def test(model, private_test_loader):
    print('testing...')
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in private_test_loader:
            start_time = time.time()
            
            output = model(data)
            pred = output.argmax(dim=1)
            correct += pred.eq(target.view_as(pred)).sum()

    correct = correct.get().float_precision()

In [None]:
model = Net()
model = model.fix_precision().share(*workers, crypto_provider=crypto_provider, requires_grad=True)

optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optimizer.fix_precision() 
for epoch in range(1, 2):
    train(model, private_train_loader, optimizer, epoch)
    test(model, private_test_loader)