# Sentiment Analysis with SyferText

## Demo Plan

1. How to use SyferText pipelines.

In [8]:
# Some imports
import syft as sy
from syft.generic.string import String
import torch.nn.functional as F
import torch
import syfertext
import numpy as np
import tqdm
import csv
#from sklearn.model_selection import train_test_split

# Create the PySyft hook around torch. `hook` is reused below to obtain the
# local worker and to construct the virtual workers.
hook = sy.TorchHook(torch)





In [9]:
# Set up the simulated work environment

# The local worker exposed by the hook
me = hook.local_worker

# Two virtual workers, 'bob' and 'alice', built from the same hook
bob, alice = (sy.VirtualWorker(hook, id = worker_id) for worker_id in ('bob', 'alice'))

# A third virtual worker; it is passed as `crypto_provider` whenever
# tensors or the model are shared further down
crypto_provider = sy.VirtualWorker(hook, id = 'crypto_provider')

# Load the 'en_core_web_lg' SyferText language model, owned by the local worker
nlp = syfertext.load('en_core_web_lg', owner = me)

## -3. Download the Dataset

## -2. Load the Dataset

In [10]:
# Set the path to the dataset file
dataset_path = '../../datasets/imdb/imdb.csv'

# Store the dataset as a list of dictionaries.
# Each dictionary has two keys, 'review' and 'label':
#   - 'review' is a PySyft String built from the CSV 'review' column
#   - 'label' is an integer: 1 when the CSV 'sentiment' column is 'positive',
#     0 otherwise
#
# The file is opened with newline='' as the csv module requires, so that
# quoted fields containing line breaks are parsed as a single field.
# The encoding is made explicit so that loading does not depend on the
# platform default (assumes the CSV is UTF-8 encoded -- TODO confirm).
with open(dataset_path, 'r', newline = '', encoding = 'utf-8') as dataset_file:

    # Create a csv reader object yielding one dict per row
    reader = csv.DictReader(dataset_file)

    # Build one entry per row of the CSV file
    dataset_local = [
        dict(review = String(row['review']),
             label = 1 if row['sentiment'] == 'positive' else 0
            )
        for row in reader
    ]

In [11]:
# Peek at the first local example: the owner of its review String and its label
first_example = dataset_local[0]
first_example['review'].owner, first_example['label']

(<VirtualWorker id:me #objects:0>, 1)

## -1. Send the Dataset to Remote Workers

Let's cut the dataset into two equal parts and send each part to a different worker simulating two remote datasets:

In [12]:
def send_split(examples, worker):
    """Send every example of a local split to `worker`.

    Args:
        examples: iterable of dicts with a 'review' entry exposing `.send()`
            and an integer 'label' entry (0 or 1).
        worker: the worker the review and the label are sent to.

    Returns:
        A new list of dicts with the same keys, where 'review' and 'label'
        hold whatever `.send(worker)` returned. The input dicts are left
        untouched, so the cell can be re-run on `dataset_local`.
    """
    remote_examples = []

    for example in examples:

        # One-hot encode the integer label over the two classes
        label_index = torch.tensor([example['label']])
        one_hot_label = torch.zeros(2).scatter(0, label_index, 1)

        remote_examples.append(dict(review = example['review'].send(worker),
                                    label = one_hot_label.send(worker)
                                   ))

    return remote_examples


# Cut the local dataset into two halves. The midpoint is derived from the
# dataset size instead of being hard-coded (25000 for a 50000-example dataset).
split_index = len(dataset_local) // 2

# Send the content of each split to its worker
dataset_bob = send_split(dataset_local[:split_index], bob)
dataset_alice = send_split(dataset_local[split_index:], alice)

In [13]:
# For bob's first example, show the type and the location of both the
# review and the label after sending
for field in ('review', 'label'):
    sent_object = dataset_bob[0][field]
    print(type(sent_object))
    print(sent_object.location)

<class 'syft.generic.pointers.string_pointer.StringPointer'>
<VirtualWorker id:bob #objects:50000>
<class 'torch.Tensor'>
<VirtualWorker id:bob #objects:50000>


-----------

## 0. Create a Dataset class

In [14]:
# This should go into the __getitem__ method of the dataloader

from torch.utils.data import Dataset

class DatasetIMDB(Dataset):
    """A torch Dataset unifying the examples of several remote datasets.

    Each item is produced on access: the review is passed through the
    `nlp` pipeline, its encrypted document vector is requested, and the
    label is fixed-precision encoded and shared among the workers.

    Args:
        sets: list of datasets; each dataset is an iterable of dicts with
            the keys 'review' and 'label'.
        workers: the workers among which vectors and labels are shared.
        crypto_provider: the worker passed as `crypto_provider` when sharing.
        nlp: callable applied to a review that returns an object exposing
            `get_encrypted_vector()`.
    """

    def __init__(self, sets, workers, crypto_provider, nlp):

        self.sets = sets
        self.crypto_provider = crypto_provider
        self.workers = workers

        # Keep the pipeline passed by the caller instead of relying on a
        # global variable that happens to be named `nlp`
        self.nlp = nlp

        # Create a single dataset unifying all datasets
        self._create_dataset()

    def __getitem__(self, index):
        """Return the pair (encrypted vector, encrypted label) for `index`."""

        # Get the example
        example = self.dataset[index]

        # Tokenize the string and get a doc pointer
        doc_ptr = self.nlp(example['review'])

        # Get the encrypted vector embedding for the document, shared among
        # the workers this dataset was configured with
        vector_enc = doc_ptr.get_encrypted_vector(*self.workers,
                                                  crypto_provider = self.crypto_provider,
                                                  requires_grad = True
                                                 )

        # Encrypt the target label the same way, then call `.get()` on the result
        label_enc = example['label'].fix_precision().share(*self.workers,
                                                           crypto_provider = self.crypto_provider,
                                                           requires_grad = True
                                                          ).get()

        return vector_enc, label_enc

    def __len__(self):
        """Return the number of examples in the combined datasets."""

        return len(self.dataset)

    def _create_dataset(self):
        """Create a single dataset unifying examples from all remote datasets
        """
        # Flatten the list of datasets into one list of examples,
        # preserving the order of `self.sets`
        self.dataset = [example for dataset in self.sets for example in dataset]

    @staticmethod
    def collate_fn(batch):
        """Stack a list of (vector, target) pairs into two batch tensors."""

        # Unzip the batch
        vectors, targets = list(zip(*batch))

        # Stack the vectors along a new first dimension
        vectors = torch.stack(vectors)

        # Stack the labels along a new first dimension
        targets = torch.stack(targets)

        return vectors, targets

"""
for example in dataset_local:
    
    # Tokenize
    doc = nlp(example['review'])
    for token in doc:
        
        try:
            print(token.vector)
            
        except KeyError: # Temporary fix while Bachir resolves the issue
            print(np.zeros(300))
    break
    # Get the doc vector
    #vector = doc.vector
"""

## 1. Create a DataLoader

In [15]:
# Training hyper-parameters: the optimizer step size and the number of
# examples per batch
learning_rate, batch_size = 0.001, 32

# Alternative setting, kept for reference:
#   learning_rate = 0.01
#   batch_size = 128

In [16]:
from torch.utils.data import DataLoader

# Build the training set out of the two remote splits
trainset = DatasetIMDB(
    sets = [dataset_bob, dataset_alice],
    workers = [bob, alice],
    crypto_provider = crypto_provider,
    nlp = nlp
)

# Wrap it in a DataLoader that shuffles the examples and batches them
# with the dataset's own collate function
trainloader = DataLoader(
    trainset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = 0,
    collate_fn = trainset.collate_fn
)



## 2. Create an Encrypted Classifier

In [17]:
class Classifier(torch.nn.Module):
    """A single linear layer followed by a ReLU.

    Args:
        in_features: size of each input vector.
        out_features: number of output units.
    """

    def __init__(self, in_features, out_features):
        super().__init__()

        # The only trainable layer of the model
        self.fc = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return the pair (ReLU of the linear output, raw linear output).

        The first element is called `probs` by the training code, but it is
        a plain ReLU activation and is not normalized.
        """
        linear_out = self.fc(x)

        return F.relu(linear_out), linear_out

Initialize and encrypt the classifier:

In [18]:
# Build the plain classifier: 300 input features, 2 output units
classifier = Classifier(in_features = 300, out_features = 2)

# Apply SMPC encryption in two steps: first encode the parameters in
# fixed precision, then share them between bob and alice
classifier = classifier.fix_precision()
classifier = classifier.share(
    bob,
    alice,
    crypto_provider = crypto_provider,
    requires_grad = True
)

print(classifier)


Classifier(
  (fc): Linear(in_features=300, out_features=2, bias=True)
)


## 3. Start training

In [19]:
# Create an optimizer
import torch.optim

# The optimizer class is referenced through its fully qualified path instead
# of an `optim` module alias. The variable `optim` (used by the training loop)
# therefore no longer overwrites the module it was built from, and this cell
# can be re-run without failing on `optim.SGD`.
optim = torch.optim.SGD(params = classifier.parameters(),
                        lr = learning_rate).fix_precision()

# The string literal below is inert: it only keeps an Adam alternative for
# reference. Being the last expression of the cell, it is echoed as the
# cell's output.
'''
optim = optim.Adam(params = classifier.parameters(),
                   lr = learning_rate)
'''

'\noptim = optim.Adam(params = classifier.parameters(),\n                   lr = learning_rate)\n'

In [20]:
# Number of passes over the training set
num_epochs = 10

# Put the classifier in training mode
classifier.train()

# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'FixedPrecisionTensor' object has no attribute 'fix_prec'".
# Its origin is not visible from this notebook and is not addressed here.
for epoch in range(num_epochs):

    for vectors, targets in trainloader:

        # 1). Zero out previous gradients
        optim.zero_grad()

        # 2). Predict sentiments
        probs, logits = classifier(vectors)

        # 3). Compute the loss: summed squared error between the classifier
        #     outputs and the one-hot targets
        loss = ((probs - targets) ** 2).sum()

        # 4). Compute the batch accuracy.
        #     The number of examples is read from the batch itself: the last
        #     batch of an epoch may hold fewer than `batch_size` examples, and
        #     dividing by `batch_size` would then under-report the accuracy.
        n_examples = vectors.shape[0]
        pred_labels = probs.argmax(dim = 1)
        true_labels = targets.argmax(dim = 1)
        n_correct = pred_labels.eq(true_labels).sum().get().float_precision()
        accuracy = 100 * (n_correct / n_examples)

        # 5). Backpropagate the loss
        loss.backward()

        # 6). Update weights
        optim.step()

        # Report the decrypted loss and the accuracy of this batch
        print(f"Ep {epoch} |  train loss: {loss.get().float_precision()}, train accuracy {accuracy:2} %")


Ep 0 |  train loss: 32.0, train accuracy 56.25 %
Ep 0 |  train loss: 32.0, train accuracy 56.25 %
Ep 0 |  train loss: 31.985000610351562, train accuracy 53.125 %
Ep 0 |  train loss: 31.929000854492188, train accuracy 50.0 %
Ep 0 |  train loss: 31.70400047302246, train accuracy 46.875 %
Ep 0 |  train loss: 26.20800018310547, train accuracy 56.25 %
Ep 0 |  train loss: 17.825000762939453, train accuracy 46.875 %
Ep 0 |  train loss: 16.364999771118164, train accuracy 46.875 %
Ep 0 |  train loss: 15.39900016784668, train accuracy 62.5 %
Ep 0 |  train loss: 19.198999404907227, train accuracy 34.375 %
Ep 0 |  train loss: 15.343999862670898, train accuracy 59.375 %
Ep 0 |  train loss: 17.95599937438965, train accuracy 37.5 %
Ep 0 |  train loss: 16.78700065612793, train accuracy 43.75 %
Ep 0 |  train loss: 16.79800033569336, train accuracy 31.25 %
Ep 0 |  train loss: 16.702999114990234, train accuracy 53.125 %
Ep 0 |  train loss: 15.774999618530273, train accuracy 56.25 %
Ep 0 |  train loss: 18

Ep 0 |  train loss: 15.786999702453613, train accuracy 46.875 %
Ep 0 |  train loss: 15.11400032043457, train accuracy 65.625 %
Ep 0 |  train loss: 16.399999618530273, train accuracy 43.75 %
Ep 0 |  train loss: 14.520999908447266, train accuracy 65.625 %
Ep 0 |  train loss: 13.99899959564209, train accuracy 56.25 %
Ep 0 |  train loss: 15.800999641418457, train accuracy 56.25 %
Ep 0 |  train loss: 15.642000198364258, train accuracy 50.0 %
Ep 0 |  train loss: 14.565999984741211, train accuracy 68.75 %
Ep 0 |  train loss: 14.86299991607666, train accuracy 62.5 %
Ep 0 |  train loss: 14.081999778747559, train accuracy 75.0 %
Ep 0 |  train loss: 15.527999877929688, train accuracy 53.125 %
Ep 0 |  train loss: 15.032999992370605, train accuracy 50.0 %
Ep 0 |  train loss: 15.034000396728516, train accuracy 65.625 %
Ep 0 |  train loss: 15.246000289916992, train accuracy 53.125 %
Ep 0 |  train loss: 15.079999923706055, train accuracy 68.75 %
Ep 0 |  train loss: 14.357000350952148, train accuracy 6

Ep 0 |  train loss: 15.199000358581543, train accuracy 59.375 %
Ep 0 |  train loss: 13.75, train accuracy 68.75 %
Ep 0 |  train loss: 13.545999526977539, train accuracy 68.75 %
Ep 0 |  train loss: 15.92300033569336, train accuracy 53.125 %
Ep 0 |  train loss: 14.069999694824219, train accuracy 81.25 %
Ep 0 |  train loss: 14.131999969482422, train accuracy 68.75 %
Ep 0 |  train loss: 14.409000396728516, train accuracy 62.5 %
Ep 0 |  train loss: 12.87399959564209, train accuracy 75.0 %
Ep 0 |  train loss: 12.156000137329102, train accuracy 71.875 %
Ep 0 |  train loss: 14.11299991607666, train accuracy 71.875 %
Ep 0 |  train loss: 13.996999740600586, train accuracy 71.875 %
Ep 0 |  train loss: 13.387999534606934, train accuracy 71.875 %
Ep 0 |  train loss: 13.111000061035156, train accuracy 81.25 %
Ep 0 |  train loss: 16.534000396728516, train accuracy 40.625 %
Ep 0 |  train loss: 14.60099983215332, train accuracy 65.625 %
Ep 0 |  train loss: 13.428999900817871, train accuracy 71.875 %
Ep

Ep 0 |  train loss: 11.909000396728516, train accuracy 71.875 %
Ep 0 |  train loss: 12.638999938964844, train accuracy 75.0 %
Ep 0 |  train loss: 12.704000473022461, train accuracy 81.25 %
Ep 0 |  train loss: 12.927000045776367, train accuracy 71.875 %
Ep 0 |  train loss: 15.626999855041504, train accuracy 56.25 %
Ep 0 |  train loss: 11.789999961853027, train accuracy 84.375 %
Ep 0 |  train loss: 12.192000389099121, train accuracy 71.875 %
Ep 0 |  train loss: 13.88700008392334, train accuracy 62.5 %
Ep 0 |  train loss: 14.95300006866455, train accuracy 62.5 %
Ep 0 |  train loss: 12.711999893188477, train accuracy 75.0 %
Ep 0 |  train loss: 13.555000305175781, train accuracy 71.875 %
Ep 0 |  train loss: 14.494999885559082, train accuracy 65.625 %
Ep 0 |  train loss: 13.357000350952148, train accuracy 62.5 %
Ep 0 |  train loss: 11.83899974822998, train accuracy 87.5 %
Ep 0 |  train loss: 14.937999725341797, train accuracy 62.5 %
Ep 0 |  train loss: 13.199000358581543, train accuracy 65.6

Ep 0 |  train loss: 13.638999938964844, train accuracy 68.75 %
Ep 0 |  train loss: 14.654000282287598, train accuracy 65.625 %
Ep 0 |  train loss: 11.144000053405762, train accuracy 87.5 %
Ep 0 |  train loss: 13.87600040435791, train accuracy 53.125 %
Ep 0 |  train loss: 14.107999801635742, train accuracy 65.625 %
Ep 0 |  train loss: 12.305999755859375, train accuracy 84.375 %
Ep 0 |  train loss: 12.147000312805176, train accuracy 71.875 %
Ep 0 |  train loss: 10.267999649047852, train accuracy 87.5 %
Ep 0 |  train loss: 13.706999778747559, train accuracy 65.625 %
Ep 0 |  train loss: 11.133000373840332, train accuracy 81.25 %
Ep 0 |  train loss: 13.145000457763672, train accuracy 84.375 %
Ep 0 |  train loss: 12.564000129699707, train accuracy 68.75 %
Ep 0 |  train loss: 11.883999824523926, train accuracy 68.75 %
Ep 0 |  train loss: 11.383000373840332, train accuracy 75.0 %
Ep 0 |  train loss: 12.255000114440918, train accuracy 78.125 %
Ep 0 |  train loss: 12.435999870300293, train accur

Ep 0 |  train loss: 14.42300033569336, train accuracy 62.5 %
Ep 0 |  train loss: 13.317000389099121, train accuracy 65.625 %
Ep 0 |  train loss: 13.57800006866455, train accuracy 81.25 %
Ep 0 |  train loss: 12.10099983215332, train accuracy 78.125 %
Ep 0 |  train loss: 10.71500015258789, train accuracy 84.375 %
Ep 0 |  train loss: 11.866000175476074, train accuracy 68.75 %
Ep 0 |  train loss: 12.319999694824219, train accuracy 75.0 %
Ep 0 |  train loss: 12.086999893188477, train accuracy 68.75 %
Ep 0 |  train loss: 12.29699993133545, train accuracy 75.0 %
Ep 0 |  train loss: 11.76099967956543, train accuracy 81.25 %
Ep 0 |  train loss: 13.484000205993652, train accuracy 71.875 %
Ep 0 |  train loss: 14.003000259399414, train accuracy 62.5 %
Ep 0 |  train loss: 12.279000282287598, train accuracy 75.0 %
Ep 0 |  train loss: 12.633000373840332, train accuracy 68.75 %
Ep 0 |  train loss: 12.008000373840332, train accuracy 90.625 %
Ep 0 |  train loss: 13.102999687194824, train accuracy 78.125

Ep 0 |  train loss: 12.763999938964844, train accuracy 68.75 %
Ep 0 |  train loss: 12.482000350952148, train accuracy 75.0 %
Ep 0 |  train loss: 11.99899959564209, train accuracy 71.875 %
Ep 0 |  train loss: 11.46500015258789, train accuracy 78.125 %
Ep 0 |  train loss: 12.20300006866455, train accuracy 65.625 %
Ep 0 |  train loss: 12.105999946594238, train accuracy 75.0 %
Ep 0 |  train loss: 14.414999961853027, train accuracy 56.25 %
Ep 0 |  train loss: 11.989999771118164, train accuracy 68.75 %
Ep 0 |  train loss: 8.982999801635742, train accuracy 84.375 %
Ep 0 |  train loss: 11.90999984741211, train accuracy 75.0 %
Ep 0 |  train loss: 11.4350004196167, train accuracy 75.0 %
Ep 0 |  train loss: 10.446000099182129, train accuracy 84.375 %
Ep 0 |  train loss: 10.355999946594238, train accuracy 78.125 %
Ep 0 |  train loss: 11.638999938964844, train accuracy 71.875 %
Ep 0 |  train loss: 11.829999923706055, train accuracy 87.5 %
Ep 0 |  train loss: 10.29800033569336, train accuracy 78.125

Ep 0 |  train loss: 11.628999710083008, train accuracy 68.75 %
Ep 0 |  train loss: 10.067000389099121, train accuracy 81.25 %
Ep 0 |  train loss: 10.491000175476074, train accuracy 90.625 %
Ep 0 |  train loss: 12.972000122070312, train accuracy 65.625 %
Ep 0 |  train loss: 10.357000350952148, train accuracy 84.375 %
Ep 0 |  train loss: 12.406000137329102, train accuracy 65.625 %
Ep 0 |  train loss: 10.37399959564209, train accuracy 78.125 %
Ep 0 |  train loss: 14.350000381469727, train accuracy 68.75 %
Ep 0 |  train loss: 11.322999954223633, train accuracy 71.875 %
Ep 0 |  train loss: 12.934000015258789, train accuracy 68.75 %
Ep 0 |  train loss: 9.967000007629395, train accuracy 87.5 %
Ep 0 |  train loss: 9.550000190734863, train accuracy 81.25 %
Ep 0 |  train loss: 12.63599967956543, train accuracy 71.875 %
Ep 0 |  train loss: 11.664999961853027, train accuracy 84.375 %
Ep 0 |  train loss: 12.644000053405762, train accuracy 71.875 %
Ep 0 |  train loss: 11.461999893188477, train accur

Ep 0 |  train loss: 11.545999526977539, train accuracy 71.875 %
Ep 0 |  train loss: 8.6899995803833, train accuracy 87.5 %
Ep 0 |  train loss: 10.805999755859375, train accuracy 78.125 %
Ep 0 |  train loss: 12.923999786376953, train accuracy 62.5 %
Ep 0 |  train loss: 10.696000099182129, train accuracy 81.25 %
Ep 0 |  train loss: 10.710000038146973, train accuracy 81.25 %
Ep 0 |  train loss: 12.930000305175781, train accuracy 71.875 %
Ep 0 |  train loss: 11.461000442504883, train accuracy 81.25 %
Ep 0 |  train loss: 11.277000427246094, train accuracy 75.0 %
Ep 0 |  train loss: 12.23799991607666, train accuracy 71.875 %
Ep 0 |  train loss: 11.918999671936035, train accuracy 78.125 %
Ep 0 |  train loss: 14.895000457763672, train accuracy 65.625 %
Ep 0 |  train loss: 9.029000282287598, train accuracy 93.75 %
Ep 0 |  train loss: 11.598999977111816, train accuracy 65.625 %
Ep 0 |  train loss: 12.27299976348877, train accuracy 81.25 %
Ep 0 |  train loss: 10.541000366210938, train accuracy 81

Ep 0 |  train loss: 11.331000328063965, train accuracy 78.125 %
Ep 0 |  train loss: 11.656000137329102, train accuracy 71.875 %
Ep 0 |  train loss: 10.918999671936035, train accuracy 78.125 %
Ep 0 |  train loss: 10.289999961853027, train accuracy 81.25 %
Ep 0 |  train loss: 12.649999618530273, train accuracy 62.5 %
Ep 0 |  train loss: 11.5600004196167, train accuracy 75.0 %
Ep 0 |  train loss: 9.918999671936035, train accuracy 87.5 %
Ep 0 |  train loss: 12.397000312805176, train accuracy 71.875 %
Ep 0 |  train loss: 8.883999824523926, train accuracy 87.5 %
Ep 0 |  train loss: 12.965999603271484, train accuracy 65.625 %
Ep 0 |  train loss: 10.491999626159668, train accuracy 75.0 %
Ep 0 |  train loss: 10.777000427246094, train accuracy 68.75 %
Ep 0 |  train loss: 12.60200023651123, train accuracy 68.75 %
Ep 0 |  train loss: 10.972000122070312, train accuracy 65.625 %
Ep 0 |  train loss: 11.074000358581543, train accuracy 68.75 %
Ep 0 |  train loss: 10.425999641418457, train accuracy 81.2

Ep 0 |  train loss: 10.593000411987305, train accuracy 75.0 %
Ep 0 |  train loss: 10.520000457763672, train accuracy 71.875 %
Ep 0 |  train loss: 11.00100040435791, train accuracy 75.0 %
Ep 0 |  train loss: 11.442000389099121, train accuracy 75.0 %
Ep 0 |  train loss: 12.197999954223633, train accuracy 75.0 %
Ep 0 |  train loss: 9.991000175476074, train accuracy 71.875 %
Ep 0 |  train loss: 11.008000373840332, train accuracy 75.0 %
Ep 0 |  train loss: 9.958000183105469, train accuracy 87.5 %
Ep 0 |  train loss: 9.64900016784668, train accuracy 84.375 %
Ep 0 |  train loss: 12.576000213623047, train accuracy 65.625 %
Ep 0 |  train loss: 11.684000015258789, train accuracy 68.75 %
Ep 0 |  train loss: 10.85200023651123, train accuracy 78.125 %
Ep 0 |  train loss: 10.895000457763672, train accuracy 78.125 %
Ep 0 |  train loss: 10.121999740600586, train accuracy 78.125 %
Ep 0 |  train loss: 9.888999938964844, train accuracy 87.5 %
Ep 0 |  train loss: 11.625, train accuracy 78.125 %
Ep 0 |  tr

Ep 0 |  train loss: 10.640000343322754, train accuracy 84.375 %
Ep 0 |  train loss: 10.45300006866455, train accuracy 84.375 %
Ep 0 |  train loss: 14.930000305175781, train accuracy 65.625 %
Ep 0 |  train loss: 12.972999572753906, train accuracy 65.625 %
Ep 0 |  train loss: 10.46399974822998, train accuracy 75.0 %
Ep 0 |  train loss: 9.704000473022461, train accuracy 81.25 %
Ep 0 |  train loss: 9.269000053405762, train accuracy 84.375 %
Ep 0 |  train loss: 11.154000282287598, train accuracy 75.0 %
Ep 0 |  train loss: 9.916000366210938, train accuracy 81.25 %
Ep 0 |  train loss: 9.210000038146973, train accuracy 78.125 %
Ep 0 |  train loss: 13.640000343322754, train accuracy 65.625 %
Ep 0 |  train loss: 12.178999900817871, train accuracy 75.0 %
Ep 0 |  train loss: 12.854000091552734, train accuracy 59.375 %
Ep 0 |  train loss: 13.387999534606934, train accuracy 65.625 %
Ep 0 |  train loss: 9.84000015258789, train accuracy 81.25 %
Ep 0 |  train loss: 9.949000358581543, train accuracy 81.

AttributeError: 'FixedPrecisionTensor' object has no attribute 'fix_prec'

### TODO:

1. **Create a pipe element to perform polarity cutoff:**

```
word_polarity = ... # placeholder: a dict of the form {word: polarity}
polarity_filter = PolarityFilter(data = word_polarity,
                                 max_pol = 1.3, 
                                 min_pol = -1.3, # tokens whose polarity lies in [min_pol, max_pol] 
                                                 # will be tagged 'in_doc_vector' = False
                                 tag = 'in_doc_vector' # the tag in which to store the True or False result
                                )

# create an element that computes a doc vector
# by considering only the tokens tagged as 
# specified by 'tag'
doc_vectorizer = DocVectorizer(tag = 'in_doc_vector')

nlp.add_pipe(polarity_filter)
nlp.add_pipe(doc_vectorizer)
```

2. **create a module in SyferText where the Dataset object for IMDB movie reviews can be imported:**

(Alternatively, DatasetIMDB could just return a list of sets, each of the form [{'review': <string>, 'sentiment': int}, ...],
and we would then create the dataset object out of that list.)
    
```
from syfertext.datasets.imdb import DatasetIMDB
from torch.utils.data import DataLoader

# This loads the IMDB PyTorch Dataset object
# it will download the dataset, load it locally 
# then sends it to the specified workers 
# we can specify the proportion each worker takes
# of each of the classes. for instance
# bob here takes 30% of positive examples from the 
# original dataset and 20% of its negative examples
trainset = DatasetIMDB(bob, alice, 
                       dist = {'bob': {'pos': 30, 'neg': 20},
                               'alice': {'pos': 70, 'neg': 50}},
                       mode = 'train',
                       pipes = [polarity_filter, doc_vectorizer]
                      )
                      
valset = DatasetIMDB(bob, alice, 
                     dist = {'bob': {'pos': 50, 'neg': 50},
                             'alice': {'pos': 50, 'neg': 50}},
                     mode = 'val',
                     pipes = [polarity_filter, doc_vectorizer]
                    )
                            
# Create the torch data loaders
trainloader = DataLoader(trainset, shuffle = True,
                         batch_size = 16, num_workers = 2)
                  
valloader = DataLoader(valset, shuffle = False,
                       batch_size = 16, num_workers = 2)
     
```