# Sentiment Analysis with SyferText

## Demo Plan

1. How to use SyferText pipelines.

In [1]:
# Some imports
import syft as sy
from syft.generic.string import String
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import syfertext
import numpy as np
import tqdm
import csv
#from sklearn.model_selection import train_test_split
# Hook PyTorch with PySyft; `hook` is needed below to create the workers
hook = sy.TorchHook(torch)





Let's prepare the work environment:

In [2]:
# The local worker, which owns the Language object created below
me = hook.local_worker

# Virtual workers playing the remote parties: two data holders (bob, alice)
# and the crypto provider handed to the secret-sharing calls later on
bob, alice, crypto_provider = (
    sy.VirtualWorker(hook, id = worker_id)
    for worker_id in ('bob', 'alice', 'crypto_provider')
)

# TensorBoard summary writer used to log training performance
writer = SummaryWriter()

# SyferText Language object, owned by the local worker
nlp = syfertext.load('en_core_web_lg', owner = me)

## -3. Download the Dataset

## -2. Load the Dataset

In [3]:
# Set the path to the dataset file
dataset_path = '../../datasets/imdb/imdb.csv'

# Store the dataset as a list of dictionaries with two keys:
#   'review': the review text wrapped in a PySyft String
#   'label' : 1 when the 'sentiment' column is 'positive', 0 otherwise
#
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed correctly.
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm. Without an
# explicit encoding the platform default is used, which differs per machine.
with open(dataset_path, 'r', newline = '', encoding = 'utf-8') as dataset_file:

    dataset_local = [
        dict(review = String(row['review']),
             label = 1 if row['sentiment'] == 'positive' else 0
            )
        for row in csv.DictReader(dataset_file)
    ]

In [4]:
# Inspect the first local example: the owner of its review String, and its label
dataset_local[0]['review'].owner, dataset_local[0]['label']

(<VirtualWorker id:me #objects:0>, 1)

## -1. Send the Dataset to Remote Workers

Let's cut the dataset into two equal parts and send each part to a different worker simulating two remote datasets:

In [5]:
def _send_split(examples, worker):
    """Send every example of a local split to `worker`.

    Args:
        examples: list of dicts holding a 'review' (PySyft String) and an
            integer 'label' (0 or 1).
        worker: the PySyft worker that receives the data.

    Returns:
        A new list of dicts whose 'review' and 'label' entries are the
        objects returned by `.send(worker)` for the review and for the
        one-hot encoded label.
    """
    remote_examples = []

    for example in examples:

        # One-hot encode the label: 0 -> [1., 0.] and 1 -> [0., 1.]
        one_hot_label = torch.zeros(2).scatter(0, torch.tensor([example['label']]), 1)

        # Build a new dict instead of overwriting the local one: the dicts
        # are shared with `dataset_local`, so mutating them in place would
        # corrupt the local dataset and make this cell fail when re-run.
        remote_examples.append(dict(review = example['review'].send(worker),
                                    label = one_hot_label.send(worker)
                                   ))

    return remote_examples


# Cut the local dataset into two halves, whatever its size
half = len(dataset_local) // 2

# Send each half to a different worker
dataset_bob = _send_split(dataset_local[:half], bob)
dataset_alice = _send_split(dataset_local[half:], alice)

In [6]:
# For both fields of bob's first example, show the type of the local
# handle followed by the worker it points to
for field in ('review', 'label'):
    print(type(dataset_bob[0][field]))
    print(dataset_bob[0][field].location)

    # keep a blank line between the two fields
    if field == 'review':
        print()

<class 'syft.generic.pointers.string_pointer.StringPointer'>
<VirtualWorker id:bob #objects:50000>
<class 'torch.Tensor'>
<VirtualWorker id:bob #objects:50000>


-----------

## 0. Create a Dataset class

In [7]:
from torch.utils.data import Dataset

class DatasetIMDB(Dataset):
    """A single PyTorch Dataset built from several remote review datasets.

    Each item is produced on demand: the remote review is run through the
    SyferText pipeline, and both its document vector and its label are
    returned in encrypted (secret-shared) form.

    Args:
        sets: list of datasets; each dataset is a list of dicts with a
            'review' and a 'label' entry that live on a remote worker.
        workers: the workers among which the encrypted values are shared.
        crypto_provider: the worker passed as `crypto_provider` to the
            sharing calls.
        nlp: the SyferText Language object used to process the reviews.
    """

    def __init__(self, sets, workers, crypto_provider, nlp):

        self.sets = sets
        self.crypto_provider = crypto_provider
        self.workers = workers

        # Keep the pipeline on the instance: __getitem__ must use the objects
        # it was configured with, not whatever globals happen to exist in the
        # notebook.
        self.nlp = nlp

        # Create a single dataset unifying all datasets
        self._create_dataset()

    def __getitem__(self, index):
        """Return the pair (encrypted document vector, encrypted label)."""

        # Get the example
        example = self.dataset[index]

        # Process the review and get a doc pointer
        doc_ptr = self.nlp(example['review'])

        # Get the encrypted vector embedding for the document
        vector_enc = doc_ptr.get_encrypted_vector(*self.workers,
                                                  crypto_provider = self.crypto_provider,
                                                  requires_grad = True
                                                 )

        # Encrypt the target label where it lives, then fetch the result
        label_enc = example['label'].fix_precision().share(*self.workers,
                                                           crypto_provider = self.crypto_provider,
                                                           requires_grad = True
                                                          ).get()

        return vector_enc, label_enc

    def __len__(self):
        """Return the number of examples over all the combined datasets."""

        return len(self.dataset)

    def _create_dataset(self):
        """Create a single dataset unifying examples from all remote datasets
        """

        # Flatten the list of datasets, keeping the original order
        self.dataset = [example for dataset in self.sets for example in dataset]

    @staticmethod
    def collate_fn(batch):
        """Stack a list of (vector, target) pairs into two batch tensors."""

        # Unzip the batch
        vectors, targets = list(zip(*batch))

        # Stack the vectors along a new first (batch) dimension
        vectors = torch.stack(vectors)

        # Stack the labels the same way
        targets = torch.stack(targets)

        return vectors, targets

"""
for example in dataset_local:
    
    # Tokenize
    doc = nlp(example['review'])
    for token in doc:
        
        try:
            print(token.vector)
            
        except KeyError: # Temporary fix while Bachir resolves the issue
            print(np.zeros(300))
    break
    # Get the doc vector
    #vector = doc.vector
"""

## 1. Create a DataLoader

In [8]:
# Training hyper-parameters
# (other setting tried: learning_rate = 0.01, batch_size = 128)
learning_rate, batch_size = 1e-3, 32

In [9]:
from torch.utils.data import DataLoader

# One Dataset object spanning the examples held by bob and by alice
trainset = DatasetIMDB(
    sets = [dataset_bob, dataset_alice],
    workers = [bob, alice],
    crypto_provider = crypto_provider,
    nlp = nlp,
)

# Shuffled mini-batches, loaded in the main process (num_workers = 0)
# and assembled by the dataset's own collate function
trainloader = DataLoader(
    trainset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = 0,
    collate_fn = trainset.collate_fn,
)



## 2. Create an Encrypted Classifier

In [10]:
class Classifier(torch.nn.Module):
    """A single fully connected layer followed by a ReLU.

    Args:
        in_features: size of each input vector.
        out_features: number of output units.
    """

    def __init__(self, in_features, out_features):
        super().__init__()

        # The only trainable layer of the model
        self.fc = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return the pair (ReLU of the linear output, raw linear output).

        The first element is called `probs` by the callers, but note that it
        is a ReLU activation and is not normalized into probabilities.
        """
        raw_scores = self.fc(x)

        return F.relu(raw_scores), raw_scores

Initialize and encrypt the classifier:

In [11]:
# Build the classifier, convert it to fixed precision and secret-share it
# between bob and alice (SMPC encryption).
# NOTE(review): in_features = 300 is assumed to match the size of the
# document vectors produced by the pipeline -- confirm.
classifier = Classifier(
    in_features = 300,
    out_features = 2,
).fix_precision().share(
    bob,
    alice,
    crypto_provider = crypto_provider,
    requires_grad = True,
)

print(classifier)


Classifier(
  (fc): Linear(in_features=300, out_features=2, bias=True)
)


## 3. Start training

In [12]:
# Create an optimizer.
# The optimizer class is imported directly, instead of importing the
# `torch.optim` module under the name `optim`: that way the name `optim`
# only ever refers to the optimizer instance used by the training loop and
# never doubles as a module alias.
from torch.optim import SGD


# Plain SGD over the classifier parameters, converted to fixed precision
optim = SGD(params = classifier.parameters(),
            lr = learning_rate).fix_precision()

# Disabled Adam alternative, kept as a bare string (it is what the cell displays)
'''
optim = optim.Adam(params = classifier.parameters(),
                   lr = learning_rate)
'''

'\noptim = optim.Adam(params = classifier.parameters(),\n                   lr = learning_rate)\n'

In [13]:
classifier.train()

# The try/finally guarantees that the TensorBoard writer is closed even when
# training is interrupted by hand (e.g. with a KeyboardInterrupt).
try:

    for epoch in range(10):

        # `step` is the batch index within the epoch (named so as not to
        # shadow the builtin `iter`)
        for step, (vectors, targets) in enumerate(trainloader):

            # 1). Zero out previous gradients
            optim.zero_grad()

            # 2). Predict sentiments
            probs, logits = classifier(vectors)

            # 3). Compute the loss: squared error summed over the batch
            # NOTE(review): the recorded run printed the same very large
            # negative loss at every step, which looks like a fixed-precision
            # overflow -- to be investigated (averaging over the batch or
            # refreshing the shares are candidate remedies).
            loss = ((probs - targets)**2).sum()

            # Compute the accuracy over the examples actually present in
            # this batch; the last batch of an epoch can be smaller than
            # `batch_size`, so dividing by `batch_size` would under-report it.
            n_examples = probs.shape[0]
            pred_labels = probs.argmax(dim=1)
            target_labels = targets.argmax(dim=1)
            accuracy = pred_labels.eq(target_labels).sum()
            accuracy = accuracy.get().float_precision()
            accuracy = 100 * (accuracy / n_examples)

            # 4). Backpropagate the loss
            loss.backward()

            # 5). Update weights
            optim.step()

            # Decrypt the loss
            loss = loss.get().float_precision()
            print(f"Ep {epoch} |  train loss: {loss}, train accuracy {accuracy:2} %")

            # Log to tensorboard, using the number of batches seen so far
            # as the x-axis
            global_step = epoch * len(trainloader) + step
            writer.add_scalar('train/loss', loss, global_step)
            writer.add_scalar('train/acc', accuracy, global_step)

finally:
    writer.close()

Ep 0 |  train loss: -144115194920960.0, train accuracy 56.25 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 65.625 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 53.125 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 53.125 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 40.625 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 50.0 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 37.5 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 65.625 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 43.75 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 53.125 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 56.25 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 43.75 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 56.25 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 68.75 %
Ep 0 |  train loss: -144115194920960.0, train accuracy 53.125 %
Ep 0 |  train loss: -144115194920960.0, train accu

KeyboardInterrupt: 

### TODO:

1. **Create a pipe element to perform polarity cutoff:**

```
word_polarity = ...  # dict of the form {word: polarity}
polarity_filter = PolarityFilter(data = word_polarity,
                                 max_pol = 1.3, 
                                 min_pol = -1.3, # tokens whose polarity lies in [min_pol, max_pol] 
                                                 # will be tagged 'in_doc_vector' = False
                                 tag = 'in_doc_vector' # the tag in which to store the True or False result
                                )

# create an element that computes a doc vector
# by considering only the elements tagged as 
# specified by 'tag'
doc_vectorizer = DocVectorizer(tag = 'in_doc_vector')

nlp.add_pipe(polarity_filter)
nlp.add_pipe(doc_vectorizer)
```

2. **create a module in SyferText where the Dataset object for IMDB movie reviews can be imported:**

(Alternatively, DatasetIMDB could just return a list of sets, each of the form [{'review': <string>, 'sentiment': int}, ...],
and we would then create the dataset object from these.)
    
```
from syfertext.datasets.imdb import DatasetIMDB
from torch.utils.data import DataLoader

# This loads the IMDB PyTorch Dataset object
# it will download the dataset, load it locally 
# then sends it to the specified workers 
# we can specify the proportion each worker takes
# of each of the classes. for instance
# bob here takes 30% of positive examples from the 
# original dataset and 20% of its negative examples
trainset = DatasetIMDB(bob, alice, 
                       dist = {'bob': {'pos' : 30, 'neg': 20},
                               'alice': {'pos': 70, 'neg': 50}},
                       mode = 'train',
                       pipes = [polarity_filter, doc_vectorizer]
                      )
                      
valset = DatasetIMDB(bob, alice, 
                     dist = {'bob': {'pos' : 50, 'neg': 50},
                             'alice': {'pos': 50, 'neg': 50}},
                     mode = 'val',
                     pipes = [polarity_filter, doc_vectorizer]
                    )
                            
# Create the torch data loaders
trainloader = DataLoader(trainset, shuffle = True,
                         batch_size = 16, num_workers = 2)
                  
valloader = DataLoader(valset, shuffle = False,
                       batch_size = 16, num_workers = 2)
     
```