# Sentiment Analysis with SyferText

## Problem Statement (use it as twitter)

## Demo Plan

1. How to use SyferText pipelines.

In [59]:
# Some imports
import syft as sy
from syft.generic.string import String
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import syfertext
import numpy as np
import tqdm
import csv
from sklearn.model_selection import train_test_split

# Create the PySyft hook for torch; `hook` is used below to obtain the
# local worker and to create the virtual workers
hook = sy.TorchHook(torch)





Let's prepare the work environment:

In [60]:
# Workers taking part in the demo: `me` is the hook's local worker, while
# `bob`, `alice` and `crypto_provider` are virtual workers created on the same hook
me = hook.local_worker
bob, alice, crypto_provider = (
    sy.VirtualWorker(hook, id = worker_id)
    for worker_id in ('bob', 'alice', 'crypto_provider')
)

# TensorBoard writer used further down to log losses and accuracies
writer = SummaryWriter()

# SyferText Language object, owned by the local worker
nlp = syfertext.load('en_core_web_lg', owner = me)

## -3. Download the Dataset

## -2. Load the Dataset

In [3]:
# Set the path to the dataset file
dataset_path = '../../../datasets/imdb/imdb.csv'

# Store the dataset as a list of dictionaries.
# Each dictionary has two keys, 'review' and 'label':
#   - 'review' is a PySyft String built from the 'review' column
#   - 'label' is an integer: 1 when the 'sentiment' column equals
#     'positive', 0 for any other value
# The file is opened with newline='' as the csv module requires, so that
# line breaks embedded in quoted fields are handed to the reader untouched.
with open(dataset_path, 'r', newline = '') as dataset_file:

    # Create a csv reader object that yields one dict per row
    reader = csv.DictReader(dataset_file)

    dataset_local = [
        dict(review = String(row['review']),
             label = 1 if row['sentiment'] == 'positive' else 0
            )
        for row in reader
    ]

In [4]:
# Inspect the first example: the owner of its review String and its integer label
dataset_local[0]['review'].owner, dataset_local[0]['label']

(<VirtualWorker id:me #objects:0>, 1)

## -1. Send the Dataset to Remote Workers

Let's cut the dataset into two equal parts and send each part to a different worker simulating two remote datasets:

In [5]:
# Fixed seed so that the random splits below are the same on every run
SPLIT_SEED = 42

# Create two datasets of equal size, one for Bob and the other for Alice
dataset_bob, dataset_alice = train_test_split(dataset_local,
                                              train_size = 0.5,
                                              random_state = SPLIT_SEED)

# Now split each of them into a training set (70%) and a validation set (30%)
train_bob, val_bob = train_test_split(dataset_bob,
                                      train_size = 0.7,
                                      random_state = SPLIT_SEED)
train_alice, val_alice = train_test_split(dataset_alice,
                                          train_size = 0.7,
                                          random_state = SPLIT_SEED)

In [6]:
# Send the content of each split
def make_remote_dataset(dataset, worker):
    """Send every example of `dataset` to `worker`, modifying the examples in place.

    For each example dict, the 'review' entry is replaced by the result of
    calling its `send(worker)`, and the integer 'label' entry is replaced by
    the result of sending a one-hot vector of length 2 (a 1 at the position
    given by the label) to the same worker.

    Args:
        dataset: iterable of dicts holding a 'review' object with a `send`
            method and an integer 'label' (0 or 1).
        worker: the destination passed to every `send` call.

    Returns:
        None; the dicts in `dataset` are updated in place.
    """
    for entry in dataset:

        # The review text goes first
        entry['review'] = entry['review'].send(worker)

        # Build the one-hot vector for the label, then send it as well
        class_index = torch.Tensor([entry['label']]).long()
        label_vector = torch.zeros(2).scatter(0, class_index, 1)
        entry['label'] = label_vector.send(worker)


Make the datasets remote

In [7]:
# Send each split to the worker that should hold it
# (Bob's splits first, then Alice's)
for split, holder in ((train_bob, bob),
                      (val_bob, bob),
                      (train_alice, alice),
                      (val_alice, alice)):
    make_remote_dataset(split, holder)

In [8]:
# For Bob's first training example, show the type and the location
# of the review and then of the label
first_example = train_bob[0]

for field in ('review', 'label'):
    print(type(first_example[field]))
    print(first_example[field].location)

<class 'syft.generic.pointers.string_pointer.StringPointer'>
<VirtualWorker id:bob #objects:50000>
<class 'torch.Tensor'>
<VirtualWorker id:bob #objects:50000>


-----------

## 0. Create a Dataset class

In [9]:
# This should go into the __getitem__ method of the dataloader

from torch.utils.data import Dataset

class DatasetIMDB(Dataset):
    """Map-style dataset that presents several remote review datasets as one.

    Indexing the dataset returns a pair ``(vector_enc, label_enc)``: the
    encrypted vector obtained from the review through the `nlp` pipeline,
    and the label shared among the same workers.
    """

    def __init__(self, sets, share_workers, crypto_provider, nlp):
        """
        Args:
            sets: iterable of datasets; each one is an iterable of dicts
                with a 'review' and a 'label' entry.
            share_workers: the workers among which vectors and labels are
                shared (passed positionally to the sharing calls).
            crypto_provider: forwarded as `crypto_provider` to the sharing calls.
            nlp: callable applied to an example's 'review'; its result must
                expose `get_encrypted_vector`.
        """

        self.sets = sets
        self.crypto_provider = crypto_provider
        self.workers = share_workers

        # Keep the pipeline on the instance so that __getitem__ does not
        # depend on whatever the global name `nlp` is bound to at call time
        self.nlp = nlp

        # Create a single dataset unifying all datasets
        self._create_dataset()

    def __getitem__(self, index):
        """Return the ``(vector_enc, label_enc)`` pair for the example at `index`."""

        # get the example
        example = self.dataset[index]

        # Tokenize the string and get a doc pointer
        doc_ptr = self.nlp(example['review'])

        # Arguments common to every sharing call below
        share_kwargs = dict(crypto_provider = self.crypto_provider,
                            requires_grad = True)

        # Get the encrypted vector embedding for the document
        vector_enc = doc_ptr.get_encrypted_vector(*self.workers, **share_kwargs)

        try:
            # Encrypt the target label
            label_enc = example['label'].fix_precision().share(*self.workers,
                                                               **share_kwargs).get()
        except AttributeError:
            # Same call chain without fix_precision().
            # NOTE(review): presumably meant for labels that cannot be (or
            # already are) converted to fixed precision - confirm.
            label_enc = example['label'].share(*self.workers,
                                               **share_kwargs).get()

        return vector_enc, label_enc

    def __len__(self):
        """Return the number of examples over all the combined datasets."""

        return len(self.dataset)

    def _create_dataset(self):
        """Create a single dataset unifying examples from all remote datasets
        """

        # Flatten the datasets, keeping their order and the order of their examples
        self.dataset = [example
                        for dataset in self.sets
                        for example in dataset]

    @staticmethod
    def collate_fn(batch):
        """Stack a list of ``(vector, target)`` pairs into two batch tensors.

        Args:
            batch: sequence of ``(vector, target)`` pairs as returned by
                `__getitem__`.

        Returns:
            A pair ``(vectors, targets)``, each built with `torch.stack`.
        """

        # Unzip the batch
        vectors, targets = zip(*batch)

        # Stack the vectors and the labels along a new first dimension
        return torch.stack(vectors), torch.stack(targets)

## 1. Create a DataLoader

In [10]:
# Hyper-parameters used by the data loaders and the optimizer
# (another setting that was tried: learning_rate = 0.01, batch_size = 128)
batch_size = 32
learning_rate = 0.001

Now, I should create the dataset handlers for training and validation as well as data loaders.

This is a key step. You will notice that, although we have two different training sets, I create only one dataset handler for training and only one for validation. 

The training set handler will treat both training sets at bob's and alice's machine as one single big dataset.

When the dataloader creates training batches, these batches will be composed of randomly picked examples from Bob's and Alice's training sets. What makes it possible to mix datasets in this way without violating privacy is the fact that those training examples will be encrypted using SMPC with the help of PySyft.

SyferText makes it possible to process plain text at the remote worker without the need to transfer this data to the model owner's machine (us in this case). It also allows encrypting the embedding vectors using PySyft before they leave their home machine.

In [11]:
from torch.utils.data import DataLoader

# One Dataset object for training: it merges Bob's and Alice's training sets
trainset = DatasetIMDB(sets = [train_bob, train_alice],
                       share_workers = [bob, alice],
                       crypto_provider = crypto_provider,
                       nlp = nlp)

# One Dataset object for validation: it merges Bob's and Alice's validation sets
valset = DatasetIMDB(sets = [val_bob, val_alice],
                     share_workers = [bob, alice],
                     crypto_provider = crypto_provider,
                     nlp = nlp)

# Options common to both data loaders
loader_options = dict(shuffle = True,
                      batch_size = batch_size,
                      num_workers = 0)

# DataLoader for the training set
trainloader = DataLoader(trainset,
                         collate_fn = trainset.collate_fn,
                         **loader_options)

# DataLoader for the validation set
valloader = DataLoader(valset,
                       collate_fn = valset.collate_fn,
                       **loader_options)



## 2. Create an Encrypted Classifier

In [12]:
class Classifier(torch.nn.Module):
    """A single fully-connected layer whose output is passed through a ReLU.

    Args:
        in_features: size of each input vector.
        out_features: number of output units.
    """

    def __init__(self, in_features, out_features):
        super().__init__()

        # The only trainable layer of the model
        self.fc = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return the pair ``(relu(logits), logits)`` for the input `x`.

        The first element is what callers name `probs`; it is the ReLU of
        the linear output, not a normalized probability distribution.
        """

        logits = self.fc(x)

        return F.relu(logits), logits

Initialize and encrypt the classifier:

In [13]:
# Plain-text classifier: 300 input features, 2 output units
classifier = Classifier(in_features = 300, out_features = 2)

# Apply SMPC encryption: convert to fixed precision, then share the model
# between bob and alice with the help of the crypto provider
sharing_options = dict(crypto_provider = crypto_provider,
                       requires_grad = True)
classifier = classifier.fix_precision().share(bob, alice, **sharing_options)

print(classifier)


Classifier(
  (fc): Linear(in_features=300, out_features=2, bias=True)
)


## 3. Start training

In [66]:
# Scratch check of fix_precision(): the result of the call is not
# assigned to anything, and `a` is displayed afterwards
a = torch.Tensor([2., 3.])
a.fix_precision()
a

tensor([2., 3.])

(Wrapper)>[PointerTensor | me:23438792066 -> bob:18181931159]

(Wrapper)>FixedPrecisionTensor>tensor([2000, 3000])

In [14]:
# Create an optimizer
# The torch.optim module is imported under its own alias so that binding the
# optimizer to the name `optim` below does not shadow the module; this cell
# can therefore be executed more than once.
import torch.optim as torch_optim


# SGD over the classifier's parameters, converted to fixed precision
optim = torch_optim.SGD(params = classifier.parameters(),
                        lr = learning_rate).fix_precision()

# Alternative optimizer kept for reference; this bare string is not executed
'''
optim = optim.Adam(params = classifier.parameters(),
                   lr = learning_rate)
'''

'\noptim = optim.Adam(params = classifier.parameters(),\n                   lr = learning_rate)\n'

In [15]:
def cycle(dataloader):
    """Yield ``(vector, target)`` pairs from `dataloader` over and over.

    Once a pass over `dataloader` is finished, a new pass is started, so a
    non-empty dataloader produces an endless stream.

    Args:
        dataloader: re-iterable whose items unpack into exactly two values.

    Yields:
        Tuples ``(vector, target)``.

    Returns:
        The generator ends only when a full pass over `dataloader` yields
        nothing (e.g. it is empty); without this check the loop would spin
        forever without ever producing a value.
    """
    while True:
        produced_any = False

        for vector, target in dataloader:
            produced_any = True
            yield vector, target

        # An empty pass means there is nothing to cycle over
        if not produced_any:
            return

In [None]:
def _loss_and_accuracy(vectors, targets, n_examples):
    """Run the classifier on one encrypted batch.

    Args:
        vectors: batch of encrypted input vectors.
        targets: batch of encrypted one-hot labels.
        n_examples: number of examples in the batch, used as the
            denominator of the accuracy.

    Returns:
        A pair ``(loss, accuracy)``: the summed squared error, still
        encrypted so that it can be back-propagated, and the decrypted
        accuracy in percent.
    """
    probs, logits = classifier(vectors)

    # Summed squared error between the outputs and the one-hot targets
    loss = ((probs - targets)**2).sum()

    # Predicted and true class indices
    preds = probs.argmax(dim=1)
    labels = targets.argmax(dim=1)

    # Number of correct predictions, decrypted for logging
    n_correct = preds.eq(labels).sum().get().float_precision()

    return loss, 100 * (n_correct / n_examples)


# Sizes used to work out how many examples each batch contains. Both loaders
# are built with `batch_size` and keep the last, possibly shorter, batch.
n_train_examples = len(trainloader.dataset)
n_val_examples = len(valloader.dataset)

for epoch in range(10):

    # `step` rather than `iter`, so the builtin iter() is not shadowed
    for step, (vectors, targets) in enumerate(trainloader):

        # Position of this batch on the tensorboard x-axis
        global_step = epoch * len(trainloader) + step

        # The last batch of an epoch may hold fewer than `batch_size` examples
        n_in_batch = min(batch_size, n_train_examples - step * batch_size)

        classifier.train()

        # 1). Zero out previous gradients
        optim.zero_grad()

        # 2). Forward pass, 3). loss and accuracy
        loss, accuracy = _loss_and_accuracy(vectors, targets, n_in_batch)

        # 4). Backpropagate the loss
        loss.backward()

        # 5). Update weights
        optim.step()

        # decrypt the loss for logging
        loss = loss.get().float_precision()

        print(f"Ep {epoch} |  train loss: {loss}, train accuracy {accuracy:2} %")

        # Log to tensorboard
        writer.add_scalar('train/loss', loss, global_step)
        writer.add_scalar('train/acc', accuracy, global_step)

        # Perform validation on exactly one batch
        classifier.eval()

        for val_vectors, val_targets in valloader:

            val_loss, val_accuracy = _loss_and_accuracy(val_vectors,
                                                        val_targets,
                                                        min(batch_size, n_val_examples))

            val_loss = val_loss.get().float_precision()

            # Log to tensorboard
            writer.add_scalar('val/loss', val_loss, global_step)
            writer.add_scalar('val/acc', val_accuracy, global_step)

            break

writer.close()

Ep 0 |  train loss: 29.94300079345703, train accuracy 56.25 %
Ep 0 |  train loss: 26.466999053955078, train accuracy 40.625 %
Ep 0 |  train loss: 21.191999435424805, train accuracy 62.5 %
Ep 0 |  train loss: 22.15399932861328, train accuracy 50.0 %
Ep 0 |  train loss: 17.184999465942383, train accuracy 46.875 %
Ep 0 |  train loss: 16.148000717163086, train accuracy 43.75 %
Ep 0 |  train loss: 16.371000289916992, train accuracy 46.875 %
Ep 0 |  train loss: 16.305999755859375, train accuracy 31.25 %
Ep 0 |  train loss: 18.229999542236328, train accuracy 43.75 %
Ep 0 |  train loss: 15.805999755859375, train accuracy 53.125 %
Ep 0 |  train loss: 15.781000137329102, train accuracy 65.625 %
Ep 0 |  train loss: 18.469999313354492, train accuracy 34.375 %
Ep 0 |  train loss: 16.174999237060547, train accuracy 50.0 %
Ep 0 |  train loss: 15.84000015258789, train accuracy 53.125 %
Ep 0 |  train loss: 16.216999053955078, train accuracy 43.75 %
Ep 0 |  train loss: 16.097999572753906, train accuracy

Ep 0 |  train loss: 14.944000244140625, train accuracy 53.125 %
Ep 0 |  train loss: 14.97599983215332, train accuracy 62.5 %
Ep 0 |  train loss: 14.914999961853027, train accuracy 59.375 %
Ep 0 |  train loss: 15.480999946594238, train accuracy 65.625 %
Ep 0 |  train loss: 15.01200008392334, train accuracy 59.375 %
Ep 0 |  train loss: 17.40399932861328, train accuracy 37.5 %
Ep 0 |  train loss: 15.817000389099121, train accuracy 50.0 %
Ep 0 |  train loss: 14.430000305175781, train accuracy 68.75 %
Ep 0 |  train loss: 14.883999824523926, train accuracy 65.625 %
Ep 0 |  train loss: 14.574000358581543, train accuracy 56.25 %
Ep 0 |  train loss: 14.885000228881836, train accuracy 62.5 %
Ep 0 |  train loss: 14.894000053405762, train accuracy 62.5 %
Ep 0 |  train loss: 16.624000549316406, train accuracy 40.625 %
Ep 0 |  train loss: 13.61400032043457, train accuracy 62.5 %
Ep 0 |  train loss: 13.244999885559082, train accuracy 68.75 %
Ep 0 |  train loss: 16.072999954223633, train accuracy 50.0

Ep 0 |  train loss: 13.295999526977539, train accuracy 81.25 %
Ep 0 |  train loss: 14.036999702453613, train accuracy 71.875 %
Ep 0 |  train loss: 13.168000221252441, train accuracy 68.75 %
Ep 0 |  train loss: 13.329000473022461, train accuracy 68.75 %
Ep 0 |  train loss: 14.333000183105469, train accuracy 56.25 %
Ep 0 |  train loss: 13.812999725341797, train accuracy 71.875 %
Ep 0 |  train loss: 14.605999946594238, train accuracy 59.375 %
Ep 0 |  train loss: 12.965999603271484, train accuracy 84.375 %
Ep 0 |  train loss: 14.081999778747559, train accuracy 71.875 %
Ep 0 |  train loss: 12.467000007629395, train accuracy 87.5 %
Ep 0 |  train loss: 14.34000015258789, train accuracy 65.625 %
Ep 0 |  train loss: 14.072999954223633, train accuracy 53.125 %
Ep 0 |  train loss: 14.036999702453613, train accuracy 78.125 %
Ep 0 |  train loss: 13.47599983215332, train accuracy 75.0 %
Ep 0 |  train loss: 13.14900016784668, train accuracy 75.0 %
Ep 0 |  train loss: 14.170999526977539, train accurac

Ep 0 |  train loss: 12.39900016784668, train accuracy 87.5 %
Ep 0 |  train loss: 12.192999839782715, train accuracy 81.25 %
Ep 0 |  train loss: 12.590999603271484, train accuracy 75.0 %
Ep 0 |  train loss: 15.097999572753906, train accuracy 68.75 %
Ep 0 |  train loss: 11.579999923706055, train accuracy 71.875 %
Ep 0 |  train loss: 15.625, train accuracy 53.125 %
Ep 0 |  train loss: 14.303000450134277, train accuracy 59.375 %
Ep 0 |  train loss: 16.530000686645508, train accuracy 40.625 %
Ep 0 |  train loss: 13.562000274658203, train accuracy 65.625 %
Ep 0 |  train loss: 14.286999702453613, train accuracy 62.5 %
Ep 0 |  train loss: 12.111000061035156, train accuracy 75.0 %
Ep 0 |  train loss: 12.072999954223633, train accuracy 75.0 %
Ep 0 |  train loss: 13.434000015258789, train accuracy 75.0 %
Ep 0 |  train loss: 13.086999893188477, train accuracy 71.875 %
Ep 0 |  train loss: 13.692000389099121, train accuracy 68.75 %
Ep 0 |  train loss: 14.510000228881836, train accuracy 56.25 %
Ep 0 

Ep 0 |  train loss: 10.79699993133545, train accuracy 84.375 %
Ep 0 |  train loss: 14.086999893188477, train accuracy 65.625 %
Ep 0 |  train loss: 12.840999603271484, train accuracy 75.0 %
Ep 0 |  train loss: 12.395000457763672, train accuracy 78.125 %
Ep 0 |  train loss: 11.88700008392334, train accuracy 81.25 %
Ep 0 |  train loss: 10.258999824523926, train accuracy 81.25 %
Ep 0 |  train loss: 15.62399959564209, train accuracy 56.25 %
Ep 0 |  train loss: 14.260000228881836, train accuracy 65.625 %
Ep 0 |  train loss: 13.586999893188477, train accuracy 81.25 %
Ep 0 |  train loss: 13.404999732971191, train accuracy 59.375 %
Ep 0 |  train loss: 11.78499984741211, train accuracy 68.75 %
Ep 0 |  train loss: 14.78600025177002, train accuracy 59.375 %
Ep 0 |  train loss: 11.336000442504883, train accuracy 84.375 %
Ep 0 |  train loss: 12.062999725341797, train accuracy 78.125 %
Ep 0 |  train loss: 16.207000732421875, train accuracy 53.125 %
Ep 0 |  train loss: 13.515000343322754, train accura

Ep 0 |  train loss: 13.475000381469727, train accuracy 68.75 %
Ep 0 |  train loss: 11.463000297546387, train accuracy 81.25 %
Ep 0 |  train loss: 13.319999694824219, train accuracy 62.5 %
Ep 0 |  train loss: 11.02299976348877, train accuracy 75.0 %
Ep 0 |  train loss: 11.369000434875488, train accuracy 90.625 %
Ep 0 |  train loss: 11.991999626159668, train accuracy 78.125 %
Ep 0 |  train loss: 12.175000190734863, train accuracy 78.125 %
Ep 0 |  train loss: 13.151000022888184, train accuracy 75.0 %
Ep 0 |  train loss: 11.196999549865723, train accuracy 87.5 %
Ep 0 |  train loss: 11.704000473022461, train accuracy 75.0 %
Ep 0 |  train loss: 13.795999526977539, train accuracy 65.625 %
Ep 0 |  train loss: 13.63700008392334, train accuracy 68.75 %
Ep 0 |  train loss: 10.58899974822998, train accuracy 78.125 %
Ep 0 |  train loss: 12.687999725341797, train accuracy 75.0 %
Ep 0 |  train loss: 14.354000091552734, train accuracy 65.625 %
Ep 0 |  train loss: 11.032999992370605, train accuracy 87.

Ep 0 |  train loss: 13.776000022888184, train accuracy 62.5 %
Ep 0 |  train loss: 14.286999702453613, train accuracy 59.375 %
Ep 0 |  train loss: 12.454999923706055, train accuracy 65.625 %
Ep 0 |  train loss: 8.79699993133545, train accuracy 90.625 %
Ep 0 |  train loss: 11.770999908447266, train accuracy 81.25 %
Ep 0 |  train loss: 11.234000205993652, train accuracy 84.375 %
Ep 0 |  train loss: 12.206999778747559, train accuracy 75.0 %
Ep 0 |  train loss: 11.562999725341797, train accuracy 75.0 %
Ep 0 |  train loss: 13.293999671936035, train accuracy 59.375 %
Ep 0 |  train loss: 12.461000442504883, train accuracy 75.0 %
Ep 0 |  train loss: 12.618000030517578, train accuracy 75.0 %
Ep 0 |  train loss: 12.031000137329102, train accuracy 78.125 %
Ep 0 |  train loss: 11.347000122070312, train accuracy 71.875 %
Ep 0 |  train loss: 14.82699966430664, train accuracy 56.25 %
Ep 0 |  train loss: 11.836999893188477, train accuracy 78.125 %
Ep 0 |  train loss: 10.994999885559082, train accuracy 

Ep 0 |  train loss: 11.994000434875488, train accuracy 81.25 %
Ep 0 |  train loss: 10.520000457763672, train accuracy 75.0 %
Ep 0 |  train loss: 11.829000473022461, train accuracy 75.0 %
Ep 0 |  train loss: 12.47599983215332, train accuracy 75.0 %
Ep 0 |  train loss: 11.50100040435791, train accuracy 78.125 %
Ep 0 |  train loss: 11.38700008392334, train accuracy 75.0 %
Ep 0 |  train loss: 8.92199993133545, train accuracy 90.625 %
Ep 0 |  train loss: 9.008999824523926, train accuracy 87.5 %
Ep 0 |  train loss: 10.42300033569336, train accuracy 81.25 %
Ep 0 |  train loss: 11.366999626159668, train accuracy 78.125 %
Ep 0 |  train loss: 10.059000015258789, train accuracy 81.25 %
Ep 0 |  train loss: 12.241000175476074, train accuracy 78.125 %
Ep 0 |  train loss: 10.8149995803833, train accuracy 75.0 %
Ep 0 |  train loss: 10.729999542236328, train accuracy 78.125 %
Ep 0 |  train loss: 10.866000175476074, train accuracy 75.0 %
Ep 0 |  train loss: 11.539999961853027, train accuracy 78.125 %
Ep

Ep 0 |  train loss: 10.095999717712402, train accuracy 78.125 %
Ep 0 |  train loss: 11.16100025177002, train accuracy 81.25 %
Ep 0 |  train loss: 12.932999610900879, train accuracy 75.0 %
Ep 0 |  train loss: 9.605999946594238, train accuracy 84.375 %
Ep 0 |  train loss: 11.98900032043457, train accuracy 68.75 %
Ep 0 |  train loss: 10.98900032043457, train accuracy 84.375 %
Ep 0 |  train loss: 11.123000144958496, train accuracy 71.875 %
Ep 0 |  train loss: 9.166000366210938, train accuracy 90.625 %
Ep 0 |  train loss: 9.581000328063965, train accuracy 84.375 %
Ep 0 |  train loss: 13.3100004196167, train accuracy 75.0 %
Ep 0 |  train loss: 13.170000076293945, train accuracy 71.875 %
Ep 0 |  train loss: 12.35200023651123, train accuracy 75.0 %
Ep 0 |  train loss: 12.899999618530273, train accuracy 68.75 %
Ep 0 |  train loss: 10.265000343322754, train accuracy 90.625 %
Ep 0 |  train loss: 10.178999900817871, train accuracy 81.25 %
Ep 0 |  train loss: 11.722999572753906, train accuracy 78.1

Ep 1 |  train loss: 12.772000312805176, train accuracy 75.0 %
Ep 1 |  train loss: 9.880000114440918, train accuracy 84.375 %
Ep 1 |  train loss: 13.498000144958496, train accuracy 68.75 %
Ep 1 |  train loss: 10.402999877929688, train accuracy 81.25 %
Ep 1 |  train loss: 11.53600025177002, train accuracy 75.0 %
Ep 1 |  train loss: 10.131999969482422, train accuracy 71.875 %
Ep 1 |  train loss: 9.694000244140625, train accuracy 71.875 %
Ep 1 |  train loss: 11.800000190734863, train accuracy 78.125 %
Ep 1 |  train loss: 10.947999954223633, train accuracy 78.125 %
Ep 1 |  train loss: 10.642000198364258, train accuracy 81.25 %
Ep 1 |  train loss: 10.472000122070312, train accuracy 87.5 %
Ep 1 |  train loss: 10.414999961853027, train accuracy 81.25 %
Ep 1 |  train loss: 9.090999603271484, train accuracy 78.125 %
Ep 1 |  train loss: 9.536999702453613, train accuracy 81.25 %
Ep 1 |  train loss: 10.048999786376953, train accuracy 81.25 %
Ep 1 |  train loss: 9.548999786376953, train accuracy 87.

Ep 1 |  train loss: 11.07800006866455, train accuracy 75.0 %
Ep 1 |  train loss: 12.579000473022461, train accuracy 68.75 %
Ep 1 |  train loss: 8.883000373840332, train accuracy 87.5 %
Ep 1 |  train loss: 12.017999649047852, train accuracy 81.25 %
Ep 1 |  train loss: 9.618000030517578, train accuracy 71.875 %
Ep 1 |  train loss: 10.675999641418457, train accuracy 78.125 %
Ep 1 |  train loss: 13.402000427246094, train accuracy 62.5 %
Ep 1 |  train loss: 10.680000305175781, train accuracy 78.125 %
Ep 1 |  train loss: 10.430999755859375, train accuracy 78.125 %
Ep 1 |  train loss: 12.54800033569336, train accuracy 68.75 %
Ep 1 |  train loss: 11.234000205993652, train accuracy 71.875 %
Ep 1 |  train loss: 10.729999542236328, train accuracy 71.875 %
Ep 1 |  train loss: 11.36400032043457, train accuracy 75.0 %
Ep 1 |  train loss: 10.366000175476074, train accuracy 81.25 %
Ep 1 |  train loss: 10.831999778747559, train accuracy 71.875 %
Ep 1 |  train loss: 13.003999710083008, train accuracy 65

Ep 1 |  train loss: 10.265999794006348, train accuracy 78.125 %
Ep 1 |  train loss: 8.720000267028809, train accuracy 87.5 %
Ep 1 |  train loss: 10.788999557495117, train accuracy 84.375 %
Ep 1 |  train loss: 11.958000183105469, train accuracy 68.75 %
Ep 1 |  train loss: 11.479999542236328, train accuracy 81.25 %
Ep 1 |  train loss: 11.454000473022461, train accuracy 68.75 %
Ep 1 |  train loss: 10.616999626159668, train accuracy 81.25 %
Ep 1 |  train loss: 9.690999984741211, train accuracy 81.25 %
Ep 1 |  train loss: 9.187000274658203, train accuracy 75.0 %
Ep 1 |  train loss: 9.07800006866455, train accuracy 81.25 %
Ep 1 |  train loss: 12.232999801635742, train accuracy 65.625 %
Ep 1 |  train loss: 7.064000129699707, train accuracy 93.75 %
Ep 1 |  train loss: 12.310999870300293, train accuracy 71.875 %
Ep 1 |  train loss: 8.482999801635742, train accuracy 87.5 %
Ep 1 |  train loss: 9.309000015258789, train accuracy 81.25 %
Ep 1 |  train loss: 12.302000045776367, train accuracy 71.875 

Ep 1 |  train loss: 8.22599983215332, train accuracy 81.25 %
Ep 1 |  train loss: 11.352999687194824, train accuracy 78.125 %
Ep 1 |  train loss: 10.456000328063965, train accuracy 78.125 %
Ep 1 |  train loss: 12.105999946594238, train accuracy 65.625 %
Ep 1 |  train loss: 8.307999610900879, train accuracy 90.625 %
Ep 1 |  train loss: 9.553999900817871, train accuracy 84.375 %
Ep 1 |  train loss: 10.579000473022461, train accuracy 81.25 %
Ep 1 |  train loss: 9.227999687194824, train accuracy 90.625 %
Ep 1 |  train loss: 7.798999786376953, train accuracy 90.625 %
Ep 1 |  train loss: 11.831000328063965, train accuracy 78.125 %
Ep 1 |  train loss: 12.255000114440918, train accuracy 68.75 %
Ep 1 |  train loss: 10.96500015258789, train accuracy 75.0 %
Ep 1 |  train loss: 11.114999771118164, train accuracy 84.375 %
Ep 1 |  train loss: 11.788999557495117, train accuracy 68.75 %
Ep 1 |  train loss: 9.315999984741211, train accuracy 78.125 %
Ep 1 |  train loss: 12.300999641418457, train accuracy

Ep 1 |  train loss: 8.414999961853027, train accuracy 93.75 %
Ep 1 |  train loss: 12.130000114440918, train accuracy 78.125 %
Ep 1 |  train loss: 10.831999778747559, train accuracy 68.75 %
Ep 1 |  train loss: 11.5, train accuracy 84.375 %
Ep 1 |  train loss: 9.395000457763672, train accuracy 84.375 %
Ep 1 |  train loss: 8.418000221252441, train accuracy 81.25 %
Ep 1 |  train loss: 8.131999969482422, train accuracy 87.5 %
Ep 1 |  train loss: 11.006999969482422, train accuracy 65.625 %
Ep 1 |  train loss: 9.729999542236328, train accuracy 81.25 %
Ep 1 |  train loss: 9.722000122070312, train accuracy 78.125 %
Ep 1 |  train loss: 10.762999534606934, train accuracy 75.0 %
Ep 1 |  train loss: 10.265999794006348, train accuracy 81.25 %
Ep 1 |  train loss: 9.244999885559082, train accuracy 81.25 %
Ep 1 |  train loss: 11.98900032043457, train accuracy 68.75 %
Ep 1 |  train loss: 9.583999633789062, train accuracy 81.25 %
Ep 1 |  train loss: 9.85200023651123, train accuracy 81.25 %
Ep 1 |  train 

Ep 1 |  train loss: 8.92300033569336, train accuracy 84.375 %
Ep 1 |  train loss: 10.020000457763672, train accuracy 81.25 %
Ep 1 |  train loss: 11.883999824523926, train accuracy 75.0 %
Ep 1 |  train loss: 11.418000221252441, train accuracy 78.125 %
Ep 1 |  train loss: 8.901000022888184, train accuracy 84.375 %
Ep 1 |  train loss: 11.097000122070312, train accuracy 75.0 %
Ep 1 |  train loss: 10.190999984741211, train accuracy 81.25 %
Ep 1 |  train loss: 8.01099967956543, train accuracy 87.5 %
Ep 1 |  train loss: 9.618000030517578, train accuracy 84.375 %
Ep 1 |  train loss: 12.315999984741211, train accuracy 71.875 %
Ep 1 |  train loss: 9.074999809265137, train accuracy 78.125 %
Ep 1 |  train loss: 11.633000373840332, train accuracy 81.25 %
Ep 1 |  train loss: 8.871000289916992, train accuracy 81.25 %
Ep 1 |  train loss: 10.991999626159668, train accuracy 75.0 %
Ep 1 |  train loss: 11.602999687194824, train accuracy 68.75 %
Ep 1 |  train loss: 9.083999633789062, train accuracy 84.375 

Ep 1 |  train loss: 6.571000099182129, train accuracy 84.375 %
Ep 1 |  train loss: 13.59000015258789, train accuracy 68.75 %
Ep 1 |  train loss: 8.128999710083008, train accuracy 87.5 %
Ep 1 |  train loss: 9.928000450134277, train accuracy 75.0 %
Ep 1 |  train loss: 11.149999618530273, train accuracy 68.75 %
Ep 1 |  train loss: 11.505999565124512, train accuracy 81.25 %
Ep 1 |  train loss: 8.947999954223633, train accuracy 84.375 %
Ep 1 |  train loss: 8.616999626159668, train accuracy 87.5 %
Ep 1 |  train loss: 11.539999961853027, train accuracy 65.625 %
Ep 1 |  train loss: 11.175999641418457, train accuracy 81.25 %
Ep 1 |  train loss: 9.199999809265137, train accuracy 87.5 %
Ep 1 |  train loss: 7.63100004196167, train accuracy 90.625 %
Ep 1 |  train loss: 11.701000213623047, train accuracy 71.875 %
Ep 1 |  train loss: 12.152999877929688, train accuracy 68.75 %
Ep 1 |  train loss: 11.072999954223633, train accuracy 81.25 %
Ep 1 |  train loss: 11.180999755859375, train accuracy 75.0 %
E

Ep 1 |  train loss: 10.847999572753906, train accuracy 75.0 %
Ep 1 |  train loss: 9.11299991607666, train accuracy 84.375 %
Ep 1 |  train loss: 11.817999839782715, train accuracy 68.75 %
Ep 1 |  train loss: 8.449000358581543, train accuracy 90.625 %
Ep 1 |  train loss: 10.019000053405762, train accuracy 78.125 %
Ep 1 |  train loss: 9.11400032043457, train accuracy 78.125 %
Ep 1 |  train loss: 10.717000007629395, train accuracy 78.125 %
Ep 1 |  train loss: 12.800999641418457, train accuracy 62.5 %
Ep 1 |  train loss: 10.791000366210938, train accuracy 71.875 %
Ep 1 |  train loss: 8.256999969482422, train accuracy 90.625 %
Ep 1 |  train loss: 9.265000343322754, train accuracy 90.625 %
Ep 1 |  train loss: 9.109999656677246, train accuracy 78.125 %
Ep 1 |  train loss: 8.614999771118164, train accuracy 84.375 %
Ep 1 |  train loss: 14.439000129699707, train accuracy 62.5 %
Ep 1 |  train loss: 11.60099983215332, train accuracy 71.875 %
Ep 1 |  train loss: 12.687000274658203, train accuracy 62

Ep 2 |  train loss: 10.581000328063965, train accuracy 81.25 %
Ep 2 |  train loss: 8.744000434875488, train accuracy 84.375 %
Ep 2 |  train loss: 11.72700023651123, train accuracy 75.0 %
Ep 2 |  train loss: 8.809000015258789, train accuracy 84.375 %
Ep 2 |  train loss: 8.239999771118164, train accuracy 84.375 %
Ep 2 |  train loss: 12.812000274658203, train accuracy 68.75 %
Ep 2 |  train loss: 9.508999824523926, train accuracy 81.25 %
Ep 2 |  train loss: 9.809000015258789, train accuracy 78.125 %
Ep 2 |  train loss: 11.472999572753906, train accuracy 75.0 %
Ep 2 |  train loss: 10.755000114440918, train accuracy 78.125 %
Ep 2 |  train loss: 8.168000221252441, train accuracy 75.0 %
Ep 2 |  train loss: 10.824000358581543, train accuracy 71.875 %
Ep 2 |  train loss: 10.607000350952148, train accuracy 84.375 %
Ep 2 |  train loss: 11.100000381469727, train accuracy 62.5 %
Ep 2 |  train loss: 8.756999969482422, train accuracy 84.375 %
Ep 2 |  train loss: 8.175000190734863, train accuracy 78.12

Ep 2 |  train loss: 8.970000267028809, train accuracy 84.375 %
Ep 2 |  train loss: 12.305999755859375, train accuracy 68.75 %
Ep 2 |  train loss: 10.657999992370605, train accuracy 75.0 %
Ep 2 |  train loss: 9.758999824523926, train accuracy 84.375 %
Ep 2 |  train loss: 13.704000473022461, train accuracy 65.625 %
Ep 2 |  train loss: 9.038999557495117, train accuracy 84.375 %
Ep 2 |  train loss: 11.593000411987305, train accuracy 78.125 %
Ep 2 |  train loss: 7.8470001220703125, train accuracy 87.5 %
Ep 2 |  train loss: 8.005999565124512, train accuracy 75.0 %
Ep 2 |  train loss: 10.592000007629395, train accuracy 87.5 %
Ep 2 |  train loss: 11.281999588012695, train accuracy 81.25 %
Ep 2 |  train loss: 11.866000175476074, train accuracy 71.875 %
Ep 2 |  train loss: 11.126999855041504, train accuracy 68.75 %
Ep 2 |  train loss: 8.42199993133545, train accuracy 81.25 %
Ep 2 |  train loss: 8.968999862670898, train accuracy 78.125 %
Ep 2 |  train loss: 8.711999893188477, train accuracy 81.25

In [1]:
import spacy


In [3]:
# NOTE: this rebinds the name `nlp`, which the setup cell earlier in the
# notebook bound to the SyferText Language object
nlp = spacy.load('en_core_web_sm')

In [45]:
# Run the pipeline on a short sample sentence
doc = nlp("let's us thing so")

In [56]:
# One line per token: text, lemma, out-of-vocabulary flag,
# punctuation flag and orth string
for tok in doc:
    fields = (tok.text, tok.lemma_, tok.is_oov, tok.is_punct, tok.orth_)
    print(*fields)

let let True False let
's -PRON- True False 's
us -PRON- True False us
thing thing True False thing
so so True False so


In [47]:
# Look up the vocabulary entry for the text of the second token
lexeme = nlp.vocab[doc[1].text]

In [48]:
# Show the entry's `text`, `norm_` and `is_oov` attributes
lexeme.text, lexeme.norm_, lexeme.is_oov

("'s", "'s", True)

In [49]:
# In the recorded run this raises ValueError [E010] (word vectors of length 0)
lexeme.vector

ValueError: [E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:
https://spacy.io/usage/models

### TODO:

1. **Create a pipe element to perform polarity cutoff:**

```
word_polarity = ...  # dict of the form word: polarity
polarity_filter = PolarityFilter(data = word_polarity,
                                 max_pol = 1.3, 
                                 min_pol = -1.3, # tokens whose polarity is between [min_pol, max_pol] 
                                                 # will be tagged 'in_doc_vector' = False
                                 tag = 'in_doc_vector' # the tag in which to store the True or False result
                                )

# create an element that computes a doc vector
# only by considering the elements tagged as 
# specified by 'tag'
doc_vectorizer = DocVectorizer(tag = 'in_doc_vector')

nlp.add_pipe(polarity_filter)
nlp.add_pipe(doc_vectorizer)
```

2. **create a module in SyferText where the Dataset object for IMDB movie reviews can be imported:**

(or DatasetIMDB can just return a list of sets, each is of the form [{'review': <string>, 'sentiment': int}, ...])
and then we create the dataset object out of this
    
```
from syfertext.datasets.imdb import DatasetIMDB
from torch.utils.data import DataLoader

# This loads the IMDB PyTorch Dataset object
# it will download the dataset, load it locally 
# then sends it to the specified workers 
# we can specify the proportion each worker takes
# of each of the classes. for instance
# bob here takes 30% of positive examples from the 
# original dataset and 20% of its negative examples
trainset = DatasetIMDB(bob, alice, 
                       dist = {'bob': {'pos' : 30, 'neg': 20},
                               'alice': {'pos': 70, 'neg': 50}},
                       mode = 'train',
                       pipes = [polarity_filter, doc_vectorizer]
                      )
                      
valset = DatasetIMDB(bob, alice, 
                     dist = {'bob': {'pos' : 50, 'neg': 50},
                             'alice': {'pos': 50, 'neg': 50}},
                     mode = 'val',
                     pipes = [polarity_filter, doc_vectorizer]
                    )
                            
# Create a torch data loaders
trainloader = DataLoader(trainset, shuffle = True,
                         batch_size = 16, num_workers = 2)
                  
valloader = DataLoader(valset, shuffle = False,
                         batch_size = 16, num_workers = 2)
     
```