Generate sentences to augment the dataset
-------------------------------------------------

In this notebook we will try to create a generative adversarial network (GAN) that generates new sentences in order to augment the size of the corpora. We will use the `pytorch-lightning` module to speed up the training.

- The generative model will have the following characteristics:
    - we will provide the `size of the sequences` to a first model so that it generates an output of the same size as the given sequences
    - the output will be rounded in order to be transmitted to the discriminator
    - it will use `leaky-relu` as the activation function and batch normalization to avoid over-fitting
    - some rules will be applied to the decoded output in order to obtain the textual sentences

- The discriminative model will be used to verify whether the output is close to the true sentences:
    - for that, we will use a pre-trained BERT model to discriminate the output
    - we will tokenize the GAN inputs with a WordPiece tokenizer without a normalizer because we want to generate texts


    

### Steps

The following steps will be required:

- Create a custom dataset to retrieve the sentences
- Create the generator
- Create the discriminator
- Create the GAN
- Train the model and evaluate it

### Create a custom dataset

Let us use the already trained tokenizer to retrieve the encoded sequences. Note that this dataset is different from the one we want to use to train the translation model.

In [66]:
# %%writefile wolof-translate/wolof_translate/data/gan_dataset.py
import torch
import pandas as pd
from torch import nn
from tokenizers import Tokenizer
from torch.utils.data import Dataset

class SentenceDatasetGAN(Dataset):
    """Dataset of sentence pairs encoded as fixed-length id sequences for the GAN.

    Each sample concatenates, without any separator other than the special
    tokens, a sentence of the first corpus and the sentence on the same row of
    the second corpus:
    ``{cls_token}{sentence_1}{sep_token}{sentence_2}{sep_token}``.
    The string is encoded with the tokenizer loaded from ``tokenizer_path`` and
    right-padded with ``0`` up to ``max_len``, the length of the longest encoded
    sample found in the data frame.

    Args:
        file_path: Path of the CSV file, read with ``pandas.read_csv``.
        corpus_1: Name of the column holding the first corpus' sentences.
        corpus_2: Name of the column holding the second corpus' sentences.
        tokenizer_path: Path of the serialized tokenizer (``Tokenizer.from_file``).
        cls_token: Text placed in front of each sample.
        sep_token: Text placed after each of the two sentences.
        sep: Column delimiter forwarded to ``pandas.read_csv``.
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.

    Attributes:
        cls_token: The ``cls_token`` argument.
        sep_token: The ``sep_token`` argument.
        max_len: Length of the longest encoded sample (``0`` for an empty file).
    """

    def __init__(self, file_path: str, corpus_1: str = "french_corpus", corpus_2: str = "wolof_corpus",
                 tokenizer_path: str = "wolof-translate/wolof_translate/tokenizers/adverse_tokenizer.json",
                 cls_token: str = "[CLS]", sep_token: str = "[SEP]", sep: str = ",", **kwargs):

        # load the data frame holding both corpora
        self.__sentences = pd.read_csv(file_path, sep=sep, **kwargs)

        # load the already trained tokenizer
        self.__tokenizer = Tokenizer.from_file(tokenizer_path)

        # keep the sentences of each corpus as plain lists (same row order)
        self.__sentences_1 = self.__sentences[corpus_1].to_list()
        self.__sentences_2 = self.__sentences[corpus_2].to_list()

        # special tokens must be set before any sample is encoded
        self.cls_token = cls_token
        self.sep_token = sep_token

        self.__length = len(self.__sentences_1)

        # every sample is padded to the longest encoded sample
        self.max_len = max(
            (
                len(self.__encode_pair(sentence_1, sentence_2))
                for sentence_1, sentence_2 in zip(self.__sentences_1, self.__sentences_2)
            ),
            default=0,
        )

    def __encode_pair(self, sentence_1, sentence_2):
        """Return the token ids of one sentence pair wrapped in the special tokens."""
        sentence = f"{self.cls_token}{sentence_1}{self.sep_token}{sentence_2}{self.sep_token}"

        return self.__tokenizer.encode(sentence).ids

    def __getitem__(self, index):
        """Return ``(ids, mask)`` for the sample at ``index``.

        ``ids`` is the encoded pair right-padded with ``0`` to ``max_len``;
        ``mask`` is a float tensor equal to ``1.0`` where ``ids > 0`` and
        ``0.0`` elsewhere.
        """
        token_ids = self.__encode_pair(self.__sentences_1[index], self.__sentences_2[index])

        # number of padding positions needed to reach the common length
        padding = self.max_len - len(token_ids)

        ids = torch.tensor(token_ids + [0] * padding)

        return ids, (ids > 0).float()

    def __len__(self):
        """Return the number of sentence pairs."""
        return self.__length

The data loader will generate the padded sequences of ids and the attention masks. Let us test it below.

In [67]:
# build the GAN dataset from the extracted sentences, keeping the default
# corpus columns, tokenizer file and special tokens
dataset = SentenceDatasetGAN("data/extractions/new_data/sent_extraction.csv")

In [68]:
from torch.utils.data import DataLoader

# draw a single shuffled batch of 10 encoded samples with their masks
ids, mask = next(iter(DataLoader(dataset, shuffle=True, batch_size=10)))

# show the padded ids, then the attention masks (label and tensor on separate lines)
print("Ids:", ids, sep="\n")
print("\nMask:", mask, sep="\n")

Ids:
tensor([[   2, 1059,  688,  ...,    0,    0,    0],
        [   2, 1386,  704,  ...,    0,    0,    0],
        [   2,   51,  382,  ...,    0,    0,    0],
        ...,
        [   2,  909, 1606,  ...,    0,    0,    0],
        [   2,   36,    7,  ...,    0,    0,    0],
        [   2, 4825,   64,  ...,    0,    0,    0]])

Mask:
tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])


### Generator

The generator is a Multi-Layer Perceptron with a number of features and layers specified as arguments. It will also require the dimension of the latent input and the size of the output. A dropout rate can be specified to verify if the generator is over-fitting.

In [69]:
# %%writefile wolof-translate/wolof_translate/models/generative_model.py
from torch.nn import functional as F
from typing import *
from torch import nn

class GenerativeSequence(nn.Module):
    """One generator stage: batch norm -> linear -> dropout -> LeakyReLU.

    Args:
        input_dim: Number of input features (also the batch norm width).
        num_features: Number of output features of the linear layer.
        negative_slope: Negative slope of the LeakyReLU activation.
        drop_out: Probability of zeroing each output feature during training.
        eps: ``eps`` of the batch normalization layer.
        momentum: ``momentum`` of the batch normalization layer.
    """

    def __init__(self,
                 input_dim,
                 num_features,
                 negative_slope: float = 0.01,
                 drop_out: float = 0.0,
                 eps: float = 0.00001,
                 momentum: float = 0.1):

        super(GenerativeSequence, self).__init__()

        self.batch_norm = nn.BatchNorm1d(input_dim, eps, momentum)

        self.linear = nn.Linear(input_dim, num_features)

        # Element-wise dropout: `nn.Dropout1d` would read a 2-D
        # (batch, features) tensor as an unbatched (channels, length) input and
        # zero entire rows, i.e. whole samples, instead of single features.
        self.drop_out = nn.Dropout(drop_out)

        self.activation = nn.LeakyReLU(negative_slope)

    def forward(self, input_):
        """Apply the stage to ``input_`` and return the activated features.

        The notebook feeds tensors of shape ``(batch, input_dim)``; the result
        then has shape ``(batch, num_features)``.
        """
        normalized = self.batch_norm(input_)

        projected = self.linear(normalized)

        return self.activation(self.drop_out(projected))

class SentenceGenerativeNet(nn.Module):
    """Multi-layer perceptron generator built from ``GenerativeSequence`` stages.

    The latent input goes through ``num_layers`` stages and a final linear
    layer that projects the last hidden features to ``output_size`` values.

    Args:
        latent_dim: Number of features of the latent input.
        output_size: Number of values produced for each input row.
        num_features: Width of every hidden stage when an ``int``; otherwise a
            sequence giving one width per stage.
        num_layers: Number of ``GenerativeSequence`` stages (at least 1).
        negative_slope: Negative slope of the LeakyReLU of every stage.
        drop_out: Dropout probability of every stage.
        eps: ``eps`` of the batch normalization of every stage.
        momentum: ``momentum`` of the batch normalization of every stage.

    Raises:
        ValueError: If ``num_layers`` is lower than 1, or if ``num_features``
            is a sequence whose length differs from ``num_layers``.
    """

    def __init__(self,
                 latent_dim: int,
                 output_size: int,
                 num_features: Union[int, List] = 300,
                 num_layers: int = 5,
                 negative_slope: float = 0.01,
                 drop_out: float = 0.0,
                 eps: float = 0.00001,
                 momentum: float = 0.1):

        super(SentenceGenerativeNet, self).__init__()

        # explicit checks instead of `assert`, which is removed under `python -O`
        if num_layers < 1:

            raise ValueError(f"num_layers must be at least 1, got {num_layers}")

        # a single int means "same width for every hidden stage"
        if isinstance(num_features, int):

            num_features = [num_features] * num_layers

        if len(num_features) != num_layers:

            raise ValueError(
                f"num_features has {len(num_features)} entries but num_layers is {num_layers}"
            )

        self.latent_dim = latent_dim

        self.num_features = num_features

        self.num_layers = num_layers

        self.output_size = output_size

        # stage l maps widths[l] -> widths[l + 1], starting from the latent size
        widths = [latent_dim, *self.num_features]

        self.sequences = nn.ModuleList([
            GenerativeSequence(in_dim, out_dim, negative_slope, drop_out, eps, momentum)
            for in_dim, out_dim in zip(widths, widths[1:])
        ])

        self.output_layer = nn.Linear(self.num_features[-1], output_size)

    def forward(self, input_):
        """Run ``input_`` through every stage, then through the output layer."""
        out = input_

        for sequence in self.sequences:

            out = sequence(out)

        return self.output_layer(out)
        

Let us test our generative model with a dummy input.

In [71]:
# generator taking 300 latent features and producing one value per position of
# the longest encoded sample of the dataset (other arguments keep their defaults)
generative_model = SentenceGenerativeNet(latent_dim = 300, output_size=dataset.max_len)

In [78]:
# feed 10 random latent vectors of size 300, round the outputs and check the
# resulting size: 10 rows of `dataset.max_len` values
generative_model(torch.randn((10, 300))).round().size()

torch.Size([10, 379])