In [4]:
# Import Libraries
import os
import random
import time

import anvil.server
import spacy
import torch
import torch.nn as nn
import torch.optim as Opt
from torchtext.legacy import data
from torchtext.legacy import datasets

# Connect this notebook to the Anvil app as an uplink.
# The uplink key is a credential, so it is read from the environment instead of being
# stored in the notebook. Any key that was previously committed in this file should be
# treated as leaked and rotated in the Anvil dashboard.
Anvil_Uplink_Key = os.environ.get("ANVIL_UPLINK_KEY")
if not Anvil_Uplink_Key:
    raise RuntimeError(
        "Set the ANVIL_UPLINK_KEY environment variable to the Anvil uplink key "
        "before running this notebook."
    )
anvil.server.connect(Anvil_Uplink_Key)

# ///////////////////////////////////////////////////////////////////////////////////
# Final Sentiment Analyser
# ///////////////////////////////////////////////////////////////////////////////////

# First section

# This sentiment analyser takes online texts and analyses them to estimate
# the emotion or "sentiment" that they contain.

# Text Analyser

# Random seed used for every source of randomness below.
# (The name AccessKey is historical; the value is a seed, not a credential.)
AccessKey = 5678

# Seed PyTorch and ask cuDNN for deterministic kernels so runs are repeatable
torch.manual_seed(AccessKey)
torch.backends.cudnn.deterministic = True

# Begin Tokenisation
# txt defines how the review text is processed: tokenised with spaCy, and because
# include_lengths is True each batch yields a (tokens, lengths) pair.
txt = data.Field(tokenize = "spacy", tokenizer_language = "en_core_web_sm", include_lengths = True)
# lbl defines how the sentiment label is processed (stored as a float tensor)
lbl = data.LabelField(dtype = torch.float)

# Download the IMDB dataset and split it into train and test torchtext datasets
Data_Trained, Data_Test = datasets.IMDB.splits(txt, lbl)

# The lengths show how many examples each split contains
print(f'Amount of examples in training: {len(Data_Trained)}')
print(f'Amount of examples in testing: {len(Data_Test)}')

# Display one example from the IMDb training data
print(vars(Data_Trained.examples[0]))

# Carve a validation set out of the training data (torchtext's default split ratio).
# random.seed() returns None, so it must not be passed as random_state directly:
# seed Python's RNG first, then hand its state to split() so the split is reproducible.
random.seed(AccessKey)
Data_Trained, Data_Valid = Data_Trained.split(random_state = random.getstate())

print(f'Amount of examples in training: {len(Data_Trained)}')
print(f'Amount of examples in validation: {len(Data_Valid)}')
print(f'Amount of examples in testing: {len(Data_Test)}')

# Maximum vocabulary size for the sentiment analyser
Maximum_Size_Of_Vocabulary = 30_000

# Build the vocabularies from the training split only, so nothing leaks from the
# validation/test data. Words are initialised from the pretrained GloVe vectors;
# words without a pretrained vector are initialised from a normal distribution.
txt.build_vocab(Data_Trained, max_size = Maximum_Size_Of_Vocabulary, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
lbl.build_vocab(Data_Trained)

# Display the number of unique tokens in the txt and lbl vocabularies
print(f"Number of tokens that are unique within the txt vocabulary: {len(txt.vocab)}")
print(f"Number of tokens that are unique within the lbl vocabulary: {len(lbl.vocab)}")

# Display the most frequent words in the vocabulary together with their frequencies
print(txt.vocab.freqs.most_common(30))

# Display the first entries of the index-to-string vocabulary table
print(txt.vocab.itos[:20])

# Display the label-to-index mapping so it can be checked which index each sentiment received
print(lbl.vocab.stoi)

# Batch size used by all iterators
Size_Of_Batch = 64

# Use CUDA when it is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the iterators; they place every batch they return on `device`.
# sort_within_batch = True orders each batch by length, which packed sequences require.
Iterator_Trained, Iterator_Valid, Iterator_Test = data.BucketIterator.splits(
    (Data_Trained, Data_Valid, Data_Test),
    batch_size = Size_Of_Batch, sort_within_batch = True,
    device = device)

# For this part, a model is built, which will be trained and evaluated
# The class has been defined as a recurrent neural network as they are often utilised as a means of analysing sequences
class RecurrentNeuralNetwork(nn.Module):
    """LSTM-based sentiment classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        sizeofvocab: Number of rows in the embedding table (vocabulary size).
        Dim_Embedding: Size of each word vector.
        Dim_Hidden: Size of the LSTM hidden state per direction.
        Dim_Output: Number of output units (1 for a single sentiment logit).
        Layers_Num: Number of stacked LSTM layers.
        Bidirectional: Whether the LSTM reads the sequence in both directions.
        Dropout: Dropout probability used on the embeddings, between LSTM layers
            and on the final hidden state.
        Pad_Idx: Vocabulary index of the padding token; its embedding receives
            no gradient updates.
    """

    def __init__(self, sizeofvocab, Dim_Embedding, Dim_Hidden, Dim_Output, Layers_Num,
                 Bidirectional, Dropout, Pad_Idx):
        super().__init__()

        self.embedding = nn.Embedding(sizeofvocab, Dim_Embedding, padding_idx = Pad_Idx)

        # nn.LSTM only applies dropout between layers, so it is meaningless
        # (and triggers a warning) when there is a single layer.
        self.rnn = nn.LSTM(Dim_Embedding,
                           Dim_Hidden,
                           num_layers = Layers_Num,
                           bidirectional = Bidirectional,
                           dropout = Dropout if Layers_Num > 1 else 0.0)

        # A bidirectional LSTM yields one final hidden state per direction
        Directions_Num = 2 if Bidirectional else 1
        self.fc = nn.Linear(Dim_Hidden * Directions_Num, Dim_Output)

        self.dropout = nn.Dropout(Dropout)

    def forward(self, txt, Lengths_Text):
        """Return one raw logit row per sequence in the batch.

        Args:
            txt: Token indices, shaped [sentence length, batch size].
            Lengths_Text: Unpadded length of every sequence in the batch. The batch
                must be sorted by decreasing length (pack_padded_sequence default).

        Returns:
            Tensor shaped [batch size, Dim_Output]; apply a sigmoid to get probabilities.
        """
        # Embedded = [sentence length, batch size, embedding dim]
        Embedded = self.dropout(self.embedding(txt))

        # Pack the batch so the LSTM skips the padding; lengths must live on the CPU
        Embedded_Pack = nn.utils.rnn.pack_padded_sequence(Embedded, Lengths_Text.to('cpu'))
        _, (Hidden, _) = self.rnn(Embedded_Pack)

        # Hidden = [layers num * directions num, batch size, hidden dim]
        if self.rnn.bidirectional:
            # Concatenate the last layer's final forward (Hidden[-2]) and
            # backward (Hidden[-1]) hidden states
            Hidden = torch.cat((Hidden[-2, :, :], Hidden[-1, :, :]), dim = 1)
        else:
            Hidden = Hidden[-1, :, :]

        # Hidden = [batch size, hidden dim * directions num]
        return self.fc(self.dropout(Hidden))

    # Backward-compatible alias for the original (capitalised) method name
    Forward = forward


# The input size is the size of the vocabulary
# The embedding size is the size of the dense word vectors (must match glove.6B.100d)
# The hidden size is the size of the LSTM hidden state
# The output size is the number of output units (one logit for binary sentiment)
DIM_INPUT = len(txt.vocab)
DIM_EMBEDDING = 100
DIM_HIDDEN = 256
DIM_OUTPUT = 1
LAYERS_NUM = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = txt.vocab.stoi[txt.pad_token]

# Build the model from the hyperparameters above
Model = RecurrentNeuralNetwork(DIM_INPUT,
                               DIM_EMBEDDING,
                               DIM_HIDDEN,
                               DIM_OUTPUT,
                               LAYERS_NUM,
                               BIDIRECTIONAL,
                               DROPOUT,
                               PAD_IDX)


def ParameterCount(Model):
    """Return the total number of elements in the model's trainable parameters."""
    return sum(p.numel() for p in Model.parameters() if p.requires_grad)


print(f'The model contains {ParameterCount(Model):,} parameters training')

# Copy the pretrained word embeddings into the embedding layer.
# The shape is printed first so it can be checked against [vocab size, embedding dim].
Embeddings_Pretrained = txt.vocab.vectors
print(Embeddings_Pretrained.shape)

Model.embedding.weight.data.copy_(Embeddings_Pretrained)

# Zero the rows of the unknown and padding tokens so that their initial
# embeddings carry no information
Unk_Idx = txt.vocab.stoi[txt.unk_token]
Model.embedding.weight.data[Unk_Idx] = torch.zeros(DIM_EMBEDDING)
Model.embedding.weight.data[PAD_IDX] = torch.zeros(DIM_EMBEDDING)

print(Model.embedding.weight.data)

# Prepare for training: move the model to the device first, then create the optimizer.
# BCEWithLogitsLoss applies the sigmoid and the binary cross entropy in one step,
# so the model itself outputs raw logits.
Model = Model.to(device)
Optimizer = Opt.Adam(Model.parameters())
Criterion = nn.BCEWithLogitsLoss()
Criterion = Criterion.to(device)


def BinaryAccuracy(Prediction, x):
    """Return the fraction of predictions in a batch that match the labels.

    Args:
        Prediction: Raw logits, one per example.
        x: Target labels (0.0 or 1.0), same shape as Prediction.

    Returns:
        Scalar tensor with the batch accuracy.
    """
    # Squash the logits to probabilities and round to the nearest class (0 or 1)
    Predictions_Rounded = torch.round(torch.sigmoid(Prediction))
    Correct = (Predictions_Rounded == x).float()
    return Correct.sum() / len(Correct)


def Model_Train(Model, Criterion, Iterator, Optimizer):
    """Train the model for one epoch.

    Args:
        Model: The network to train.
        Criterion: Loss function taking (logits, labels).
        Iterator: Batch iterator whose batches expose `.text` (tokens, lengths) and `.label`.
        Optimizer: Optimizer that updates the model's parameters.

    Returns:
        Tuple (mean loss per batch, mean accuracy per batch) over the epoch.
    """
    Loss_Of_Epoch = 0
    Acc_Epoch = 0

    # Training mode enables dropout
    Model.train()

    for batch in Iterator:
        # Gradients accumulate by default, so clear them for every batch
        Optimizer.zero_grad()

        Text_Batch, Lengths_Text = batch.text
        # squeeze(1) drops the size-1 output dimension: [batch, 1] -> [batch]
        Predict = Model(Text_Batch, Lengths_Text).squeeze(1)

        Loss = Criterion(Predict, batch.label)
        Acc = BinaryAccuracy(Predict, batch.label)

        Loss.backward()
        Optimizer.step()

        # .item() extracts the Python scalar from a one-element tensor
        Loss_Of_Epoch += Loss.item()
        Acc_Epoch += Acc.item()

    # len(Iterator) is the number of batches
    return Loss_Of_Epoch / len(Iterator), Acc_Epoch / len(Iterator)


def Eval(Model, Criterion, Iterator):
    """Evaluate the model on every batch of an iterator without updating it.

    Args:
        Model: The network to evaluate.
        Criterion: Loss function taking (logits, labels).
        Iterator: Batch iterator whose batches expose `.text` (tokens, lengths) and `.label`.

    Returns:
        Tuple (mean loss per batch, mean accuracy per batch).
    """
    Loss_Of_Epoch = 0
    Acc_Epoch = 0

    # Evaluation mode disables dropout
    Model.eval()

    # No gradients are needed for evaluation
    with torch.no_grad():
        for batch in Iterator:
            Text_Batch, Lengths_Text = batch.text
            Predict = Model(Text_Batch, Lengths_Text).squeeze(1)

            Loss = Criterion(Predict, batch.label)
            Acc = BinaryAccuracy(Predict, batch.label)

            Loss_Of_Epoch += Loss.item()
            Acc_Epoch += Acc.item()

    return Loss_Of_Epoch / len(Iterator), Acc_Epoch / len(Iterator)


def Time_Epoch(Time_Begins, Time_Ends):
    """Split an elapsed interval into whole minutes and remaining whole seconds.

    Args:
        Time_Begins: Start timestamp in seconds.
        Time_Ends: End timestamp in seconds.

    Returns:
        Tuple (minutes, seconds) as integers.
    """
    Time_Elapsed = Time_Ends - Time_Begins
    Minutes_Elapsed = int(Time_Elapsed / 60)
    Seconds_Elapsed = int(Time_Elapsed - (Minutes_Elapsed * 60))
    return Minutes_Elapsed, Seconds_Elapsed


# Train the model. An epoch is one full pass over the training and validation sets.
Epochs_N = 5

# One path for both saving and loading the best checkpoint
Checkpoint_Path = 'TUT2-model.pt'

# Lowest validation loss seen so far (despite the name, lower is better)
GreatestLoss_Valid = float('inf')

for Epoch in range(Epochs_N):

    Time_Begins = time.time()

    Loss_Train, Acc_Train = Model_Train(Model, Criterion, Iterator_Trained, Optimizer)
    Loss_Valid, Acc_Valid = Eval(Model, Criterion, Iterator_Valid)

    Time_Ends = time.time()
    Minutes_Epochs, Seconds_Epochs = Time_Epoch(Time_Begins, Time_Ends)

    # Keep only the parameters of the epoch with the lowest validation loss
    if Loss_Valid < GreatestLoss_Valid:
        GreatestLoss_Valid = Loss_Valid
        torch.save(Model.state_dict(), Checkpoint_Path)

    # Report every epoch (inside the loop), with a zero-padded epoch number
    print(f'Epoch: {Epoch+1:02} | Time_Epoch: {Minutes_Epochs}m {Seconds_Epochs}s')
    print(f'\tModel_Train Loss: {Loss_Train:.3f} | Acc_Train: {Acc_Train*100:.2f}%')
    print(f'\t Value. Loss: {Loss_Valid:.3f} | Value. Acc: {Acc_Valid*100:.2f}%')

# Reload the best checkpoint and measure it on the held-out test set
Model.load_state_dict(torch.load(Checkpoint_Path, map_location = device))

Loss_Test, Acc_Test = Eval(Model, Criterion, Iterator_Test)

print(f'Loss Test: {Loss_Test:.3f} | Acc_Test: {Acc_Test*100:.2f}%')


nlp = spacy.load('en_core_web_sm')


# NOTE(review): as an Anvil callable this receives its arguments from the remote
# caller, which presumably cannot supply a PyTorch module as `Model` - confirm how
# the Anvil app invokes this function.
@anvil.server.callable
def Sentiment_Predictor(Model, Sentence):
    """Predict the sentiment of a single sentence.

    Args:
        Model: Trained network called as Model(tokens, lengths) and returning one logit.
        Sentence: Raw text to classify.

    Returns:
        Sigmoid of the model's logit as a Python float between 0 and 1.

    Raises:
        ValueError: If the sentence contains no tokens.
    """
    Model.eval()
    Tokenized = [tok.text for tok in nlp.tokenizer(Sentence)]
    if not Tokenized:
        # A zero-length sequence cannot be packed for the LSTM
        raise ValueError("Sentence must contain at least one token")
    Indexed = [txt.vocab.stoi[T] for T in Tokenized]
    Length = [len(Indexed)]
    # Shape [sentence length, 1]: a batch holding this single sentence
    Tensor = torch.LongTensor(Indexed).to(device).unsqueeze(1)
    TensorLength = torch.LongTensor(Length)
    with torch.no_grad():
        Predict = torch.sigmoid(Model(Tensor, TensorLength))
    return Predict.item()


# Quick sanity checks on an obviously negative and an obviously positive sentence
print(Sentiment_Predictor(Model, "This film is terrible"))

print(Sentiment_Predictor(Model, "This film is good"))

# Keep the uplink alive so the Anvil app can call Sentiment_Predictor (blocks until interrupted)
anvil.server.wait_forever()





Amount of examples in training: 25000
Amount of examples in testing: 25000
{'text': ['Peaches', 'is', 'truly', 'a', 'marvelous', 'film', '.', 'I', 'write', 'this', 'to', 'refute', 'a', 'review', 'from', 'someone', 'called', "'", 'Auscrit', "'", ',', 'that', 'has', 'appeared', 'on', 'this', 'site', '.', 'First', 'of', 'all', 'the', 'idea', 'that', 'either', 'Monahans', 'first', 'film', "'", 'The', 'Interview', "'", 'is', 'somehow', 'TV', 'is', 'an', 'extraordinary', 'statement', '.', 'Here', 'is', 'a', 'film', 'that', 'has', 'been', 'significantly', 'praised', 'around', 'the', 'world', 'as', 'is', 'simply', 'one', 'of', 'the', 'best', 'Australian', 'Films', 'ever', 'made', '.', 'It', 'fully', 'deserved', 'to', 'win', 'best', 'picture', '.', 'Peaches', 'is', 'a', 'brave', ',', 'bold', 'and', 'courageous', 'departure', '.', 'For', 'me', 'it', 'works', 'on', 'every', 'level', 'and', 'I', 'have', 'now', 'seen', 'it', 'twice', '.', 'Monahan', 'is', 'a', 'filmmaker', 'who', 'is', 'demonstrati

In [None]:
import spacy
# Load spaCy's small English pipeline; Sentiment_Predictor below reads this global `nlp`
nlp = spacy.load('en_core_web_sm')

# @anvil.server.callable
def Sentiment_Predictor(Model, Sentence):
    """Predict the sentiment of a single sentence.

    Relies on the notebook globals `nlp` (spaCy pipeline), `txt` (text field with a
    built vocabulary) and `device`.

    NOTE(review): a function with this name is also defined in the previous cell;
    consider keeping a single definition.

    Args:
        Model: Trained network called as Model(tokens, lengths) and returning one logit.
        Sentence: Raw text to classify.

    Returns:
        Sigmoid of the model's logit as a Python float between 0 and 1.

    Raises:
        ValueError: If the sentence contains no tokens.
    """
    Model.eval()
    # spaCy exposes the tokenizer as `nlp.tokenizer` and the token string as `.text`
    Tokenized = [tok.text for tok in nlp.tokenizer(Sentence)]
    if not Tokenized:
        raise ValueError("Sentence must contain at least one token")
    Indexed = [txt.vocab.stoi[T] for T in Tokenized]
    Length = [len(Indexed)]
    # Shape [sentence length, 1]: a batch holding this single sentence
    Tensor = torch.LongTensor(Indexed).to(device).unsqueeze(1)
    TensorLength = torch.LongTensor(Length)
    # Inference only, so no gradients need to be tracked
    with torch.no_grad():
        Predict = torch.sigmoid(Model(Tensor, TensorLength))
    return Predict.item()