# Sentiment classifier
Tutorial from [Analytics Vidhya](https://www.analyticsvidhya.com/blog/2019/11/comprehensive-guide-attention-mechanism-deep-learning/) by Prodip Hore and Sayan Chatterjee

## Dataset

UCI Machine Learning Repository: Sentiment Labelled Sentences Data Set
('From Group to Individual Labels using Deep Features', Kotzias et al., KDD 2015)

Sentences: 2000 (We are only using Amazon and Yelp files)

Labels: Positive (1) - Negative (0)


Example:

* "The mic is great." Positive ->  `The mic is great.	1`

* "What a waste of money and time!." Negative -> `What a waste of money and time!.	0`


## Architecture

Input layer -> Embedding layer -> LSTM -> Attention -> Dense (softmax) -> Label

In [None]:
import numpy as np

# Load the raw "sentence<TAB>label" lines from both review files
lines = []
for path in ('data/amazon.txt', 'data/yelp.txt'):
    with open(path, mode='r') as f:
        lines.extend(f.readlines())

# Split every line once: field 0 is the sentence, field 1 is the integer class
fields = [line.split('\t') for line in lines]
sentences = [parts[0] for parts in fields]
labels = np.asarray([int(parts[1]) for parts in fields])
print(len(labels))

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenizer: builds an internal word -> index lexicon from the corpus
# (created here with its default settings).
t = Tokenizer()
t.fit_on_texts(sentences)

# Replace every word by its index in the lexicon (one list of ints per sentence)
text_matrix = t.texts_to_sequences(sentences)

print('sentence: ' + sentences[0])

print('representation: ')
print(text_matrix[0])


# Number of tokens in the longest sentence of the corpus (0 if there are none)
max_length = max((len(seq) for seq in text_matrix), default=0)

print('max length: %d' % max_length)

# One entry per word in the lexicon plus one extra slot: the Keras tokenizer
# numbers words from 1, leaving index 0 free for the zero padding added later.
vocab_size = 1 + len(t.word_index)

print('vocabulary size: %d'%vocab_size)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Size of the vector used to represent each word in the embedding layer
emb_dim = 16

# Sentences shorter than max_length are right-padded ('post') with zeros so
# that every row has the same length
tex_pad = pad_sequences(text_matrix, maxlen=max_length, padding='post')

# Naive, unshuffled split: the first n_train rows train, the remainder tests
n_train = 1600
x_train, x_test = tex_pad[:n_train, :], tex_pad[n_train:, :]
y_train, y_test = labels[:n_train], labels[n_train:]

# Sanity check: sizes of x_train, y_train, x_test, y_test (in that order)
for split in (x_train, y_train, x_test, y_test):
    print(len(split))

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer, GlobalAveragePooling1D, Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# Custom attention layer
class BahdanauAttention(Layer):
    """Attention pooling over the time axis of a sequence of hidden states.

    For an input ``x`` of shape (batch, max_length, lstm_units) the layer
    computes a score ``tanh(x . W + b)`` per time step, normalises the scores
    with a softmax over the time steps and uses them to weight the hidden
    states.

    Returns (from ``call``):
        A pair ``(context, attention)`` where ``context`` is the weighted sum
        of the hidden states, shape (batch, lstm_units), and ``attention``
        holds the weights, shape (batch, max_length, 1).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    # This method declares the weights that the layer will learn. It receives
    # the shape of the layer input and runs once, when the layer is built.
    def build(self, input_shape):
        # W projects every hidden state to a single score: (lstm_units, 1).
        # b holds one bias per time step: (max_length, 1).
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super().build(input_shape)

    # In this method we do all the calculations of the layer and return its outputs
    def call(self, x):
        # x is the input of the layer. In this example, the output of the lstm
        # (hidden_states x lstm_units), with hidden_states = max_length

        # We calculate the score tanh(W.x + b)
        scores = K.tanh(K.dot(x, self.W) + self.b)  # (max_length x 1)
        print('scores shape: ')
        print(scores.shape)

        # This removes the last axis -> a vector of max_length dimension, so
        # that the softmax below normalises across the time steps
        scores = K.squeeze(scores, axis=-1)
        print('scores shape after squeeze: ')
        print(scores.shape)

        # we apply softmax (the last axis is the default axis used for calculation)
        at = K.softmax(scores)
        print('attention weights shape: ')
        print(at.shape)

        # This adds a 1-sized dimension to the last axis -> matrix of
        # (max_length x 1), so the weights broadcast over the lstm_units axis
        at = K.expand_dims(at, axis=-1)
        print('attention weights shape after expand_dims: ')
        print(at.shape)

        # We calculate the weighted values -> \alpha*hidden_states
        # row-wise multiplication (we are weighting the hidden_states, not the lstm_units)
        output = x * at  # (max_length x lstm_units)
        print('weighted values shape: ')
        # Print the shape, as the label says, rather than the tensor itself
        print(output.shape)

        # The outputs of this layer are the weighted values summed over the
        # hidden states, and the attention weights (max_length x 1)
        return K.sum(output, axis=1), at

    # Shapes of the two tensors returned by call(), in the same order:
    # the summed context vector and the attention weights
    def compute_output_shape(self, input_shape):
        return [
            (input_shape[0], input_shape[-1]),
            (input_shape[0], input_shape[1], 1),
        ]

    # Returns the configuration of the layer (it adds no parameters of its
    # own, so the base-class configuration is returned unchanged)
    def get_config(self):
        return super().get_config()


# Architecture: Input -> Embedding -> LSTM -> attention -> Dense (softmax)
lstm_units = 10

inputs = Input(shape=(max_length,))
embedding = Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=max_length, embeddings_regularizer=l2(.001))
embd_out = embedding(inputs)
# return_sequences=True keeps the hidden state of every time step, which is
# what the attention layer weights
lstm = LSTM(lstm_units, dropout=0.3, recurrent_dropout=0.2, return_sequences=True)
lstm_out = lstm(embd_out)

# The attention layer returns the pooled sentence vector and the weights
weigthed_out, weights = BahdanauAttention()(lstm_out)

# Two mutually exclusive classes scored as a probability distribution:
# softmax (not sigmoid) is the activation that matches the
# sparse_categorical_crossentropy loss the model is trained with
prob = Dense(2, activation='softmax')
outputs = prob(weigthed_out)

model = Model(inputs, outputs) # classifier
attention_model = Model(inputs, weights) # attention weights


# summary() prints the table itself and returns None, so it is not wrapped
# in print()
model.summary()

In [None]:
# Integer class labels (0/1) -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

# Training settings; the held-out rows are used as validation data
fit_options = dict(batch_size=100, epochs=10, verbose=1, shuffle=True)
model.fit(x_train, y_train, validation_data=(x_test, y_test), **fit_options)

In [None]:
# Observed results: the model barely learnt, and they change with each execution
#   acc_train: 0.52 (first epoch) -> 0.56 (last epoch)
#   acc_test:  0.41 (first epoch) -> 0.45 (last epoch)

# Spot check: first ten test sentences, their labels and the model predictions
sample_x = x_test[:10]
print(t.sequences_to_texts(sample_x))
print(y_test[:10])

pred = model.predict(sample_x)
print(pred)

## Architecture

Input layer -> Embedding block -> Transformer Block -> GlobalAveragePooling -> Dropout -> Dense (reduce dimensions) -> Dropout -> Dense (softmax) -> Label

### Steps

1. Create the Embedding and Transformer block (copy code from text_classifier_transformer)
1. Create classifier model. This time, to have a model size similar to the LSTM model, use an embedding size of 16 and an FFN dimension of 10
1. Use the same training parameters as before (epochs, batch, etc) and compare behaviours



In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Layer, Embedding, LayerNormalization, Dropout
from tensorflow.keras.models import Sequential

# TokenAndPositionEmbedding class

# MultiHeadSelfAttention class

# TransformerBlock class


In [None]:
from tensorflow.keras.layers import Input, GlobalAveragePooling1D
from tensorflow.keras.models import Model

# define the classifier model

In [None]:
# train model


In [None]:
# test model predictions
# Exercise scaffold: nothing below runs yet. Once `model_transformer` has been
# defined and trained, uncomment the prediction line and remove the
# triple quotes around the prints to inspect one test example.

# pred = model_transformer.predict(x_test)

# Disabled on purpose (wrapped in a string literal) because it needs `pred`
"""
print(x_test[1])
print(t.sequences_to_texts(x_test[1:2]))
print(pred[1])
print(y_test[1])
"""