In [None]:
! pip install -q gluonnlp mxnet

[K     |████████████████████████████████| 348kB 2.6MB/s 
[K     |████████████████████████████████| 55.0MB 74kB/s 
[?25h  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone


In [None]:
import re
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, autograd
from mxnet.gluon import nn, rnn, Trainer
from mxnet.gluon.loss import SigmoidBinaryCrossEntropyLoss 
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import files
# Ask Colab for a file upload; the result is indexed by filename in a later cell
# to obtain the uploaded file's raw bytes.
uploaded = files.upload()

Saving AMAZON-REVIEW-DATA-CLASSIFICATION.csv to AMAZON-REVIEW-DATA-CLASSIFICATION.csv


In [None]:
import pandas as pd
import io

# Wrap the uploaded bytes in an in-memory buffer so pandas can parse them like a CSV file
csv_bytes = uploaded['AMAZON-REVIEW-DATA-CLASSIFICATION.csv']
df = pd.read_csv(io.BytesIO(csv_bytes))


In [None]:
# Number of reviews per label value (class balance of the target column)
df["isPositive"].value_counts()

1.0    43692
0.0    26308
Name: isPositive, dtype: int64

In [None]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   reviewText  69989 non-null  object 
 1   summary     69986 non-null  object 
 2   verified    70000 non-null  bool   
 3   time        70000 non-null  int64  
 4   log_votes   70000 non-null  float64
 5   isPositive  70000 non-null  float64
dtypes: bool(1), float64(2), int64(1), object(2)
memory usage: 2.7+ MB


In [None]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,time,log_votes,isPositive
count,70000.0,70000.0,70000.0
mean,1370112000.0,0.535257,0.624171
std,114998600.0,0.962677,0.48434
min,942192000.0,0.0,0.0
25%,1322870000.0,0.0,0.0
50%,1406160000.0,0.0,1.0
75%,1448669000.0,1.098612,1.0
max,1538438000.0,7.110696,1.0


In [None]:
# Number of missing values in each column
df.isna().sum()

reviewText    11
summary       14
verified       0
time           0
log_votes      0
isPositive     0
dtype: int64

In [None]:
# Hold out 10% of the reviews for validation; the fixed seed keeps the split repeatable
review_texts = df["reviewText"].tolist()
review_labels = df["isPositive"].tolist()

train_text, val_text, train_label, val_label = train_test_split(
    review_texts,
    review_labels,
    test_size=0.10,
    shuffle=True,
    random_state=360,
)

<br/>
Text cleaning: We apply simple text cleaning operations. We won't do stemming or lemmatization because our word vectors already cover different forms of words. In this example we use GloVe word embeddings that were pre-trained on a corpus of 6 billion tokens (words, phrases and punctuation).
<br/>
Tokenization: We tokenize all sentences.
<br/>
Creating vocabulary: We create a vocabulary of the tokens. In this vocabulary, each token maps to a unique id, such as "car"->32, "house"->651, etc.
<br/>
Transforming text: Tokenized sentences are mapped to sequences of ids. For example: ["this", "is", "sentence"] -> [13, 54, 412].
<br/>

In [None]:
import nltk, gluonnlp
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer data used by word_tokenize below
nltk.download('punkt')

def cleanStr(text):
    """Normalize a raw review string before tokenization.

    Processing order:
      1. Anything that is not a ``str`` (e.g. the NaN pandas uses for a missing
         review) is treated as an empty review.
      2. The text is lower-cased.
      3. HTML tags/markup are replaced by a single space, so the words on either
         side of a tag are not glued together.
      4. Runs of whitespace (spaces, tabs, newlines) are collapsed to one space.
      5. Leading/trailing whitespace is removed.

    Tags are removed *before* whitespace is collapsed so that the gap left by a
    tag cannot survive as a double space in the result.

    Args:
        text: The raw review text, or a non-string placeholder for a missing value.

    Returns:
        The cleaned, lower-cased string ("" for missing values).
    """
    # Missing value: nothing to clean
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # DOTALL lets a tag that spans a line break still be matched
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    # Collapse every run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def tokenize(text):
    """Clean `text` with cleanStr and return its word tokens as a list.

    The cleaned string is split with NLTK's word_tokenize; the tokens are
    returned in their original order.
    """
    cleaned = cleanStr(text)
    return list(word_tokenize(cleaned))

def createVocabulary(text_list, min_freq):
    """Build a gluonnlp vocabulary from a collection of raw texts.

    Every text is tokenized with `tokenize`, the tokens of all texts are counted
    together, and the counts are handed to `gluonnlp.Vocab` with `min_freq` as the
    frequency threshold. The vocabulary gets an '<unk>' unknown token and no
    padding, beginning-of-sequence or end-of-sequence tokens.

    Args:
        text_list: Iterable of raw texts (missing values are handled by `tokenize`).
        min_freq: Frequency threshold forwarded to `gluonnlp.Vocab`.

    Returns:
        The resulting `gluonnlp.Vocab`.
    """
    # Flatten the tokens of every text into one list
    corpus_tokens = [token
                     for sentence in text_list
                     for token in tokenize(sentence)]

    # Token -> frequency
    token_counts = gluonnlp.data.count_tokens(corpus_tokens)

    return gluonnlp.Vocab(token_counts,
                          min_freq=min_freq,
                          unknown_token='<unk>',
                          padding_token=None,
                          bos_token=None,
                          eos_token=None)

def transformText(text, vocab, max_length):
    """Encode a text as a fixed-length array of vocabulary indices.

    The text is tokenized, truncated to at most `max_length` tokens, and each
    token is replaced by its index in `vocab.token_to_idx`. Tokens that are not
    in the vocabulary are encoded as 0, and positions past the end of the text
    stay 0 as well, so 0 serves as both the "unknown" and the padding id.
    (NOTE: presumably 0 is also the index of the vocabulary's '<unk>' token;
    confirm against the vocabulary construction.)

    Args:
        text: Raw text (missing values are handled by `tokenize`).
        vocab: Object exposing a `token_to_idx` mapping from token to integer id.
        max_length: Length of the returned array.

    Returns:
        A 1-D numpy array of length `max_length` (float dtype, as produced by
        `np.zeros`) holding the token ids.
    """
    token_arr = np.zeros((max_length,))
    tokens = tokenize(text)[:max_length]
    for idx, token in enumerate(tokens):
        # Explicit default instead of a bare `except:` so that only a missing token
        # maps to 0; unrelated errors are no longer silently swallowed.
        token_arr[idx] = vocab.token_to_idx.get(token, 0)
    return token_arr

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Frequency threshold passed to the vocabulary, and the fixed length of every encoded review
min_freq = 5
max_length = 250

# The vocabulary is built from the training texts only
print("Creating the vocabulary")
vocab = createVocabulary(train_text, min_freq)

# Encode every review as a row of `max_length` token ids
print("Transforming training texts")
train_text_transformed = nd.array(
    [transformText(review, vocab, max_length) for review in train_text]
)

print("Transforming validation texts")
val_text_transformed = nd.array(
    [transformText(review, vocab, max_length) for review in val_text]
)

Creating the vocabulary
Transforming training texts
Transforming validation texts


In [None]:
# Spot-check a few tokens to see which ids the vocabulary assigned to them
for word in ("computer", "beautiful", "code"):
    print("Vocabulary index for %s:" % word, vocab[word])

Vocabulary index for computer: 67
Vocabulary index for beautiful: 1976
Vocabulary index for code: 402


In [None]:
from mxnet.contrib import text
# Load the pre-trained GloVe vectors from glove.6B.50d.txt
glove = text.embedding.create('glove',
                              pretrained_file_name = 'glove.6B.50d.txt')
# Look up one vector per vocabulary token, passing the tokens in vocabulary-index order;
# the result is later copied into the embedding layer's weight.
embedding_matrix = glove.get_vecs_by_tokens(vocab.idx_to_token)

Downloading /root/.mxnet/embeddings/glove/glove.6B.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.6B.zip...


In [None]:
# Dimensionality of the recurrent layer's hidden state
hidden_size = 12

# Optimisation settings
learning_rate = 0.01
epochs = 15
batch_size = 32

# Width of one embedding vector (glove.6B.50d.txt) and number of vocabulary entries
num_embed = 50
vocab_size = len(vocab.token_to_idx)

In [None]:
from mxnet.gluon.data import ArrayDataset, DataLoader

# Convert the label lists to NDArrays so they can be batched and fed to the loss
train_label = nd.array(train_label)
val_label = nd.array(val_label)

# Pair every encoded review with its label
train_dataset = ArrayDataset(train_text_transformed, train_label)
# shuffle=True draws the samples in a new random order every epoch; without it SGD
# would see exactly the same batches in exactly the same order in every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
context = mx.cpu() # use mx.gpu() if you are using GPU

# Each input batch holds one row of token ids per review, i.e. (batch, time).
# The embedding layer appends the embedding axis, giving (batch, time, channels).
# Gluon's RNN layer defaults to layout='TNC' (time first), which would make it treat
# the batch axis as the time axis and run the recurrence across different reviews.
# layout='NTC' makes the recurrence run over the tokens of each review instead.
model = nn.Sequential()
model.add(nn.Embedding(vocab_size, num_embed),               # Embedding layer
          rnn.RNN(hidden_size, num_layers=1, layout='NTC'),  # Recurrent layer
          nn.Dense(1, activation='sigmoid'))                 # Output layer

In [None]:
# Xavier-initialize every parameter of the network on the chosen device
model.initialize(mx.init.Xavier(), ctx=context)

# The first layer is the embedding layer: overwrite its weights with the GloVe
# vectors and exclude it from gradient computation so it stays fixed during training
embedding_layer = model[0]
embedding_layer.weight.set_data(embedding_matrix.as_in_context(context))
embedding_layer.collect_params().setattr('grad_req', 'null')

In [None]:
# Plain SGD over the model's parameters
sgd_settings = {'learning_rate': learning_rate}
trainer = Trainer(model.collect_params(), 'sgd', sgd_settings)

# Binary cross-entropy; from_sigmoid=True because the output layer already applies a sigmoid
cross_ent_loss = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)

In [None]:
import time

# Move the validation tensors to the training device once, up front. The labels must
# live on the same device as the predictions, otherwise the loss fails when
# `context` is a GPU.
val_data = val_text_transformed.as_in_context(context)
val_target = val_label.as_in_context(context)

for epoch in range(epochs):
    start = time.time()
    training_loss = 0

    # Training loop: one optimisation step per mini-batch
    for data, target in train_loader:
        data = data.as_in_context(context)
        target = target.as_in_context(context)

        # Only the forward pass and the loss need to be recorded for autograd
        with autograd.record():
            output = model(data)
            batch_loss = cross_ent_loss(output, target)
        batch_loss.backward()
        # Passing the batch size normalises the gradient by the number of samples
        trainer.step(data.shape[0])

        # Accumulate the summed per-sample losses for the epoch average
        training_loss += nd.sum(batch_loss).asscalar()

    # Validation loss, computed outside autograd.record() in one forward pass
    val_predictions = model(val_data)
    val_loss = nd.sum(cross_ent_loss(val_predictions, val_target)).asscalar()

    # Average the summed losses over the number of samples
    training_loss = training_loss / len(train_label)
    val_loss = val_loss / len(val_label)

    end = time.time()
    print("Epoch %s. Train_loss %f Validation_loss %f Seconds %f" % \
          (epoch, training_loss, val_loss, end-start))

Epoch 0. Train_loss 0.599052 Validation_loss 0.558403 Seconds 16.874201
Epoch 1. Train_loss 0.529220 Validation_loss 0.525489 Seconds 16.610989
Epoch 2. Train_loss 0.499743 Validation_loss 0.507534 Seconds 16.351708
Epoch 3. Train_loss 0.480511 Validation_loss 0.496742 Seconds 16.496183
Epoch 4. Train_loss 0.465536 Validation_loss 0.486157 Seconds 16.189243
Epoch 5. Train_loss 0.453515 Validation_loss 0.477877 Seconds 16.542446
Epoch 6. Train_loss 0.443975 Validation_loss 0.472121 Seconds 16.392223
Epoch 7. Train_loss 0.436183 Validation_loss 0.466666 Seconds 16.401354
Epoch 8. Train_loss 0.429550 Validation_loss 0.462500 Seconds 16.372777
Epoch 9. Train_loss 0.423831 Validation_loss 0.458332 Seconds 16.542792
Epoch 10. Train_loss 0.418815 Validation_loss 0.454241 Seconds 19.967718
Epoch 11. Train_loss 0.414449 Validation_loss 0.452654 Seconds 16.646130
Epoch 12. Train_loss 0.410677 Validation_loss 0.449388 Seconds 16.677658
Epoch 13. Train_loss 0.407193 Validation_loss 0.447839 Second

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Forward pass over the whole validation set
val_predictions = model(val_text_transformed.as_in_context(context))

val_label = nd.array(val_label)

# Turn the sigmoid outputs into hard labels: 1 if pred>0.5, 0 otherwise
val_predictions = np.round(val_predictions.asnumpy())
val_truth = val_label.asnumpy()

print("Classification Report")
print(classification_report(val_truth, val_predictions))
print("Accuracy")
print(accuracy_score(val_truth, val_predictions))

Classification Report
              precision    recall  f1-score   support

         0.0       0.72      0.78      0.75      2627
         1.0       0.86      0.82      0.84      4373

    accuracy                           0.80      7000
   macro avg       0.79      0.80      0.79      7000
weighted avg       0.81      0.80      0.80      7000

Accuracy
0.803
