# Connecting to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
os.chdir('/content/drive/MyDrive/INM706 Project')

# Data Preprocessing
The first thing is to clean and prepare the data that we want to work on. Removing punctuations, creating train and test split, and make Pytorch dataloaders.

In [None]:
import numpy as np
# read data from text files
with open('reviews.csv', 'r') as f:
     reviews_lines = f.readlines()

In [None]:
# we can see headers for out dataset
reviews_lines[0]

'reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId\n'

In [None]:
# we would just use the content of each review and the score of it
# we should note that each element of 'reviews' need to be splited
# by commas.

review_1 = reviews_lines[1].split(',')[3]
score_1  = reviews_lines[1].split(',')[4]

print('review:',review_1,'\nscore:',score_1)

review: I cannot open the app anymore 
score: 1


In [None]:
# we can load our dataset with pandas to avoid missing any type of data.
# with this function we would extract just reviews and scores

import numpy as np
import pandas as pd

def extract_data(dataset):
  """input: google play dataset
  output: reviews and scores"""
  data = pd.read_csv('reviews.csv')
  reviews = data['content']
  scores  = data['score']
  return reviews,scores

reviews_array, scores_array = extract_data(reviews_lines)
reviews_lines[430].split(',')



['We apologize for the inconvenience and promise to bring those back should Google allow it in the future :)"',
 '2020-09-27 12:18:35',
 'newest',
 'com.anydo\n']

In [None]:
# with this function we would remove all emojies in a text

import re

def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [None]:
# we would now build the dataset inference based on pytorch 'dataset' class
# we would also remove stopwords, punctuation and emojies while loading the
# dataset. meanwhile we would build the vocabulary when we load the dataset

import nltk
from torch.utils.data import Dataset
import torch
from string import punctuation

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

class Google_play_Dataset(Dataset):
  "Google play revies dataset"
  def __init__(self, csv_file):
      """
      Args:
          csv_file (string): Path to the csv file.
          transform (callable, optional): Optional transform to be applied
              on a sample.
      """
      self.dataset = pd.read_csv(csv_file)
      self.vocab = list()

  def __len__(self):
      return len(self.dataset)

  def __getitem__(self, idx):
      if torch.is_tensor(idx):
          idx = idx.tolist()

      text = "".join([ch for ch in self.dataset['content'][idx] 
                      if ch not in punctuation]).lower()
      review = remove_emoji(text)
      score  = self.dataset['score'][idx]

      edited_review = ''
      for word in review.split():   # at this part we wouldn't save stop words
        if word not in stop_words:  # in our vocabulary
          self.vocab.append(word)
          edited_review = edited_review + ' ' + word

      sample = {'review': edited_review, 'score': score}

      return sample

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
dataset_loader = Google_play_Dataset('reviews.csv')

In [None]:

max_size = 0

for i in range(len(dataset_loader)):
    sample = dataset_loader[i]
    if i < 5:
        print(i, sample['review'], sample['score'])


0  cannot open app anymore 1
1  begging refund app month nobody replying 1
2  costly premium version approx indian rupees 910 per year better download premium version app apkmos website use microsoft list app far better 1
3  used keep organized 2020 updates made mess things cudnt u leave well enuf alone guess ur techies feel need keep making changes justify continuing collect salary 1
4  dan birthday oct 28 1


In [None]:
# sorting words in the datset based on number of occurance in the dataset

from collections import Counter 

count_words = Counter(dataset_loader.vocab)
total_words=len(dataset_loader.vocab)

sorted_words=count_words.most_common(total_words)

print("Top ten occuring words : ", sorted_words[:10])

Top ten occuring words :  [('app', 15144), ('good', 3448), ('like', 3444), ('use', 3208), ('time', 3050), ('tasks', 2978), ('great', 2970), ('would', 2656), ('really', 2416), ('calendar', 2374)]


In [None]:
# change words in vocabulary to integers base on their
# position in sorted words

integer_based_vocab={w:i+1 for i,(w,c) in enumerate(sorted_words)}
print(integer_based_vocab)



In [None]:
def encode_reviews(dataset,vocab_int):
  all_scores =list()
  all_reviews=list()
  for index in range(len(dataset)):
    review = dataset[index]['review']
    encoded = list()
    for word in review.split():
      if word not in vocab_int.keys():
        encoded.append(0)
      else:
        encoded.append(int(vocab_int[word]))

    all_reviews.append(encoded)
    all_scores.append(dataset[index]['score'])

  return all_reviews, all_scores

  
encoded_data, all_scores = encode_reviews(dataset_loader, integer_based_vocab)

counter = 0
for tokenized in encoded_data:
  print(tokenized)
  sample = dataset_loader[counter]
  print(sample['review'])
  counter += 1
  if counter == 5:
    break


[170, 112, 1, 283]
 cannot open app anymore
[2475, 645, 1, 195, 1871, 2476]
 begging refund app month nobody replying
[1981, 24, 15, 4302, 4303, 3081, 3556, 523, 206, 54, 354, 24, 15, 1, 5938, 646, 4, 263, 20, 1, 69, 54]
 costly premium version approx indian rupees 910 per year better download premium version app apkmos website use microsoft list app far better
[46, 56, 268, 818, 326, 158, 1220, 62, 5939, 296, 688, 59, 5940, 1178, 590, 940, 5941, 253, 26, 56, 215, 332, 1520, 2477, 1872, 5942]
 used keep organized 2020 updates made mess things cudnt u leave well enuf alone guess ur techies feel need keep making changes justify continuing collect salary
[3082, 1982, 2271, 2121]
 dan birthday oct 28


In [None]:
# in this step we will cut long reviews with a maximum threshold
# and fill short reviews with <UNK> token

from torch.nn.functional import one_hot

max_length = 50
encoded_array=np.zeros((len(encoded_data), max_length), dtype='int')



for i, review in enumerate(encoded_data):
  review_len=len(review)
  if review_len <= max_length:
    zeros=list(np.zeros(max_length-review_len))
    new = review + zeros
  else:
    new=review[:max_length]
  encoded_array[i,:]=np.array(new)

print(dataset_loader[2857]['review'])

print(encoded_array.shape)
print(encoded_array[2857])


scores = np.array(all_scores)
# scores = np.eye(5)[scores]
# print(scores.shape)
# print(scores[2857])
np.array([scores]).reshape(-1)

oneHhot_scores = np.zeros()

np.eye(5)[scores.reshape(-1)]

 theres button method saving changes deleting tasks meaning cant use app
(12495, 50)
[ 178  264 1104  889  332  503    6 1763   17    4    1    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


IndexError: ignored

In [None]:
train_x=encoded_array[:10050]
train_y=scores[:10050]
valid_x=encoded_array[10050:11250]
valid_y=scores[10050:11250]
test_x=encoded_array[11250:]
test_y=scores[11250:]
print(len(train_y), len(valid_y), len(test_y))

10050 1200 1245


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

#create Tensor Dataset
train_data=TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data=TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data=TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

#dataloader
batch_size=15
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
import torch.nn as nn
 
class SentimentalLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):    
        """
        Initialize the model by setting up the layers
        """
        super().__init__()
        self.output_size=output_size
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim
        
        #Embedding and LSTM layers
        self.embedding=nn.Embedding(vocab_size, embedding_dim)
        self.lstm=nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        
        #dropout layer
        self.dropout=nn.Dropout(0.3)
        
        #Linear and sigmoid layer
        self.fc1=nn.Linear(hidden_dim, 64)
        self.fc2=nn.Linear(64, 16)
        self.fc3=nn.Linear(16,output_size)
        self.sigmoid=nn.Sigmoid()
        
    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size=x.size()
        
        #Embadding and LSTM output
        embedd=self.embedding(x)
        lstm_out, hidden=self.lstm(embedd, hidden)
        
        #stack up the lstm output
        lstm_out=lstm_out.contiguous().view(-1, self.hidden_dim)
        
        #dropout and fully connected layers
        out=self.dropout(lstm_out)
        out=self.fc1(out)
        out=self.dropout(out)
        out=self.fc2(out)
        out=self.dropout(out)
        out=self.fc3(out)
        sig_out=self.sigmoid(out)
        
        sig_out=sig_out.view(batch_size, -1)
        sig_out=sig_out[:, -1]
        
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        """Initialize Hidden STATE"""
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [None]:
vocab_size = len(integer_based_vocab)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 4

net = SentimentalLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)


SentimentalLSTM(
  (embedding): Embedding(13502, 400)
  (lstm): LSTM(400, 256, num_layers=4, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [None]:
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

# training params

epochs = 3 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float().squeeze())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, labels = inputs.cuda(), labels.cuda()  
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float().squeeze())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/3... Step: 100... Loss: -240.000015... Val Loss: -208.750011
Epoch: 1/3... Step: 200... Loss: -213.333344... Val Loss: -208.750010
Epoch: 1/3... Step: 300... Loss: -193.333344... Val Loss: -208.750012
Epoch: 1/3... Step: 400... Loss: -206.666672... Val Loss: -208.750010
Epoch: 1/3... Step: 500... Loss: -113.333336... Val Loss: -208.750010
Epoch: 1/3... Step: 600... Loss: -233.333344... Val Loss: -208.750010
Epoch: 2/3... Step: 700... Loss: -226.666672... Val Loss: -208.750010
Epoch: 2/3... Step: 800... Loss: -233.333344... Val Loss: -208.750010
Epoch: 2/3... Step: 900... Loss: -206.666672... Val Loss: -208.750011
Epoch: 2/3... Step: 1000... Loss: -220.000015... Val Loss: -208.750010
Epoch: 2/3... Step: 1100... Loss: -200.000015... Val Loss: -208.750011
Epoch: 2/3... Step: 1200... Loss: -200.000015... Val Loss: -208.750009
Epoch: 2/3... Step: 1300... Loss: -226.666672... Val Loss: -208.750010
Epoch: 3/3... Step: 1400... Loss: -153.333344... Val Loss: -208.750010
Epoch: 3/3... S

In [None]:
test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()


    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float().squeeze())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: -239.759
Test accuracy: 0.161
