In [None]:
!pip install -q -U keras-tuner # install keras-tuner for hyper-parameters optimization

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, Dropout, Conv1D, BatchNormalization, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
from google.colab import drive
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
import keras_tuner
from keras_tuner.tuners import RandomSearch
from matplotlib import pyplot as plt

# Global variables
BATCH_SIZE = 64  # mini-batch size used by the tuner search and the cross-validation fits
checkpoint_filepath = "/tmp/checkpoint.weights.h5"  # file the ModelCheckpoint callback writes to

# Data Loading and Pre-processing

In [None]:
# Choose the review domain to analyse (keep exactly one assignment uncommented).
#domain = "Social_Networking"
#domain = "Games"
domain = "Productivity"

# Map each supported domain to its CSV download link; any other value
# leaves dataset_url as an empty string.
dataset_url = {
    "Social_Networking": "https://drive.usercontent.google.com/uc?id=1ULtLULPs-8PkOJHxXWKHdN1GhamI4co-&export=download",
    "Games": "https://drive.usercontent.google.com/uc?id=1xrA2aO_0eNK6aQrpa6yf9SHiAzN112I3&export=download",
    "Productivity": "https://drive.usercontent.google.com/uc?id=1vJ1Gek_wVdV2BN_Yv2KdE4bWXeaDn8EP&export=download",
}.get(domain, "")


In [None]:
# Read the selected domain's CSV directly from its download URL.
df = pd.read_csv(filepath_or_buffer=dataset_url)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,domain,app,review_id,sentence_id,title,review,sentence,rating,is_opinion,category,term,from,to,sentiment,category_id,sentiment_id,category_id_sentiment_id
0,productivity,notability,3dd4f1ab-9433-49b1-91c9-5391373063f8,c6c55c2d-2257-4da4-a959-6622d8619a54,"*updated* Fixed some glitches, bad changes, ne...",*new update* they fixed some import issues and...,"For example, main divider --> dividers A, B, &...",5,True,general,divider,19.0,26.0,positive,6,1,6_1
1,productivity,gmail-email-by-google,43e1aef5-d46a-48bf-a672-cc8bf8410178,6dead894-5be5-4131-8c52-0a0bcc2cfd2a,"Works for email, no split screen","This app works well for email, particularly my...",Even calendar allows a slide over... please ad...,4,True,effectiveness,calendar,6.0,14.0,positive,3,1,3_1
2,productivity,gmail-email-by-google,9aac0ac9-bdd8-42a2-88c0-648d6f6b2cfd,984fe632-4815-46af-8552-6cca28bb995a,Attachment,I acknowledge that one star is a severe respon...,"I was attempting to attach an important, time ...",1,True,reliability,document,57.0,65.0,positive,8,1,8_1
3,productivity,evernote-notes-organizer,0877dc72-b915-49ae-af9b-fd49de658f51,fce9100d-528b-4d3b-b338-a8187340a5a3,Latest upgrade a total FAIL,I want to change my rating from one star to Ze...,"However, the new version just updated to my iP...",1,True,compatibility,update,31.0,37.0,negative,2,0,2_0
4,productivity,things-3,8400a21d-e0b9-44c9-a242-8ab68efd7e86,a81e8c27-a6a6-4c25-8bb4-75da38314822,Dream come True,I am one of those who seldom spare time to wri...,You don’t want to worry about figuring out the...,5,True,general,reminder,59.0,67.0,positive,6,1,6_1


In [None]:
# Separate the model input (sentence text) from the target (sentiment id).
X, Y = df["sentence"], df["sentiment_id"]

In [None]:
# Display vocabulary size: number of distinct lower-cased, whitespace-separated words.
# Iterating over X directly yields whole sentences, so each sentence has to be
# split into words first; otherwise unique sentences are counted instead.
# Punctuation is not stripped here, so the figure is only an approximation of
# the vocabulary a tokenizer would build.
num_words = len({word for sentence in X for word in sentence.lower().split()})
print("Vocabulary size: {}".format(num_words))

# Display dataset size (number of sentences)
print("Dataset size: {}".format(len(X)))

Vocabulary size: 3354
Dataset size: 3774


In [None]:
# Pre-process text
# Take the raw sentences from the DataFrame rather than from X, because X is
# rebound to the padded integer matrix below; this keeps the cell re-runnable.
sentences = df["sentence"]

# Build the word -> integer-id vocabulary and encode every sentence as a list of ids.
tok = Tokenizer()
tok.fit_on_texts(sentences)
sequences = tok.texts_to_sequences(sentences)

# Pad to the longest sentence measured in TOKENS. Measuring the longest raw
# string in characters would make maxlen several times larger than any encoded
# sequence and fill the model input with padding.
max_len = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_len)

# DL Models Utils

## Construct models with different hyperparameters

In [None]:
# Construct LSTM

def build_LSTM_model(hp):
    """Build and compile a stacked-LSTM binary classifier for Keras Tuner.

    Architecture: Embedding -> first LSTM -> 1 to 4 intermediate LSTMs ->
    last LSTM -> Dropout -> Dense(1, sigmoid).

    Tuned hyper-parameters:
        lstm_first_units, lstm_<k>_units, lstm_last_units: 32-128 in steps of 32.
        n_layers: number of intermediate LSTM layers, 1-4.
        Dropout_rate: 0.0-0.5 in steps of 0.1.

    Args:
        hp: hyper-parameter container supplied by the tuner.

    Returns:
        A Sequential model compiled with binary cross-entropy, Adam and accuracy.
    """
    model = Sequential()

    # Tokenizer word ids start at 1 and 0 is used for padding, so the embedding
    # table needs len(word_index) + 1 rows; without the +1 the highest word id
    # falls outside the table.
    model.add(Embedding(len(tok.word_index) + 1, 128))

    # First recurrent layer; it returns the full sequence so more LSTMs can be stacked.
    model.add(LSTM(
        hp.Int('lstm_first_units', min_value=32, max_value=128, step=32),
        return_sequences=True)
    )

    # Tunable number of intermediate recurrent layers (names start at lstm_2_units).
    for i in range(hp.Int('n_layers', 1, 4)):
        model.add(LSTM(
            hp.Int(f'lstm_{i+2}_units', min_value=32, max_value=128, step=32),
            return_sequences=True)
        )

    # Last recurrent layer returns only its final output, giving one vector per sentence.
    model.add(LSTM(hp.Int('lstm_last_units', min_value=32, max_value=128, step=32)))

    model.add(Dropout(hp.Float('Dropout_rate', min_value=0, max_value=0.5, step=0.1)))

    # Single sigmoid unit: probability of the positive class.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

    return model

In [None]:
# CNN
def build_CNN_model(hp):
    """Build and compile a 1-D CNN binary classifier for Keras Tuner.

    Architecture: Embedding -> 1 to 3 convolution blocks -> Flatten ->
    Dense(128) -> BatchNormalization -> Dropout -> Dense(1, sigmoid).
    One block is Conv1D, BatchNormalization, Conv1D, BatchNormalization,
    MaxPooling1D, Dropout; block i uses ``filters * i`` filters and a dropout
    rate of ``0.2 + i / 10``.

    Tuned hyper-parameters:
        filters: base number of convolution filters, 32-128 in steps of 32.
        num_conv_blocks: 2-4, used as the exclusive upper bound of the block
            loop, i.e. 1-3 blocks are built.

    Note:
        Flatten needs fixed-length inputs, and every block shortens the
        sequence, so the padded input must be long enough for the deepest
        configuration.

    Args:
        hp: hyper-parameter container supplied by the tuner.

    Returns:
        A Sequential model compiled with binary cross-entropy, Adam and accuracy.
    """
    # Base filter count: 32, 64, 96 or 128
    hp_filters = hp.Int('filters', min_value = 32, max_value = 128, step = 32)

    # create model object
    model = Sequential()

    # +1 because Tokenizer word ids start at 1 and 0 is the padding id
    model.add(Embedding(len(tok.word_index) + 1, 128))

    # ---- Tune the number of CNN blocks: 1-3 ----
    for i in range(1, hp.Int("num_conv_blocks", 2, 4)):
        model.add(Conv1D(filters = hp_filters * i, kernel_size = (3), activation = 'relu'))
        model.add(BatchNormalization())
        model.add(Conv1D(filters = hp_filters * i, kernel_size = (3), activation = 'relu'))
        model.add(BatchNormalization())
        model.add(MaxPooling1D((2)))
        model.add(Dropout(0.2 + (i/10)))

    # ---- Fully-connected layer ----
    # Collapse the (steps, channels) feature map into one vector per sentence.
    # Without this the Dense layers act on every time step and the model emits
    # a 3-D output that cannot be compared with the one-label-per-sentence targets.
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # ---- Output layer ----
    model.add(Dense(1, activation='sigmoid'))

    # ---- Compile model ----
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

    return model

## Define Callbacks

In [None]:
# Stop a fit once the validation loss has gone 5 epochs without improving.
early_stopping_callback = EarlyStopping(patience=5, monitor='val_loss')

# Write the model weights to checkpoint_filepath each time the validation loss
# reaches a new best value.
model_checkpoint_callback = ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)

## Cross Validation

In [None]:
def cross_validation(best_hps, n_splits=10, n_repeats=10, random_state=11):
  """Score a hyper-parameter set with repeated stratified k-fold cross-validation.

  For every fold a fresh model is built from the notebook-level ``tuner``,
  trained on the training indices (20% of which Keras holds out for
  validation), rolled back to its best-validation-loss weights and evaluated
  on the held-out test indices. Reads the notebook-level ``X``, ``Y``,
  ``tuner``, ``BATCH_SIZE`` and ``checkpoint_filepath``.

  Args:
    best_hps: hyper-parameters passed to ``tuner.hypermodel.build``.
    n_splits: number of folds per repetition (default 10).
    n_repeats: number of repetitions (default 10, i.e. 100 fits in total).
    random_state: seed of the fold generator (default 11).

  Returns:
    List with the test accuracy (in percent) of every fold.
  """
  kfold = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

  # kfold.split yields integer POSITIONS; plain arrays guarantee positional
  # indexing even if Y is a pandas Series with a non-default index.
  features = np.asarray(X)
  labels = np.asarray(Y)

  cvscores = []

  for fold_id, (train, test) in enumerate(kfold.split(features, labels), start=1):

    # Fresh model for this fold; the builders in this notebook return it already compiled.
    model = tuner.hypermodel.build(best_hps)

    # Fresh callbacks per fold: a reused ModelCheckpoint keeps its best
    # val_loss from earlier fits and could leave a checkpoint written by a
    # previous fold's model on disk.
    fold_callbacks = [
        EarlyStopping(monitor='val_loss', patience=5),
        ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=True,
            monitor='val_loss',
            save_best_only=True),
    ]

    # Fit the model
    model.fit(
        features[train],
        labels[train],
        batch_size=BATCH_SIZE,
        epochs=100,
        validation_split=0.2,
        callbacks=fold_callbacks,
        verbose=0)

    # Evaluate the best epoch rather than the last one, which early stopping
    # reaches only after several epochs without improvement.
    model.load_weights(checkpoint_filepath)

    # evaluate the model
    scores = model.evaluate(features[test], labels[test], verbose=0)
    accuracy = scores[1] * 100
    print("Accuracy of fold (%s) = %.2f%%" % (fold_id, accuracy))
    cvscores.append(accuracy)

  print("Averaged accuracy = %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

  return cvscores

# Experiments

## LSTM

In [None]:
# Tuning LSTM: random search over the build_LSTM_model space, trials ranked by validation loss
tuner = RandomSearch(
    hypermodel=build_LSTM_model,
    objective='val_loss',
    max_trials=10,
    overwrite=True,
)

# Each trial trains for up to 100 epochs, cut short by early stopping
tuner.search(
    X,
    Y,
    epochs=100,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks=[early_stopping_callback],
)

# Keep the hyper-parameters of the best trial
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Rebuild an untrained model from them and show its layers
hypermodel = tuner.hypermodel.build(best_hps)
print("Best Model Summary:")
hypermodel.summary()

# Retrain the best configuration under cross validation
cross_validation(best_hps)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
96                |96                |lstm_first_units
4                 |4                 |n_layers
96                |96                |lstm_2_units
32                |32                |lstm_last_units
0.4               |0.4               |Dropout_rate

Epoch 1/100
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 55ms/step - accuracy: 0.5099 - loss: 0.6932 - val_accuracy: 0.5060 - val_loss: 0.6930
Epoch 2/100
[1m11/48[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m1s[0m 38ms/step - accuracy: 0.5207 - loss: 0.6934

KeyboardInterrupt: 

## CNN

In [None]:
# Tuning CNN: random search over the build_CNN_model space, trials ranked by validation loss
tuner = RandomSearch(
    hypermodel=build_CNN_model,
    objective='val_loss',
    max_trials=10,
    overwrite=True,
)

# Each trial trains for up to 100 epochs, cut short by early stopping
tuner.search(
    X,
    Y,
    epochs=100,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks=[early_stopping_callback],
)

# Keep the hyper-parameters of the best trial
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Rebuild an untrained model from them and show its layers
hypermodel = tuner.hypermodel.build(best_hps)
print("Best Model Summary:")
hypermodel.summary()

# Retrain the best configuration under cross validation
cross_validation(best_hps)

Trial 2 Complete [00h 00m 01s]

Best val_loss So Far: None
Total elapsed time: 00h 00m 03s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
64                |128               |filters
4                 |3                 |num_conv_blocks

Epoch 1/100


KeyboardInterrupt: 