In [1]:
import os
import sys
# Resolve the parent directory and register it on sys.path (only once), so
# that modules located one level above this notebook can be imported.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Expose no CUDA devices to libraries loaded after this point
# (presumably to force TensorFlow onto the CPU).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
from glove import *
import myUtils

import pandas as pd
from pandas.io.json._normalize import nested_to_record
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

**Load GloVe model and dataset**

In [3]:
# Build the GloVe wrapper from the pretrained vectors stored on disk
# (the file name suggests Greek, 300-dimensional embeddings -- TODO confirm).
glove_model = GloveModel.from_pretrained(
    '../trained_models/el.glove.300.txt'
)

In [4]:
# Read the raw review documents and flatten their nested keys into
# '_'-separated column names.
review_dict = myUtils.read_json_v2('../data/reviews_revision.json')
reviews = pd.DataFrame.from_dict(nested_to_record(review_dict, sep='_'))

# Columns that will not be used (the document id and unused review metadata)
unused_columns = ['_id_$oid', 'meta_product_name',
                  'meta_review_pros', 'meta_review_so-so', 'meta_review_cons']

# Only the top 4 product types with most instances are kept
kept_product_types = ['Κινητά Τηλέφωνα', 'Gaming Headsets',
                      'PS4 Games', 'Activity Trackers']
other_type_rows = reviews.index[~reviews['meta_product_type'].isin(kept_product_types)]

# Drop by label (rather than boolean-slicing) so the result is a plain new
# frame that later cells can add columns to.
reviews = reviews.drop(columns=unused_columns).drop(index=other_type_rows)

**Convert each review's text to word ids using the GloVe model**

In [5]:
# Translate every review text into its sequence of GloVe vocabulary ids and
# record how many ids each review produced.
token_ids = reviews['text'].apply(glove_model.string_to_ids)
reviews['ids'] = token_ids
reviews['ids_length'] = token_ids.map(len)

**Remove outliers (reviews whose id sequence length is at or above the 95th percentile)**

In [6]:
# Length threshold: the 95th percentile of the id-sequence lengths.
q_hi = reviews['ids_length'].quantile(0.95)

# Keep only the reviews strictly shorter than the threshold and renumber rows.
is_short_enough = reviews['ids_length'].lt(q_hi)
reviews = reviews.loc[is_short_enough].reset_index(drop=True)

In [7]:
# Summary statistics of the id-sequence lengths
reviews.loc[:, 'ids_length'].describe()

count    588.000000
mean      68.221088
std       48.919174
min        5.000000
25%       32.000000
50%       54.000000
75%       92.000000
max      221.000000
Name: ids_length, dtype: float64

In [8]:
# Show five randomly chosen rows as a sanity check
reviews.sample(n=5)

Unnamed: 0,text,meta_product_type,meta_review_sentiment,ids,ids_length
136,Έχει την πλάκα του μέχρι ένα σημείο γιατί σε β...,Activity Trackers,positive,"[43, 9, 3010, 5, 95, 36, 348, 96, 16, 2882, 41...",97
319,Παρηγγείλα τα ακουστικά Zeroground μετά από ολ...,Gaming Headsets,neutral,"[39714, 14, 9689, 69, 13, 179, 31, 1810, 4762,...",59
178,Για έναν μέσο χρήστη είναι υπέρ αρκετό!! Η μπα...,Κινητά Τηλέφωνα,positive,"[11, 137, 928, 1542, 17, 793, 2857, 40, 40, 8,...",47
219,Φοβερό παιχνίδι. Παρότι δεν είμαι φανατικός με...,PS4 Games,positive,"[9341, 380, 2, 2884, 28, 338, 25036, 10, 4, 63...",31
472,Ίσως το χειρότερο κινητό τηλέφωνο έφυγα από τη...,Κινητά Τηλέφωνα,negative,"[301, 4, 3885, 2224, 1397, 16265, 13, 9, 4505,...",52


**Pad each entry's id sequence with zeros so that all sequences have the same length**

In [9]:
# Right-pad ("post") every id sequence with 0 so that all rows end up with
# the same length and form a rectangular matrix.
# NOTE(review): 0 is also used as the mask value by the embedding layer later
# on; confirm that id 0 is not a real word in the GloVe vocabulary.
id_sequences = reviews['ids'].to_numpy()
padded_ids = keras.preprocessing.sequence.pad_sequences(
    id_sequences, value=0, padding="post"
)

**Transform target labels to one-hot encoding**

In [10]:
# One-hot encode the sentiment labels into a dense (n_reviews, n_classes) array.
# The encoder is left at its default sparse output and densified explicitly:
# the keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), whereas `.toarray()` works with both.
onehot = OneHotEncoder()
sentiment_column = reviews['meta_review_sentiment'].to_numpy().reshape(-1, 1)
encoded_sentiment = onehot.fit_transform(sentiment_column).toarray()

**Fit and evaluate the model with 5-fold cross-validation**

In [11]:
# Fixed seed for the fold shuffling so that reruns evaluate on the same folds.
# (Weight initialisation and batch order are still unseeded.)
CV_SEED = 42

# Define the K-fold Cross Validator
kfold = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

# Width of the output layer follows the one-hot label matrix instead of a
# hard-coded class count.
n_classes = encoded_sentiment.shape[1]

# K-fold Cross Validation model evaluation
fold_no = 1
acc_per_fold = []   # test accuracy (in %) of each fold
loss_per_fold = []  # test loss of each fold
for train, test in kfold.split(padded_ids, encoded_sentiment):

    # A fresh model is built for every fold; drop the Keras global state left
    # behind by the previous one so it does not pile up across folds.
    keras.backend.clear_session()

    # Define the model architecture: embeddings initialised from the GloVe
    # matrix (id 0 is masked) -> bidirectional LSTM -> softmax classifier
    embeddings_layer = keras.layers.Embedding(
        input_dim=glove_model.emb_norm.shape[0], output_dim=glove_model.emb_norm.shape[1],
        weights=[glove_model.emb_norm], name='embeddings', mask_zero=True
    )
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=(padded_ids.shape[1],), name='inputs'),
        embeddings_layer,
        keras.layers.Bidirectional(keras.layers.LSTM(128), name='BLSTM'),
        keras.layers.Dense(n_classes, activation='softmax', name='linear_softmax')
    ])

    # Compile the model (metrics are passed as a list, the form `compile`
    # documents; a bare metric object is not accepted by every Keras version)
    model.compile(loss=keras.losses.CategoricalCrossentropy(),
                  optimizer=keras.optimizers.Adam(1e-4),
                  metrics=[keras.metrics.CategoricalAccuracy()])

    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Fit data to model; stop once the validation loss has not improved for
    # 10 epochs and roll back to the best weights seen.
    # NOTE(review): `validation_split` holds out the *last* 25% of the rows
    # passed to `fit`; confirm that the row order makes this representative.
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=1,
                                                   restore_best_weights=True)
    history = model.fit(padded_ids[train], encoded_sentiment[train],
                        validation_split=0.25,
                        batch_size=32,
                        epochs=50,
                        callbacks=[early_stopping])

    # Generate generalization metrics on the held-out fold
    scores = model.evaluate(padded_ids[test], encoded_sentiment[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    # Increase fold number
    fold_no = fold_no + 1

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Restoring model weights from the end of the best epoch.
Epoch 00024: early stopping
Score for fold 1: loss of 0.9467569589614868; categorical_accuracy of 51.69491767883301%
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epo

**Report per-fold and average scores**

In [12]:
# Print the loss/accuracy of every fold, followed by the mean accuracy
# (with its standard deviation) and the mean loss over all folds.
separator = '------------------------------------------------------------------------'

print('Score per fold')
for fold_idx, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold), start=1):
    print(separator)
    print(f'> Fold {fold_idx} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
print(separator)

print('Average scores for all folds:')
mean_acc = np.mean(acc_per_fold)
std_acc = np.std(acc_per_fold)
print(f'> Accuracy: {mean_acc} (+- {std_acc})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print(separator)

Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.9467569589614868 - Accuracy: 51.69491767883301%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.8705316185951233 - Accuracy: 63.55932354927063%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.8849905729293823 - Accuracy: 65.25423526763916%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.8941837549209595 - Accuracy: 61.538463830947876%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.9348123669624329 - Accuracy: 57.26495981216431%
------------------------------------------------------------------------
Average scores for all folds:
> Accuracy: 59.862380027770996 (+- 4.8785814381208334)
> Loss: 0.906255054473877
------------------------------------------------------------------------
