In [1]:
import numpy as np
import pandas as pd
import re
import os
import zipfile

import keras
import tensorflow as tf
import tensorflow_datasets as tfds
# import tensorflow_text
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Activation, Dropout, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, GlobalAveragePooling1D, LSTM, Bidirectional
from tensorflow.python.client import device_lib
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV



### Variables

In [2]:
vocab_size = 5000
oov_token = "<OOV>"
test_size = 0.20
embed_dim=100
max_length = 128
padding_type = "post"
trunction_type = "post"

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)

num_folds = 5

### Data Prep

In [3]:
train_data_3 = pd.read_csv("data_v2/Aug_BackTranslation.csv")

from_french = train_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = train_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = train_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = train_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

train_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

val_data_3 = pd.read_csv("data_v2/Aug_Val_BackTranslation.csv")

from_french = val_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = val_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = val_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = val_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

val_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

train_data_1 = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
train_data_2 = pd.read_csv("data_v2/Aug_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
train_data = pd.concat([train_data_1, train_data_2, train_data_3], ignore_index=True)

val_data_1 = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])
val_data_2 = pd.read_csv("data_v2/Aug_Val_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
val_data = pd.concat([val_data_1, val_data_2, val_data_3], ignore_index=True)


test_data = pd.read_csv("data_v2/test_data.csv", usecols=["reviewText", "sentiment"])

In [4]:
# train_data = pd.concat([train_data, val_data], ignore_index=True)
train_data.dropna(inplace=True)
train_data = train_data.sample(frac=1)
val_data.dropna(inplace=True)

train_data.reset_index(inplace=True)

In [6]:
X_train = train_data["reviewText"].to_list()
y_train = train_data["sentiment"].tolist()

X_test = test_data["reviewText"].to_list()
y_test = test_data["sentiment"].tolist()

X_val = val_data["reviewText"].to_list()
y_val = val_data["sentiment"].tolist()

### Tokenizing

In [7]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_token)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

### Sequences

In [8]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_test_sequences = tokenizer.texts_to_sequences(X_test)


### Padding

In [9]:
X_train_padded = pad_sequences(X_train_sequences, 
                               maxlen = max_length, 
                               padding = padding_type, truncating = 
                               trunction_type)

X_val_padded = pad_sequences(X_val_sequences, 
                            maxlen = max_length, 
                            padding = padding_type,
                            truncating = trunction_type)

X_test_padded = pad_sequences(X_test_sequences,
                             maxlen = max_length,
                             padding = padding_type,
                             truncating = trunction_type)

In [13]:
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)

## Word Embeddings
### Getting embedding indexes from http://nlp.stanford.edu/data/glove.6B.zip
### Using glove.6B.100d.txt included in this notebook

In [10]:
embeddings_index = {}
f = open('glove.6B/glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


### Embedding training set, if a word is not in the stanford embedding index it will be represented as a 0

In [11]:
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Create Embedding Layer

In [12]:
embedding_layer = Embedding(input_dim = len(word_index) + 1,
                           output_dim = embed_dim,
                           weights = [embedding_matrix],
                           input_length = max_length,
                           trainable = False)

### Model Build

In [17]:

model = Sequential([
        embedding_layer,
        Conv1D(64, 3, activation='relu'),
        Dropout(0.5),
        Bidirectional(LSTM(16)),
        Dropout(0.5),
        Dense(16, kernel_regularizer=regularizers.l2(0.001),activation='relu'),
        Dense(1, activation='sigmoid')
    ])


model.compile(loss = 'binary_crossentropy', optimizer =tf.keras.optimizers.SGD(0.001)
              , metrics = ['accuracy'])


checkpoint_filepath = 'cnn_lstm_checkpoint/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

model_checkpoint_earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=5)



### Model Training

In [19]:
history = model.fit(x=X_train_padded, y=y_train, batch_size=16, 
          validation_data=(X_val_padded, y_val),
          callbacks=[model_checkpoint_callback, model_checkpoint_earlyStopping],
          epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25


In [20]:
model.evaluate(x=X_test_padded, y=y_test)



[0.658517062664032, 0.8838269114494324]

### Threshold Estimation

In [21]:
from sklearn.metrics import classification_report,precision_recall_curve, f1_score

In [22]:
def f1_score_manual(y, y_pred):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for actual_value, predicted_value in zip(y, y_pred):
        if predicted_value == actual_value:  # t?
            if predicted_value:  # tp
                tp += 1
            else:  # tn
                tn += 1
        else:  # f?
            if predicted_value:  # fp
                fp += 1
            else:  # fn
                fn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    print('----------------------------------')
    print('                 Actual Value')
    print('----------------------------------')
    print(f'            Positive    Negative')
    print(f'Positive    {tp:^8}    {fp:^8}')
    print(f'Negative    {fn:^8}    {tn:^8}')
    print('----------------------------------')
    return f1


def get_best_threshold_by_f1(y_true, y_pred, thresholds):
    best_score = -1
    best_threshold=-1
    
    for th in thresholds:
        y_pred_label = [int(yi>=th) for yi in y_pred]
        score = f1_score(y_true, y_pred_label, average="macro")
        if score > best_score:
            best_score = score
            best_threshold = th
    
    return best_threshold, best_score
    

In [23]:
train_data_1 = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
val_data_1 = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])

train_data = pd.concat([train_data_1, val_data_1], ignore_index=True)
train_data.dropna(inplace=True)


In [24]:
X_train = train_data["reviewText"].to_list()
y_train = train_data["sentiment"].tolist()
y_train = np.array(y_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, 
                               maxlen = max_length, 
                               padding = padding_type, truncating = trunction_type)


In [25]:
train_predict = model.predict(X_train_padded)

2022-12-20 23:53:15.493159: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-12-20 23:53:15.564774: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-12-20 23:53:15.571713: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [26]:
test_predict = model.predict(X_test_padded)

In [29]:
train_predict = train_predict.ravel()
test_predict = test_predict.ravel()

In [32]:
precision, recall, thresholds = precision_recall_curve(y_train, train_predict)

best_threshold, best_score = get_best_threshold_by_f1(y_train, train_predict, thresholds)

### Performance on Test Data

In [33]:
test_predict_labels = [int(x>=best_threshold) for x in test_predict]


f1_score_manual(y_test, test_predict_labels)
print(classification_report(y_test, test_predict_labels))


----------------------------------
                 Actual Value
----------------------------------
            Positive    Negative
Positive      377          34   
Negative       16          12   
----------------------------------
              precision    recall  f1-score   support

           0       0.43      0.26      0.32        46
           1       0.92      0.96      0.94       393

    accuracy                           0.89       439
   macro avg       0.67      0.61      0.63       439
weighted avg       0.87      0.89      0.87       439

