In [1]:
from tensorflow.compat.v1 import ConfigProto, InteractiveSession

# Open an interactive TF1-compat session with the GPU `allow_growth` option on
# (presumably so TensorFlow does not reserve all GPU memory up front).
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [2]:
import os
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Reshape, Embedding, LSTM
from keras.preprocessing import text,sequence
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

In [3]:
# Read the labelled dataset, then display the non-null count of every column.
df = pd.read_csv(filepath_or_buffer='./Dataset/complete10000.csv')
df.count()

polarity    10000
text        10000
dtype: int64

In [4]:
# Seed handed to train_test_split so the split is repeatable.
SEED = 2000
# Targets: the polarity column as stored in the frame.
y = df['polarity'].values
# Inputs: every entry of the text column coerced to str.
x = df['text'].map(str).values


In [5]:
# Hold out 10% of the rows for evaluation; SEED keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.1, random_state=SEED)

# Fit the word-level tokenizer on the training texts only, then convert both splits.
# NOTE(review): num_words=200 limits the vocabulary used by texts_to_sequences —
# confirm this is intended and was not confused with the maxlen=200 padding width.
tk = text.Tokenizer(num_words=200, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',lower=True, split=" ")
tk.fit_on_texts(x_train)
sequences_train = tk.texts_to_sequences(x_train)
sequences_test = tk.texts_to_sequences(x_test)
word_index = tk.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad / truncate every sequence to 200 ids. Each split is padded exactly once
# (the test split used to be padded twice by a duplicated line).
x_train = pad_sequences(sequences_train, maxlen=200)
x_test = pad_sequences(sequences_test, maxlen=200)

Found 19956 unique tokens.


### Word2Vec + LSTM

In [6]:
# Restore the saved Word2Vec + LSTM model from its HDF5 checkpoint.
word2vec = tf.keras.models.load_model(filepath='./Checkpoints/Word2Vec_LSTM.h5')

In [7]:
# Score the Word2Vec + LSTM model on the held-out split and print both values.
loss, accuracy = word2vec.evaluate(
    x=x_test,
    y=y_test,
    batch_size=32,
    verbose=1,
)
print('Model Loss: {:0.4f} | Model Accuracy: {:.4f}'.format(loss, accuracy))

Model Loss: 0.4354 | Model Accuracy: 0.8280


### GloVe + LSTM

In [8]:
# Restore the saved GloVe + LSTM model from its HDF5 checkpoint.
glove = tf.keras.models.load_model(filepath='./Checkpoints/GloVe_LSTM.h5')

In [9]:
# Score the GloVe + LSTM model on the held-out split and print both values.
loss, accuracy = glove.evaluate(
    x=x_test,
    y=y_test,
    batch_size=32,
    verbose=1,
)
print('Model Loss: {:0.4f} | Model Accuracy: {:.4f}'.format(loss, accuracy))

Model Loss: 0.4893 | Model Accuracy: 0.8420


### ELMo + LSTM

In [10]:
# Restore the saved ELMo + LSTM model from its HDF5 checkpoint.
elmo = tf.keras.models.load_model(filepath='./Checkpoints/ELMO_LSTM.h5')

In [11]:
# Score the ELMo + LSTM checkpoint, which is bound to `elmo`. The name `model`
# is never assigned in this notebook, so evaluating it only worked with leftover
# kernel state and would not survive Restart & Run All.
loss, accuracy = elmo.evaluate(x=x_test, y=y_test, batch_size=32, verbose=1)
print('Model Loss: {:0.4f} | Model Accuracy: {:.4f}'.format(loss, accuracy))

Model Loss: 0.6594 | Model Accuracy: 0.9180


## BERT

In [5]:
class Preprocess:
  """Tokenise a labelled DataFrame with a BERT tokenizer and build padded train/test splits.

  After construction the instance exposes:
    max_seq_len      -- min(longest tokenised row including [CLS]/[SEP], the
                        ``max_seq_len`` argument); also the width of train_x/test_x.
    train_x, test_x  -- 2-D arrays of token ids, zero-padded to ``max_seq_len``.
    train_y, test_y  -- position of each row's label inside ``classes``.
  """
  DATA_COLUMN = "text"
  LABEL_COLUMN = "polarity"

  def __init__(self, df, tokenizer: "FullTokenizer", classes, max_seq_len=192,
               test_size=.1, seed=2000):
    """Tokenise ``df``, split it, and pad both splits.

    Args:
      df: frame with a ``text`` column and a ``polarity`` column.
      tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
      classes: list of label values; a label's index in it becomes its class id.
      max_seq_len: upper bound on the padded sequence length.
      test_size: fraction of rows held out for the test split (default keeps
        the 10% split this class has always used).
      seed: random_state for the split (default keeps the historical 2000).
    """
    self.tokenizer = tokenizer
    self.max_seq_len = 0  # raised by _prepare to the longest tokenised row
    self.classes = classes
    df_x, df_y = self._prepare(df)
    train_x, test_x, train_y, test_y = train_test_split(
      df_x, df_y, test_size=test_size, random_state=seed)

    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [train_x, test_x])
    self.train_y = train_y
    self.test_y = test_y

  def _prepare(self, df):
    """Convert every row to ``[CLS] tokens [SEP]`` ids and a class id.

    Returns:
      (x, y): ``x`` is a plain list of variable-length id lists, ``y`` a NumPy
      array of class ids. ``x`` is deliberately not wrapped in ``np.array``:
      the rows are ragged, which NumPy warns about or rejects, and which turns
      into a 2-D array when all rows happen to have the same length.
    """
    x, y = [], []

    # `raw_text` rather than `text`, so the imported keras `text` module is not shadowed.
    for _, row in tqdm(df.iterrows(), total=len(df)):
      raw_text, label = row[Preprocess.DATA_COLUMN], row[Preprocess.LABEL_COLUMN]
      tokens = ["[CLS]"] + self.tokenizer.tokenize(raw_text) + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    return x, np.array(y)

  def _pad(self, ids):
    """Truncate/zero-pad every id sequence to ``self.max_seq_len`` and stack them.

    NOTE(review): rows longer than ``max_seq_len - 2`` are cut to that length
    and then padded with two zeros, so their trailing [SEP] id is dropped.
    Kept as-is so inputs match what existing checkpoints were evaluated on.
    """
    x = []
    for input_ids in ids:
      # list(...) so that `+` concatenates even when a row arrives as an ndarray
      # (ndarray + list would add elementwise instead of padding).
      input_ids = list(input_ids)[:self.max_seq_len - 2]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))
    return np.array(x)

In [6]:
# NOTE(review): value kept unchanged, but the checkpoint, config and vocab files
# are read from this path's parent directory, not from the path itself.
bert_model_name="/home/ritika/PracticeSchool/Project/BERT/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12"

bert_ckpt_dir = bert_model_name

# os.path.join drops every component before an absolute one, so joining
# bert_ckpt_dir with a full absolute path ignores bert_ckpt_dir entirely.
# Build the same three file paths from a single base directory instead.
bert_files_dir = os.path.dirname(bert_model_name)
bert_ckpt_file = os.path.join(bert_files_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_files_dir, "bert_config.json")

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_files_dir, "vocab.txt"))

# A label's position in this list becomes its class id.
# NOTE(review): the order comes from df.polarity.unique() — confirm it matches
# the label ids the checkpoints were trained with.
classes = df.polarity.unique().tolist()

data = Preprocess(df, tokenizer, classes, max_seq_len=128)

max_seq_len = data.max_seq_len

10000it [00:03, 3000.76it/s]
  return np.array(x), np.array(y)


max seq_len 168


### BERT + LSTM

In [22]:
import tensorflow as tf

# Restore the BERT + LSTM checkpoint without compiling it; custom_objects supplies
# the classes for the 'BertModelLayer' and 'Functional' names.
bert_lstm = tf.keras.models.load_model(
  './Checkpoints/BERT_LSTM.h5',
  custom_objects={
    'BertModelLayer': BertModelLayer,
    'Functional': tf.keras.models.Model,
  },
  compile=False,
)

In [25]:
# The model was loaded with compile=False, so give it a loss, a metric and an
# optimizer here before it is evaluated.
bert_lstm.compile(
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc")],
  optimizer=tf.keras.optimizers.Adam(1e-5),
)

In [26]:
# Score BERT + LSTM on the test split built by Preprocess and print both values.
loss, accuracy = bert_lstm.evaluate(
  x=data.test_x,
  y=data.test_y,
  verbose=1,
)
print('Model Loss: {:0.4f} | Model Accuracy: {:.4f}'.format(loss, accuracy))

Model Loss: 0.1432 | Model Accuracy: 0.9650


### BERT + CLR

In [34]:
import tensorflow as tf

# Restore the BERT + CLR checkpoint without compiling it; custom_objects supplies
# the classes for the 'BertModelLayer' and 'Functional' names.
bert_clr = tf.keras.models.load_model(
  './Checkpoints/BERT_CLR.h5',
  custom_objects={
    'BertModelLayer': BertModelLayer,
    'Functional': tf.keras.models.Model,
  },
  compile=False,
)

In [35]:
# The model was loaded with compile=False, so give it a loss, a metric and an
# optimizer here before it is evaluated.
# NOTE(review): 0.9 is SGD's first positional argument — confirm it was meant as
# the learning rate; it has no bearing on evaluate().
bert_clr.compile(
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc")],
  optimizer=tf.keras.optimizers.SGD(0.9),
)

In [36]:
# Score BERT + CLR on the test split built by Preprocess and print both values.
loss, accuracy = bert_clr.evaluate(
  x=data.test_x,
  y=data.test_y,
  verbose=1,
)
print('Model Loss: {:0.4f} | Model Accuracy: {:.4f}'.format(loss, accuracy))

Model Loss: 0.1453 | Model Accuracy: 0.9610


### BERT + CLR + LSTM

In [7]:
import tensorflow as tf

# Restore the BERT + CLR + LSTM checkpoint without compiling it; custom_objects
# supplies the classes for the 'BertModelLayer' and 'Functional' names.
bert_clr_lstm = tf.keras.models.load_model(
  './Checkpoints/BERT_LSTM_CLR.h5',
  custom_objects={
    'BertModelLayer': BertModelLayer,
    'Functional': tf.keras.models.Model,
  },
  compile=False,
)

In [8]:
# The model was loaded with compile=False, so give it a loss, a metric and an
# optimizer here before it is evaluated.
# NOTE(review): 0.9 is SGD's first positional argument — confirm it was meant as
# the learning rate; it has no bearing on evaluate().
bert_clr_lstm.compile(
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc")],
  optimizer=tf.keras.optimizers.SGD(0.9),
)

In [10]:
# Score BERT + CLR + LSTM on the test split built by Preprocess and print both values.
loss, accuracy = bert_clr_lstm.evaluate(
  x=data.test_x,
  y=data.test_y,
  verbose=1,
)
print('Model Loss: {:0.4f} | Model Accuracy: {:.4f}'.format(loss, accuracy))

Model Loss: 0.0621 | Model Accuracy: 0.9790
