In [None]:
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Report the installed TensorFlow and TF-Hub versions, in that order.
for module in (tf, hub):
    print(module.__version__)

2.2.0
0.8.0


In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whatever `pip` the shell happens to resolve.
# TODO(review): pin the version that was used for the recorded results.
%pip install bert-for-tf2



In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whatever `pip` the shell happens to resolve.
# TODO(review): pin the version that was used for the recorded results.
%pip install sentencepiece



In [None]:
import bert

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the three dataset splits from the working directory. The unpacking
# order on the left matches the file order on the right.
movie_test, movie_train, movie_valid = (
    pd.read_csv(csv_name)
    for csv_name in ('Test.csv', 'Train.csv', 'Valid.csv')
)

In [None]:
# Show the column names of the test split.
movie_test.columns.values

array(['text', 'label'], dtype=object)

In [None]:
# Show the distinct values present in the `label` column of the test split.
movie_test.label.unique()

array([0, 1])

In [None]:
# Print (rows, columns) for the train, test and validation frames, in that order.
for split_frame in (movie_train, movie_test, movie_valid):
    print(split_frame.shape)


(40000, 2)
(5000, 2)
(5000, 2)


In [None]:
import re

# Precompiled patterns shared by the text-cleaning helpers below.
TAG_RE = re.compile(r'<[^>]+>')
NON_LETTER_RE = re.compile(r'[^a-zA-Z]')
# The lookahead leaves the trailing whitespace unconsumed so that it can serve
# as the leading whitespace of the next match. Without it, every second letter
# in a run of isolated letters ("x y z") would survive.
SINGLE_CHAR_RE = re.compile(r'\s+[a-zA-Z](?=\s)')
WHITESPACE_RUN_RE = re.compile(r'\s+')


def remove_tags(text):
    """Return `text` with every `<...>` markup tag deleted."""
    return TAG_RE.sub('', text)


def preprocess_text(sen):
    """Clean one raw review string.

    Steps, in order:
      1. strip HTML-style tags,
      2. replace every character that is not an ASCII letter with a space,
      3. drop isolated single letters that are surrounded by whitespace,
      4. collapse runs of whitespace into a single space.

    Args:
        sen: the raw review text (a `str`).

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped, and
        a single letter at the very start or end of the text is kept because
        it is not surrounded by whitespace on both sides.
    """
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = NON_LETTER_RE.sub(' ', sentence)

    # Single character removal (handles consecutive single letters too)
    sentence = SINGLE_CHAR_RE.sub('', sentence)

    # Removing multiple spaces
    sentence = WHITESPACE_RUN_RE.sub(' ', sentence)

    return sentence

In [None]:
# Clean every training review with preprocess_text, preserving dataset order.
sentences = list(movie_train['text'])
reviews = list(map(preprocess_text, sentences))

In [None]:
# Spot-check one cleaned review (index 10).
reviews[10]

'I can believe people are looking for plot in this film This is Laural and Hardy Lighten up already These two were riot Their comic genius is as funny today as it was years ago Not filthy word out of either mouth and they were able to keep audiences in stitches Their comedy wasn sophisticated by any stretch If whoopee cushion can make you grin there no reason to watch any of the stuff these guys did It was simpler time and people laughed at stuff that was funny without plot guess it takes simple mind to enjoy this stuff so qualify Two man comedy teams don compute We re just too sophisticated Aren we fortunate '

In [None]:
import numpy as np

In [None]:
# Materialise the training labels as a NumPy array, in dataset order.
y = np.array(list(movie_train['label']))

In [None]:
# Label stored at index 10, i.e. the label paired with reviews[10].
y[10]

1

In [None]:
# Build a tokenizer whose vocabulary and casing come from the TF-Hub BERT module.
BertTokenizer = bert.bert_tokenization.FullTokenizer
# The hub layer is loaded with trainable=False; in this cell it is only used to
# read the vocabulary file path and the lower-casing flag from the module.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)
# Path of the vocabulary asset shipped with the module.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# Whether the module expects lower-cased input text.
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [None]:
def tokenize_reviews(text_reviews):
    """Tokenize one review string and map its tokens to vocabulary ids.

    Uses the notebook-level `tokenizer`; returns whatever
    `tokenizer.convert_tokens_to_ids` produces for the tokenized text.
    """
    tokens = tokenizer.tokenize(text_reviews)
    return tokenizer.convert_tokens_to_ids(tokens)


# One id sequence per cleaned review, in the same order as `reviews`.
tokenized_reviews = list(map(tokenize_reviews, reviews))

In [None]:
# Pair each tokenized review with the label stored at the same index.
reviews_with_len = [
    (tokenized_reviews[idx], y[idx])
    for idx in range(len(tokenized_reviews))
]


In [None]:
# Wrap the (token ids, label) pairs in a tf.data pipeline; both components are
# declared as int32.
processed_dataset = tf.data.Dataset.from_generator(
    generator=lambda: reviews_with_len,
    output_types=(tf.int32, tf.int32),
)

In [None]:
BATCH_SIZE = 32

# Pad the variable-length id sequences (shape (None,)) within each batch;
# labels are scalars (shape ()) and need no padding.
batched_dataset = processed_dataset.padded_batch(
    batch_size=BATCH_SIZE,
    padded_shapes=((None, ), ()),
)

In [None]:
import math

# Seed for the batch-level shuffle so the train/test split is reproducible.
SPLIT_SEED = 42

TOTAL_BATCHES = math.ceil(len(reviews_with_len) / BATCH_SIZE)
print(TOTAL_BATCHES)
# Hold out one tenth of the batches for testing.
TEST_BATCHES = TOTAL_BATCHES // 10
print(TEST_BATCHES)

# tf.data datasets are immutable: shuffle() returns a NEW dataset, so its
# result must be captured (the bare call was a no-op).
# reshuffle_each_iteration=False keeps the order identical on every pass,
# which is required for take()/skip() to yield disjoint test/train sets.
shuffled_dataset = batched_dataset.shuffle(
    TOTAL_BATCHES,
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
)
test_data = shuffled_dataset.take(TEST_BATCHES)
train_data = shuffled_dataset.skip(TEST_BATCHES)

1250
125


In [None]:
from tensorflow.keras import layers


class TEXT_MODEL(tf.keras.Model):
    """Text classifier: embedding -> three parallel Conv1D branches -> dense head.

    The three convolution branches use kernel sizes 2, 3 and 4 over the same
    embedded sequence; each branch is globally max-pooled, the pooled vectors
    are concatenated and passed through a dense layer, dropout and the output
    layer.

    Args:
        vocabulary_size: number of rows in the embedding table.
        embedding_dimensions: size of each embedding vector.
        cnn_filters: number of filters in each of the three Conv1D layers.
        dnn_units: width of the hidden dense layer.
        model_output_classes: 2 builds a single sigmoid unit; any other value
            builds a softmax layer with that many units.
        dropout_rate: rate passed to the Dropout layer.
        training: accepted for backward compatibility; not used by the model.
        name: Keras model name.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        # Three parallel convolutions that differ only in kernel size.
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # The same pooling layer instance is applied to every branch.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            # Binary case: one sigmoid unit.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences fed to the embedding layer.
            training: forwarded to the Dropout layer. Defaults to None so the
                model can also be called directly without the argument.

        Returns:
            The output of the final dense layer.
        """
        embedded = self.embedding(inputs)

        # Convolve + global-max-pool each branch, in kernel-size order 2, 3, 4.
        pooled_branches = [
            self.pool(conv(embedded))
            for conv in (self.cnn_layer1, self.cnn_layer2, self.cnn_layer3)
        ]

        features = tf.concat(pooled_branches, axis=-1)  # (batch_size, 3 * cnn_filters)
        features = self.dense_1(features)
        features = self.dropout(features, training=training)
        model_output = self.last_dense(features)

        return model_output


In [None]:
# Hyperparameters for the classifier.
VOCAB_LENGTH = len(tokenizer.vocab)  # embedding table covers the tokenizer vocabulary
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

# Gather the constructor arguments in one place, then build the model.
model_kwargs = dict(
    vocabulary_size=VOCAB_LENGTH,
    embedding_dimensions=EMB_DIM,
    cnn_filters=CNN_FILTERS,
    dnn_units=DNN_UNITS,
    model_output_classes=OUTPUT_CLASSES,
    dropout_rate=DROPOUT_RATE,
)
text_model = TEXT_MODEL(**model_kwargs)

In [None]:
# Choose the loss/metric pair that matches the output layer, then compile once.
is_binary = OUTPUT_CLASSES == 2
loss_name = "binary_crossentropy" if is_binary else "sparse_categorical_crossentropy"
metric_name = "accuracy" if is_binary else "sparse_categorical_accuracy"

text_model.compile(loss=loss_name,
                   optimizer="adam",
                   metrics=[metric_name])
    

In [None]:
# Train on the training batches for NB_EPOCHS epochs. The History object
# returned by fit() is the last expression, so it is shown as the cell output.
text_model.fit(train_data, epochs=NB_EPOCHS)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8d3de660f0>

In [None]:
# Evaluate on the held-out batches. With the compile settings used for
# OUTPUT_CLASSES == 2, `results` holds the loss followed by the accuracy.
results = text_model.evaluate(test_data)
print(results)


[0.8214634656906128, 0.8585000038146973]


In [None]:
from sklearn import metrics

# `data_test` / `df_results` were never defined in this notebook, so the
# labels and predictions are derived from `test_data` and `text_model` instead.
# Both are collected in a single pass so they stay aligned batch by batch.
true_batches, pred_batches = [], []
for token_ids, labels in test_data:
    scores = text_model(token_ids, training=False).numpy()
    if scores.shape[-1] == 1:
        # Single sigmoid unit: threshold the probability at 0.5.
        batch_pred = (scores[:, 0] >= 0.5).astype(int)
    else:
        # Softmax output: pick the most probable class.
        batch_pred = scores.argmax(axis=-1)
    true_batches.append(labels.numpy())
    pred_batches.append(batch_pred)

y_true = np.concatenate(true_batches)
y_pred = np.concatenate(pred_batches)

print("Accuracy",metrics.accuracy_score(y_true, y_pred))
print("F1-Score",metrics.f1_score(y_true, y_pred,average='weighted'))