<a href="https://colab.research.google.com/github/Vigneswaran978/NLP/blob/master/Bert_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stage 1: Importing dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/a1/acb891630749c56901e770a34d6bac8a509a367dd74a05daf7306952e910/bert-for-tf2-0.14.9.tar.gz (41kB)
[K     |████████                        | 10kB 24.7MB/s eta 0:00:01[K     |████████████████                | 20kB 15.5MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 9.7MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 8.3MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.1MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/aa/e0/4f663d8abf83c8084b75b995bd2ab3a9512ebc5b97206fde38cef906ab07/py-params-0.10.2.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [

In [4]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [5]:
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "/content/drive/MyDrive/tweets/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)


In [7]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

## Preprocessing

### Cleaning

In [8]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [9]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [10]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [14]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [15]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [16]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

### Dataset creation

We will create padded batches (so we pad sentences for each batch independently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [18]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]

In [20]:
import pandas as pd

In [21]:
pd.DataFrame(data_with_len)

Unnamed: 0,0,1,2
0,"[22091, 2860, 2860, 2008, 1005, 1055, 1037, 26...",0,24
1,"[2003, 6314, 2008, 2002, 2064, 1005, 1056, 106...",0,29
2,"[1045, 11529, 2094, 2116, 2335, 2005, 1996, 36...",0,18
3,"[2026, 2878, 2303, 5683, 2009, 11714, 1998, 20...",0,11
4,"[2053, 2009, 1005, 1055, 2025, 2022, 3270, 645...",0,32
...,...,...,...
1599995,"[2074, 8271, 2039, 1012, 2383, 2053, 2082, 200...",1,12
1599996,"[1996, 21724, 2497, 1012, 4012, 2200, 4658, 20...",1,16
1599997,"[2024, 2017, 3201, 2005, 2115, 28017, 2191, 78...",1,13
1599998,"[3407, 16215, 5798, 2000, 2026, 22017, 1997, 2...",1,19


In [22]:
data_with_len.sort(key=lambda x: x[2])

In [23]:
pd.DataFrame(data_with_len)

Unnamed: 0,0,1,2
0,[],0,0
1,[],0,0
2,[],0,0
3,[],0,0
4,[],0,0
...,...,...,...
1599995,"[3145, 1029, 1029, 1029, 1029, 1029, 1029, 102...",1,114
1599996,"[1029, 1029, 1029, 1029, 1029, 1029, 1029, 102...",1,115
1599997,"[2023, 8327, 22753, 2003, 2524, 999, 999, 999,...",0,116
1599998,"[2420, 6229, 2047, 2327, 6718, 1029, 1029, 102...",1,116


In [24]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [25]:
pd.DataFrame(sorted_all)

Unnamed: 0,0,1
0,"[18414, 2102, 12849, 4747, 7916, 1045, 2066, 2...",1
1,"[2003, 2067, 3773, 13736, 4633, 1045, 1044, 3467]",0
2,"[1045, 3984, 1996, 5785, 20315, 2094, 3372, 2147]",0
3,"[2057, 2099, 2010, 4455, 1012, 2055, 2000, 2681]",0
4,"[2008, 2442, 2031, 2042, 12459, 1012, 1012, 1012]",1
...,...,...
1322467,"[3145, 1029, 1029, 1029, 1029, 1029, 1029, 102...",1
1322468,"[1029, 1029, 1029, 1029, 1029, 1029, 1029, 102...",1
1322469,"[2023, 8327, 22753, 2003, 2524, 999, 999, 999,...",0
1322470,"[2420, 6229, 2047, 2327, 6718, 1029, 1029, 102...",1


In [27]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [29]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([18414,  2102, 12849,  4747,  7916,  1045,  2066,  2009],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [30]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [31]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[18414,  2102, 12849,  4747,  7916,  1045,  2066,  2009],
        [ 2003,  2067,  3773, 13736,  4633,  1045,  1044,  3467],
        [ 1045,  3984,  1996,  5785, 20315,  2094,  3372,  2147],
        [ 2057,  2099,  2010,  4455,  1012,  2055,  2000,  2681],
        [ 2008,  2442,  2031,  2042, 12459,  1012,  1012,  1012],
        [ 3407,  5798,  2017,  4689, 23325,  2075,  7966,   999],
        [ 2125,  2000,  2147,  2000,  3154,  2041,  2026,  4624],
        [ 8840,  6371,  4564,  8589,  2015,  2006,  8692,   999],
        [ 3407,  5798, 17137,  4299,  1057,  2035,  1996,  2190],
        [ 1045,  1005,  1049,  5506, 24471,  2025,  2041,  3892],
        [ 1045,  2228, 10474,  3084,  2033,  5637,  2044,  2601],
        [ 2034,  2305,  1997, 10474,  1012,  1012,  4283,  8201],
        [ 3407,  1045,  2288,  2026,  4274,  1998,  5830,  2067],
        [ 2987,  1005,  1056,  2614,  2066,  4569,  2204,  6735],
        [ 2003,  3666,  7143

In [32]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Stage 3: Model building

In [33]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

In [44]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 1

In [45]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [46]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [47]:
checkpoint_path = "./content/drive/MyDrive/tweets/ckpt"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [48]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [49]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

  36403/Unknown - 2370s 65ms/step - loss: 0.4281 - accuracy: 0.8031

KeyboardInterrupt: ignored

# Stage 5: Evaluation

In [50]:
results = Dcnn.evaluate(test_dataset)
print(results)

[0.3727840185165405, 0.8396508693695068]


In [51]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Ouput of the model: {}\nPredicted sentiment: negative.".format(
            output))
    elif sentiment == 1:
        print("Ouput of the model: {}\nPredicted sentiment: positive.".format(
            output))

In [57]:
get_prediction("this the way that how things are getting done. but need not to the same way bad boys.")

Ouput of the model: [[0.29782742]]
Predicted sentiment: negative.
