# Stage 1: Importing dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[?25l[K     |████████                        | 10 kB 23.0 MB/s eta 0:00:01[K     |████████████████                | 20 kB 26.3 MB/s eta 0:00:01[K     |███████████████████████▉        | 30 kB 25.5 MB/s eta 0:00:01[K     |███████████████████████████████▉| 40 kB 18.8 MB/s eta 0:00:01[K     |████████████████████████████████| 41 kB 133 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30534 sha256=71289f262db34a627ed98e6600688e66422d92a817babd9dfbff01fe66f42688
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Buil

In [3]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [4]:
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "/content/drive/MyDrive/trainingandtestdata/training.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [7]:
data.head(10)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [8]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

## Preprocessing

### Cleaning

In [9]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Delete the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Delete URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Just keep letters and important punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Remove additional spaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [10]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [12]:
data_clean


[" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!",
 ' I dived many times for the ball. Managed to save The rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 " no it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. ",
 ' not the whole crew ',
 'Need a hug ',
 " hey long time no see! Yes.. Rains a bit only a bit LOL I'm fine thanks how's you ?",
 " K nope they didn't have it ",
 ' que me muera ? ',
 "spring break in plain city... it's snowing ",
 'I just re pierced my ears ',
 " I couldn't bear to watch it. And I thought the UA loss was embarrassing . . . . .",
 ' It it counts idk why I did either. you never talk to me anymore ',
 " i would've been the first but i didn't have a gun. not really though zac snyder's just a doucheclown.",
 ' I wish I got to watch it with you!! I miss you

In [13]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [14]:
data.sentiment.unique()

array([0, 4])

In [15]:
data.sentiment.values

array([0, 0, 0, ..., 4, 4, 4])

In [19]:
data[data['sentiment']==4]

Unnamed: 0,sentiment,text
800000,4,I LOVE @Health4UandPets u guys r the best!!
800001,4,im meeting up with one of my besties tonight! ...
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,4,Being sick can be really cheap when it hurts t...
800004,4,@LovesBrooklyn2 he has that effect on everyone
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


0 means negative feeling
4 means positive feeling
we wil replace 4 with 1 to make more precide.

In [20]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [21]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [22]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [23]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

In [24]:
data_inputs

[[22091,
  2860,
  2860,
  2008,
  1005,
  1055,
  1037,
  26352,
  5017,
  1012,
  2017,
  2323,
  2050,
  2288,
  2585,
  12385,
  1997,
  2353,
  2154,
  2000,
  2079,
  2009,
  1012,
  1040],
 [2003,
  6314,
  2008,
  2002,
  2064,
  1005,
  1056,
  10651,
  2010,
  9130,
  2011,
  3793,
  2075,
  2009,
  1012,
  1012,
  1012,
  1998,
  2453,
  5390,
  2004,
  1037,
  2765,
  2082,
  2651,
  2036,
  1012,
  27984,
  999],
 [1045,
  11529,
  2094,
  2116,
  2335,
  2005,
  1996,
  3608,
  1012,
  3266,
  2000,
  3828,
  1996,
  2717,
  2175,
  2041,
  1997,
  19202],
 [2026, 2878, 2303, 5683, 2009, 11714, 1998, 2066, 2049, 2006, 2543],
 [2053,
  2009,
  1005,
  1055,
  2025,
  2022,
  3270,
  6455,
  2012,
  2035,
  1012,
  1045,
  1005,
  1049,
  5506,
  1012,
  2339,
  2572,
  1045,
  2182,
  1029,
  2138,
  1045,
  2064,
  1005,
  1056,
  2156,
  2017,
  2035,
  2058,
  2045,
  1012],
 [2025, 1996, 2878, 3626],
 [2342, 1037, 8549],
 [4931,
  2146,
  2051,
  2053,
  2156,
  999,
 

### Dataset creation

We will create padded batches (so we pad sentences for each batch inpedendently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [25]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]

In [26]:
# A list is a type of iterator so it can be used as generator for a dataset
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [27]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([ 2183,  2000,  1996,  3509,  2007,  2026,  2502, 24761],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [28]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [29]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2183,  2000,  1996,  3509,  2007,  2026,  2502, 24761],
        [ 5292,  3270,  3461,  2912,  2100,   999,   999,   999],
        [ 2008,  2502,  2567,  2097,  2022,  1996,  2197,  1029],
        [ 4394, 21146,  2480,  1012,  1012,  1012,  1012,  1012],
        [ 2008,  2003,  1999, 12476,  2933,  2031,  4569,   999],
        [ 2893,  3201,  2000,  2681,  2005,  9880,  3218,  1012],
        [ 1045,  2215,  2000,  2022, 20934, 16515, 13749,   999],
        [ 2633,  2772,  2039,  2000, 10474,   999,  7592,  3071],
        [ 1056, 28394, 21246,  2594,  2145,  2025,  2551,  2295],
        [ 2017,  4364,  2024, 24665,  4215,  6692,  7629,  1012],
        [ 2003,  2055,  2000,  2175,  2131,  2014,  2606, 12690],
        [ 1057,  5603,  1012,  1012,  1012, 18650,  2003,  2067],
        [ 1056, 28394,  3436,  2013,  2047,  3042,  1012,  4658],
        [ 2009,  2069, 13403,  2043,  2017,  4756,  8840,  2140],
        [ 2065,  2069,  2026

In [30]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Stage 3: Model building

In [31]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # (batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # (batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # (batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

In [32]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [33]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [34]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [35]:
checkpoint_path = "./drive/MyDrive/projects/BERT/ckpt_bert_tok/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest Checkpoint restored!")

In [36]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [37]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
  37196/Unknown - 769s 20ms/step - loss: 0.4289 - accuracy: 0.8025Checkpoint saved at ./drive/MyDrive/projects/BERT/ckpt_bert_tok/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f81733f1c90>

# Stage 5: Evaluation

In [38]:
results = Dcnn.evaluate(test_dataset)
print(results)

[0.41142186522483826, 0.8322165608406067]


In [39]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative.".format(
            output))
    elif sentiment == 1:
        print("Output of the model: {}\nPredicted sentiment: positive.".format(
            output))

In [40]:
get_prediction("This movie was pretty interesting.")

Output of the model: [[0.85946935]]
Predicted sentiment: positive.


In [41]:
get_prediction("I'd rather not do that again.")

Output of the model: [[0.24145938]]
Predicted sentiment: negative.
