# Importing Dependencies

In [None]:
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import math
import random 

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30531 sha256=fbc689295861933297608216e690f488dd1f20feee45eb9dcc32d7f5cde50576
  Stored in directory: /root/.cache/pip/wheels/6f/c7/91/f2b2c2b3cec30578c5de7c27ac996

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers 
import bert

# Loading Files

In [None]:
# The CSV has no header row, so the column names are supplied explicitly.
cols = ['sentiment', 'id', 'data', 'query', 'user', 'text']
df = pd.read_csv(
    "/content/drive/MyDrive/Datasets/train.csv",
    names=cols,
    header=None,
    encoding="latin1",
    engine="python",
)

In [None]:
# Preview the first three rows to check that the columns were parsed as expected.
df.head(3)

Unnamed: 0,sentiment,id,data,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [None]:
# Keep only the label and the tweet text; the other columns are not used by
# the cells shown in this notebook. Rebinding the result (instead of
# inplace=True) leaves the originally loaded frame object unmodified.
df = df.drop(columns=['id', 'data', 'query', 'user'])

In [None]:
# Preview again: only the 'sentiment' and 'text' columns should remain.
df.head(3)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...


# Cleaning

In [None]:
def clean_tweet(tweet):
  """Strip markup, mentions, URLs and non-letter noise from a raw tweet.

  Args:
    tweet: raw tweet text (str).

  Returns:
    The cleaned text: markup removed via BeautifulSoup, @mentions and URLs
    replaced by a space, every character other than ASCII letters and
    . ! ? ' replaced by a space, and runs of spaces collapsed to one.
  """
  tweet = BeautifulSoup(tweet, 'lxml').get_text()
  # Handles may contain underscores (e.g. @_TheSpecialOne_); without "_" in
  # the class the mention is not matched and the user name leaks into the text.
  tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
  # Consume the URL up to the next whitespace so that characters such as
  # "-", "_", "?" or "=" do not cut the match short and leave URL fragments.
  tweet = re.sub(r"https?://\S+", ' ', tweet)
  # Keep letters and basic sentence punctuation only.
  tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
  # Collapse the runs of spaces introduced by the substitutions above.
  tweet = re.sub(r" +", ' ', tweet)
  return tweet

In [None]:
# Clean every tweet; the result is a plain Python list of strings (not a DataFrame).
df_clean = list(map(clean_tweet, df.text))

  tweet = BeautifulSoup(tweet, 'lxml').get_text()


In [None]:
# Pull the label column out as a NumPy array.
data_labels = df["sentiment"].values

In [None]:
# The dataset encodes the two classes as 0 and 4; remap 4 -> 1 so the labels
# are binary (0/1). np.where builds a new array instead of writing through the
# buffer obtained from the DataFrame, so `df` is not silently modified, the
# cell is safe to re-run, and it also works when that buffer is read-only.
data_labels = np.where(data_labels == 4, 1, data_labels)

# Tokenization

In [None]:
# Load the uncased BERT-base layer from TF Hub with its weights frozen; below
# it is used to obtain the vocabulary file and the lower-casing flag.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

# Build the tokenizer from the vocabulary shipped with the hub module.
FullTokenizer = bert.bert_tokenization.FullTokenizer
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Sanity check: show how the tokenizer splits a sample sentence into tokens.
tokenizer.tokenize("My dog loves strawberries.")

['my', 'dog', 'loves', 'straw', '##berries', '.']

In [None]:
# Same sample sentence, with the tokens mapped to their vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog loves strawberries."))

[2026, 3899, 7459, 13137, 20968, 1012]

In [None]:
def encode_sentence(sent):
  """Tokenize `sent` with the module-level tokenizer and return its token ids."""
  pieces = tokenizer.tokenize(sent)
  return tokenizer.convert_tokens_to_ids(pieces)

In [None]:
# Encode every cleaned tweet as a list of token ids.
data_inputs = list(map(encode_sentence, df_clean))

# Dataset creation

In [None]:
# One [token_ids, label, token_count] record per encoded tweet.
data_with_len = [
    [token_ids, data_labels[position], len(token_ids)]
    for position, token_ids in enumerate(data_inputs)
]

# Shuffle first, then sort by token count. list.sort is stable, so tweets of
# equal length keep their shuffled relative order.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda record: record[2])

# Drop the length field and keep only tweets with more than 7 tokens.
sorted_all = [
    (record[0], record[1])
    for record in data_with_len
    if record[2] > 7
]

In [None]:
# Wrap the (token_ids, label) pairs in a tf.data pipeline. A generator is used
# because the token-id sequences have different lengths.
all_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_all,
    output_types=(tf.int32, tf.int32),
)

In [None]:
# Inspect the first (token_ids, label) element produced by the dataset.
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([2008, 4165, 1012, 1012, 1012, 2200, 2137, 1012], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [None]:
BATCH_SIZE = 32
# Pad every token-id sequence to the longest one in its batch; the labels are
# scalars and need no padding.
all_batched = all_dataset.padded_batch(
    BATCH_SIZE,
    padded_shapes=((None,), ()),
)

In [None]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10  # hold out 10% of the batches for testing

# Dataset.shuffle returns a new dataset, so its result must be kept (the
# previous bare call was a no-op). reshuffle_each_iteration=False fixes the
# order across iterations; otherwise take/skip would cut a different split on
# every epoch and test batches would leak into training.
shuffled_batches = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration=False)

# First NB_BATCHES_TEST batches form the test set, the rest the training set.
# (Previously both splits used take(), making them identical.)
test_dataset = shuffled_batches.take(NB_BATCHES_TEST)
train_dataset = shuffled_batches.skip(NB_BATCHES_TEST)

# Model Building

In [None]:
class DCNN(tf.keras.Model):
  """CNN text classifier with parallel convolutions over token embeddings.

  Token ids are embedded, passed through three Conv1D branches with kernel
  sizes 2, 3 and 4, each globally max-pooled, concatenated, and fed through a
  dense layer, dropout and a final classification layer.

  Args:
    vocab_size: number of rows in the embedding table.
    emb_dim: embedding dimension.
    nb_filters: number of filters in each convolution branch.
    FFN_units: units of the hidden dense layer.
    nb_classes: number of classes; 2 yields a single sigmoid unit, any other
      value a softmax over `nb_classes` units.
    dropout_rate: rate of the dropout applied after the hidden dense layer.
    training: unused; kept so existing callers that pass it keep working.
    name: Keras model name.
  """

  def __init__(self,
               vocab_size,
               emb_dim=128,
               nb_filters=50,
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=False,
               name="dcnn"):
    super(DCNN, self).__init__(name=name)

    self.embedding = layers.Embedding(vocab_size, emb_dim)

    # Attribute names (including the "biagram" spelling) are left unchanged so
    # that anything referring to them by name keeps matching.
    self.biagram = layers.Conv1D(filters=nb_filters,
                                 kernel_size=2,
                                 padding="valid",
                                 activation="relu")

    self.trigram = layers.Conv1D(filters=nb_filters,
                                 kernel_size=3,
                                 padding="valid",
                                 activation="relu")

    self.fourgram = layers.Conv1D(filters=nb_filters,
                                  kernel_size=4,
                                  padding="valid",
                                  activation="relu")

    # A single pooling layer is shared by the three branches.
    self.pool = layers.GlobalMaxPooling1D()

    self.dense_1 = layers.Dense(units=FFN_units,
                                activation="relu")

    self.dropout = layers.Dropout(rate=dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1, activation="sigmoid")
    else:
      self.last_dense = layers.Dense(units=nb_classes, activation="softmax")

  # `call` must be a method of the class. It was previously indented inside
  # __init__, so Keras found no forward pass and fit() raised
  # NotImplementedError.
  def call(self, inputs, training=None):
    """Run the forward pass.

    Args:
      inputs: batch of padded token-id sequences.
      training: forwarded to the dropout layer to toggle dropout.

    Returns:
      The output of the final dense layer (sigmoid or softmax activations).
    """
    x = self.embedding(inputs)

    x_1 = self.biagram(x)
    x_1 = self.pool(x_1)

    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)

    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis=1)  # (batch_size, 3*nb_filters)
    merged = self.dense_1(merged)
    # Fixed attribute name: this was misspelled as `self.droput`.
    merged = self.dropout(merged, training=training)
    output = self.last_dense(merged)

    return output


  


# Model Training 

In [None]:
# Hyperparameters passed to the DCNN constructor and to fit() below.
VOCAB_SIZE = len(tokenizer.vocab)  # embedding table size = tokenizer vocabulary size
EMB_DIM = 200      # embedding dimension
NB_FILTERS = 100   # filters per convolution branch
FFN_UNITS = 256    # units of the hidden dense layer
NB_CLASSES = 2     # 2 selects the sigmoid output and the binary loss

DROPOUT_RATE = 0.2 
NB_EPOCHS  = 5     # number of training epochs

In [None]:
# Build the classifier from the hyperparameters configured above.
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
# Pick the loss/metric pair that matches the output layer built by DCNN:
# a single sigmoid unit for two classes, a softmax otherwise.
if NB_CLASSES == 2:
  loss_name = "binary_crossentropy"
  metric_name = "accuracy"
else:
  loss_name = "sparse_categorical_crossentropy"
  # Fixed metric name: this was misspelled as "sprase_categorical_accuracy",
  # which is not a metric name Keras recognizes.
  metric_name = "sparse_categorical_accuracy"

Dcnn.compile(loss=loss_name,
             optimizer="adam",
             metrics=[metric_name])

In [None]:
# A tf.train.Checkpoint is used to save the model's state and to restore it
# later. Only the model is tracked here; no optimizer is passed.
# NOTE(review): checkpoint_path is the same Drive folder the training CSV is
# read from — consider a dedicated checkpoint directory.

checkpoint_path = "/content/drive/MyDrive/Datasets"
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

In [None]:
# Manage checkpoints under checkpoint_path, keeping only the most recent one.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

In [None]:
# Resume from the most recent checkpoint when one exists.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
  ckpt.restore(latest_ckpt)
  print("Latest checkpoint has been resotred")

In [None]:
class MyCustomCallBack(tf.keras.callbacks.Callback):
  """Keras callback that saves a checkpoint at the end of every epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Save through the module-level ckpt_manager and report the target path."""
    ckpt_manager.save()
    message = "checkpoint saved at {}.".format(checkpoint_path)
    print(message)

In [None]:
# Train on the training split; the callback saves a checkpoint after each epoch.
Dcnn.fit(
    train_dataset,
    callbacks=[MyCustomCallBack()],
    epochs=NB_EPOCHS,
)

Epoch 1/5


NotImplementedError: ignored

# Evaluation 

In [None]:
# Evaluate the trained model on the held-out test batches.
# Fixed name: the model variable is `Dcnn`; `Dccn` raised a NameError.
results = Dcnn.evaluate(test_dataset)
print(results)

In [None]:
def get_prediction(sentence):
  """Print the model's sentiment prediction for a single sentence.

  Intended for the binary configuration, where the model ends in a single
  sigmoid unit.

  Args:
    sentence: text to classify; it is tokenized with encode_sentence and fed
      to the model as a batch of one.
  """
  tokens = encode_sentence(sentence)
  inputs = tf.expand_dims(tokens, 0)

  output = Dcnn(inputs, training=False)

  # With a batch of one and a single output unit, the score is the only
  # element of the (1, 1) output tensor; extract it explicitly as a float.
  score = float(output[0, 0])

  # Threshold at 0.5. The previous floor(output * 2) produced the same classes
  # except for a saturated score of exactly 1.0, which mapped to 2 and made
  # the function print nothing.
  sentiment = 1 if score >= 0.5 else 0

  if sentiment == 0:
    print("output of the model: {}\n Predicted sentiment: negative".format(output))
  else:
    print("output of the model: {}\n Predicted sentiment: positive".format(output))
