#### About
Sentiment analysis of financial news sentences using the BERT tokenizer (for tokenization only) and a 1D-CNN classifier.

Dataset - https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news

In [1]:
#necessary imports
import math
import os
import random
import re

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
#!pip install bert-for-tf2
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

In [13]:
# Path to the financial-news sentiment CSV (all-data.csv from the Kaggle dataset).
# Set the SENTIMENT_DATA_PATH environment variable to point at your own copy;
# the original author's local path is kept only as a backward-compatible default.
dataset_path = os.environ.get(
    "SENTIMENT_DATA_PATH",
    "/home/suraj/ClickUp/Jan-Feb/data/all-data.csv",
)

In [14]:
# The CSV is read without a header row, so the two column names are supplied here.
cols = ["Sentiment", "Text"]
data = pd.read_csv(
    dataset_path,
    names=cols,
    header=None,
    encoding='latin1',
)
data

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [16]:
# Remove rows that are exact duplicates of an earlier row, then display the result.
data = data.drop_duplicates()
data

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [17]:
# Text cleaning
def clean_text(text):
    """Strip markup and noise from one sentence and return the cleaned string.

    Steps, in order: drop HTML markup via BeautifulSoup, blank out @mentions,
    blank out http/https URLs, replace every character that is not an ASCII
    letter or one of . ! ? ' with a space, and collapse runs of spaces.
    """
    cleaned = BeautifulSoup(text, "lxml").get_text()
    # (pattern, replacement) pairs applied sequentially; order matters because
    # the final rule collapses the spaces introduced by the earlier ones.
    substitutions = (
        (r"@[A-Za-z0-9]+", " "),            # @mentions
        (r"https?://[A-Za-z0-9./]+", ' '),  # URLs; the "s" in https is optional
        (r"[^a-zA-Z.!?']", " "),            # anything but letters and . ! ? '
        (r" +", " "),                       # repeated spaces
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [23]:
# Clean every sentence in the Text column, keeping row order, and show the first one.
cleaned_text = list(map(clean_text, data.Text))
print(cleaned_text[0])



According to Gran the company has no plans to move all production to Russia although that is where the company is growing .


In [39]:
# Distinct sentiment labels present in the data (np.unique returns them sorted).
sentiment_values = data.Sentiment.values.tolist()
unique_sentiment = np.unique(sentiment_values)
print(unique_sentiment)

['negative' 'neutral' 'positive']


In [73]:
# Label -> integer id, numbered in the order the labels appear in unique_sentiment.
emotion_mapper = {label: idx for idx, label in enumerate(unique_sentiment)}
print(emotion_mapper)

# Integer id -> label: the inverse lookup, used to turn a predicted id back into text.
reverse_mapper = {idx: label for label, idx in emotion_mapper.items()}
print(reverse_mapper)

{'negative': 0, 'neutral': 1, 'positive': 2}
{0: 'negative', 1: 'neutral', 2: 'positive'}


In [44]:
# Integer class id for every row, in the same order as the rows of `data`.
sentiments = [emotion_mapper[label] for label in data["Sentiment"]]
print(sentiments[0])

1


In [45]:
# BERT layer from TF Hub, loaded frozen. In the cells shown here it serves only
# as the source of the vocabulary file and the lower-casing flag for the tokenizer.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)
resolved_bert = bert_layer.resolved_object
vocab_file = resolved_bert.vocab_file.asset_path.numpy()
do_lower_case = resolved_bert.do_lower_case.numpy()

# Tokenization: build bert-for-tf2's FullTokenizer from that vocabulary.
bert_tokenizer = bert.bert_tokenization.FullTokenizer
tokenizer = bert_tokenizer(vocab_file, do_lower_case)

In [46]:
# Sentence encoding helper
def encode_sentences(sent):
    """Tokenize `sent` with the notebook-level `tokenizer` and return the list of vocabulary ids."""
    tokens = tokenizer.tokenize(sent)
    return tokenizer.convert_tokens_to_ids(tokens)

In [47]:
# Token-id list for every cleaned sentence, in the same order as `cleaned_text`.
tokenized_inputs = list(map(encode_sentences, cleaned_text))

In [48]:
# Prepare the data for length-bucketed padding (the equivalent of a collate step).
# Each record is [token_ids, label, seq_len]; tokenized_inputs and sentiments are
# both derived row-by-row from `data`, so they pair up positionally.
encoded_data_with_len = [
    [token_ids, label, len(token_ids)]
    for token_ids, label in zip(tokenized_inputs, sentiments)
]
random.shuffle(encoded_data_with_len)

In [49]:
# Inspect one shuffled record: [token_ids, label, seq_len].
encoded_data_with_len[0]

[[4082,
  3279,
  23596,
  7327,
  2099,
  1012,
  24098,
  4102,
  2000,
  1037,
  5618,
  1997,
  7327,
  2099,
  1012,
  24098,
  1999,
  1996,
  7978,
  2558,
  1999,
  1012],
 0,
 22]

In [50]:
# Order records by sequence length (in place) so neighbouring examples need little padding.
encoded_data_with_len.sort(key=lambda record: record[2])
# Keep only sentences longer than 5 tokens, and drop the length field.
sorted_data = [
    (token_ids, label)
    for token_ids, label, seq_len in encoded_data_with_len
    if seq_len > 5
]

In [51]:
# Shortest retained example as a (token_ids, label) pair.
sorted_data[0]

([3463, 2024, 3517, 2397, 1999, 1012], 1)

In [52]:
# Wrap the in-memory (token_ids, label) pairs as a tf.data pipeline.
# `output_signature` replaces the deprecated `output_types` argument and also
# declares the element shapes: a variable-length 1-D int32 vector of token ids
# and a scalar int32 label.
train_data = tf.data.Dataset.from_generator(
    lambda: sorted_data,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
)


In [53]:
# Sanity check: pull the first (token_ids, label) element out of the dataset.
next(iter(train_data))

(<tf.Tensor: shape=(6,), dtype=int32, numpy=array([3463, 2024, 3517, 2397, 1999, 1012], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [54]:
# NOTE: despite its name, `num_batches` is the number of examples per batch.
num_batches = 64
# Collate step: pad the token-id vectors in each batch to the longest one in
# that batch; the scalar labels need no padding.
batched_data = train_data.padded_batch(
    batch_size=num_batches,
    padded_shapes=((None,), ()),
)


In [58]:
# Show the first padded batch (token-id matrix and label vector).
print(next(iter(batched_data)))

(<tf.Tensor: shape=(64, 8), dtype=int32, numpy=
array([[ 3463,  2024,  3517,  2397,  1999,  1012,     0,     0],
       [ 2053,  3361,  4751,  2020,  2800,  1012,     0,     0],
       [ 4341,  1997,  5929,  2764,  2190,  1012,     0,     0],
       [ 2053,  3361,  4751,  2020,  3024,  1012,     0,     0],
       [ 2053,  3361,  6987,  2020,  2800,  1012,     0,     0],
       [ 6636,  3872,  3445,  2011,  3155,  1012,     0,     0],
       [ 3361,  4751,  2020,  2025, 21362,  1012,     0,     0],
       [ 2053, 20874,  4751,  2020, 21362,  1012,     0,     0],
       [ 2060,  4751,  2020,  2025,  3024,  1012,     0,     0],
       [ 2035,  2060,  5571,  2020,  7219,  1012,     0,     0],
       [ 7473,  2102,  2896,  2012,  1012,  1012,     0,     0],
       [ 1996,  7909,  3058,  2003,  2233,  1012,     0,     0],
       [ 2053,  3361,  4751,  2020,  2988,  1012,     0,     0],
       [ 1996,  2986,  6140,  2003,  2182,  1012,     0,     0],
       [ 9662,  7368,  2145,  4839,  2174,

In [60]:
# Create the train / validation datasets.
# `num_batches` is the batch size, so this is the total number of batches.
num_batches_train = math.ceil(len(sorted_data) / num_batches)
# Hold out one fifth of the batches for validation.
num_batches_val = num_batches_train // 5

# tf.data transformations return a NEW dataset rather than modifying in place,
# so the shuffled dataset must be kept and used for the split. Without it the
# batches stay ordered by sentence length and the validation set would contain
# only the shortest sentences.
# reshuffle_each_iteration=False (plus a fixed seed) freezes the batch order so
# that take() and skip() always see the same sequence and the two splits stay
# disjoint across epochs.
SPLIT_SEED = 42
shuffled_batches = batched_data.shuffle(
    num_batches_train,
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
)

val_dataset = shuffled_batches.take(num_batches_val)
train_dataset = shuffled_batches.skip(num_batches_val)

In [61]:
# Model definition as a subclassed Keras model (similar in spirit to a PyTorch module class).
class SentimentModel(tf.keras.Model):
    """CNN text classifier over token ids.

    Pipeline: embedding -> three parallel Conv1D branches (kernel sizes 2, 3, 4)
    -> global average pooling of each branch -> concatenation -> dense + dropout
    -> output layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_filters: Number of filters in each Conv1D branch.
        dense_dim: Units in the hidden dense layer.
        num_class: Number of target classes. 2 builds a single sigmoid unit;
            any other value builds a softmax over `num_class` units.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        training: Unused; kept only so existing constructor calls keep working.
    """

    def __init__(self,vocab_size,embedding_dim=256, num_filters=50,dense_dim=512, num_class=2, dropout_rate=0.2, training=False):
        super().__init__()
        self.embedding_layer = layers.Embedding(vocab_size, embedding_dim)

        # One convolution per n-gram width. padding="valid" means an input needs
        # at least as many tokens as the widest kernel (4) to be convolved.
        self.bigram = layers.Conv1D(filters=num_filters, kernel_size=2, padding="valid", activation="relu")
        self.trigram = layers.Conv1D(filters=num_filters, kernel_size=3, padding="valid", activation="relu")
        self.quadgram = layers.Conv1D(filters=num_filters, kernel_size=4, padding="valid", activation="relu")

        # Collapses each branch's sequence axis by averaging (this is average
        # pooling, not max pooling). The layer has no weights, so one instance
        # is shared by all three branches.
        self.pool = layers.GlobalAveragePooling1D()

        # Hidden dense layer followed by dropout.
        self.dense = layers.Dense(units=dense_dim, activation="relu")
        self.dropout = layers.Dropout(dropout_rate)

        # Output head: one sigmoid unit for binary problems, softmax otherwise.
        if num_class == 2:
            self.dense2 = layers.Dense(units=1, activation="sigmoid")
        else:
            self.dense2 = layers.Dense(units=num_class, activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Integer token ids, shape (batch, seq_len), as produced by
                the padded batches.
            training: Whether dropout is active. Defaults to None so Keras can
                supply the value from the calling context (fit / evaluate).

        Returns:
            Tensor of shape (batch, 1) for the sigmoid head, or
            (batch, num_class) class probabilities for the softmax head.
        """
        x = self.embedding_layer(inputs)

        # Each branch: convolve over the token axis, then average over it.
        x1 = self.pool(self.bigram(x))
        x2 = self.pool(self.trigram(x))
        x3 = self.pool(self.quadgram(x))

        concat_features = tf.concat([x1, x2, x3], axis=1)

        out = self.dense(concat_features)
        # Pass `training` by keyword so it cannot be mistaken for another argument.
        out = self.dropout(out, training=training)
        out = self.dense2(out)
        return out


In [63]:
# Hyperparameters for the sentiment model and the training run.
VOCAB_SIZE = len(tokenizer.vocab)  # number of entries in the tokenizer's vocabulary
EMBEDDING_DIM=256  # size of each token embedding
NUM_FILTERS=128  # filters per Conv1D branch
DENSE_UNITS=512  # intended width of the hidden dense layer
NUM_CLASSES=3  # negative / neutral / positive
DROPOUT_RATE = 0.2  # dropout rate after the hidden dense layer
NUM_EPOCHS=10  # epochs passed to model.fit

In [64]:
# Build the classifier from the hyperparameter constants. DENSE_UNITS is passed
# explicitly so the hidden dense width follows the config constant rather than
# the constructor's default.
model = SentimentModel(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    num_filters=NUM_FILTERS,
    dense_dim=DENSE_UNITS,
    num_class=NUM_CLASSES,
    dropout_rate=DROPOUT_RATE,
)


In [65]:
# Pick the loss and metric that match the output head: binary cross-entropy for
# the single sigmoid unit, sparse categorical cross-entropy for integer labels
# with a softmax head.
if NUM_CLASSES == 2:
    loss_name, metric_names = "binary_crossentropy", ["accuracy"]
else:
    loss_name, metric_names = "sparse_categorical_crossentropy", ["sparse_categorical_accuracy"]
model.compile(loss=loss_name, optimizer="adam", metrics=metric_names)

In [66]:
# Checkpointing: keep only the most recent checkpoint under ./bert_ckpt and,
# if one already exists, restore the model from it before training.
checkpoint = "./bert_ckpt"
ckpt = tf.train.Checkpoint(SentimentModel=model)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint,
    max_to_keep=1,
)
latest_ckpt_path = ckpt_manager.latest_checkpoint
if latest_ckpt_path:
    ckpt.restore(latest_ckpt_path)

In [67]:
# Callback
class custom_callback(tf.keras.callbacks.Callback):
    """Saves a checkpoint through the notebook-level `ckpt_manager` after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint and report the checkpoint directory.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict supplied by Keras (unused). Defaults to None,
                matching the Keras Callback signature, not the string "None".
        """
        ckpt_manager.save()
        print("Checkpoint saved at {}".format(checkpoint))

In [69]:
# Train on the training batches, checkpointing at the end of each epoch.
checkpoint_callback = custom_callback()
model.fit(
    x=train_dataset,
    epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/10
     61/Unknown - 10s 94ms/step - loss: 0.8676 - sparse_categorical_accuracy: 0.6003Checkpoint saved at ./bert_ckpt
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f437af27a90>

In [70]:
# Evaluate the trained model on the held-out validation batches; `results`
# keeps whatever model.evaluate returns for the compiled loss and metric.
results = model.evaluate(val_dataset)



In [80]:
# Prediction function
def predict(text):
    """Return the predicted sentiment label for a single raw text string.

    Args:
        text: Raw input sentence.

    Returns:
        The label from `reverse_mapper` whose class has the highest model output.
    """
    # Apply the same cleaning the training sentences went through, so the model
    # sees tokens from the same distribution at inference time.
    tokens = encode_sentences(clean_text(text))
    # Add a leading batch dimension: (seq_len,) -> (1, seq_len).
    inputs = tf.expand_dims(tokens, 0)
    output = model(inputs, training=False)
    # argmax over the class outputs; this assumes the softmax head, i.e. a model
    # built with num_class != 2.
    sentiment = int(np.argmax(output))
    return reverse_mapper[sentiment]

    

In [82]:
# Try the classifier on a clearly upbeat multi-sentence input.
predict("The financial market is blooming, We can expect good outcomes. Yayy !")

2


'positive'

In [83]:
# Try the classifier on a shorter variant of the same statement.
predict("The financial market is blooming !")

1


'neutral'