In [None]:
!pip install transformers tokenizers datasets -q

In [None]:
import numpy as np
import random
import os
import tensorflow as tf
from transformers import set_seed

# Seed value handed to seed_everything() below.
rand_seed = 9

def seed_everything(seed=0):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: integer applied to Python's ``random``, NumPy, TensorFlow/Keras
            and the Hugging Face ``set_seed`` helper, and exported as
            ``PYTHONHASHSEED``.
    """
    # NOTE(review): exporting PYTHONHASHSEED from inside a running interpreter
    # presumably only reaches processes started afterwards — confirm if hash
    # determinism of this kernel matters.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,                      # Python stdlib RNG
        np.random.seed,                   # NumPy global RNG
        tf.keras.utils.set_random_seed,   # Keras helper
        tf.random.set_seed,               # TensorFlow global seed
        set_seed,                         # Hugging Face transformers
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed once, up front, before the data split and model construction in later cells.
seed_everything(rand_seed)

In [None]:
import datasets

# Load the tweet dataset, rename its columns to the `label` / `text` names used
# by the rest of the notebook, shuffle with a fixed seed, then hold out 20% of
# the `train` split as the test set.
data = (
    datasets.load_dataset("raygx/NepCov19TweetsPlus")
    .rename_columns({"Sentiment": "label", "Sentences": "text"})
    .shuffle(999)['train']
    .train_test_split(test_size=0.2)
)
data


In [None]:
def LabelEncoding(x):
    """Re-encode a row's label so that -1 becomes 2; other labels are untouched.

    Args:
        x: mapping with a ``'label'`` entry. It is modified in place.

    Returns:
        The same mapping object that was passed in.
    """
    # Guard clause: anything other than -1 is already a valid class id.
    if x['label'] != -1:
        return x
    x['label'] = 2
    return x

# Apply the label re-encoding to every row, using two worker processes.
data = data.map(LabelEncoding, num_proc=2)

# Preview the first five rows of the training split after re-encoding.
data['train'][:5]

In [None]:
from transformers import TFBertForSequenceClassification, BertTokenizerFast

# Checkpoint used for both the tokenizer and the model weights.
# alternative: 'Shushant/nepaliBERT'
model_id = 'raygx/BertClassifier4NepaliNews'

tokenizer = BertTokenizerFast.from_pretrained(model_id)
tokenizer

# The label ids here line up with LabelEncoding, which rewrites -1 to 2.
model = TFBertForSequenceClassification.from_pretrained(
    model_id,
    id2label={0: "NEUTRAL", 1: "POSITIVE", 2: "NEGATIVE"},
    label2id={"NEUTRAL": 0, "POSITIVE": 1, "NEGATIVE": 2},
    ignore_mismatched_sizes=True,
    # from_pt=True,
)

model.summary()

### Code For Using BERT as Embedding Layer

In [None]:
import tensorflow as tf

class BERTEmbedding(tf.keras.layers.Layer):
  """Keras layer that maps raw text to a feature tensor via a tokenizer and a model.

  Args:
    model: callable applied to the tokenizer output; element ``[1]`` of its
      result is what ``call`` returns (presumably the pooled sentence
      representation — TODO confirm against the wrapped model).
    tokenizer: callable that encodes text; invoked with ``return_tensors='tf'``.
    max_token_len: forwarded to the tokenizer as ``max_length``.
    padding: forwarded to the tokenizer as ``padding``.
    truncation: forwarded to the tokenizer as ``truncation``.
  """

  def __init__(self, model,tokenizer, max_token_len=128,padding='max_length',truncation=True):
    super().__init__()
    self.embedding = model
    self.tokenizer = tokenizer
    self.max_token = max_token_len
    self.padding = padding
    self.truncation = truncation
    # The layer is used as a fixed feature extractor, so mark it non-trainable.
    self.trainable = False

  def call(self, x):
    """Tokenize ``x`` and return element 1 of the wrapped model's output."""
    encoded = self.tokenizer(
        x,
        padding=self.padding,
        truncation=self.truncation,
        max_length=self.max_token,
        return_tensors='tf',
    )
    outputs = self.embedding(encoded)
    return outputs[1]

In [None]:
# Embedding layer built on the classifier's BERT encoder; with the layer's
# default padding/truncation settings every input is encoded to 100 tokens.
# Example usage (kept commented out):
#   embd(["", 'बैंक तथा वित्तीय संस्थामा देखिएको विश्वासको', "abcd"])
embd = BERTEmbedding(model=model.bert, tokenizer=tokenizer, max_token_len=100)

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields ``(features, labels)`` batches.

    Each batch selects rows from ``data``, passes their ``'text'`` column
    through an embedder and returns the ``'label'`` column as a NumPy array.
    Features are recomputed on every access (nothing is cached).

    Args:
        data: dataset exposing ``num_rows``, ``select(indices)`` and
            ``'text'`` / ``'label'`` column access.
        batch_size: maximum rows per batch; the final batch may be smaller.
        shuffle: when truthy, the row order is reshuffled at construction and
            again at the end of every epoch.
        embedder: optional callable mapping a list of texts to features. When
            omitted, the notebook-level ``embd`` layer is looked up at batch time.
        **kwargs: forwarded unchanged to the Keras base-class constructor.

    Raises:
        ValueError: if ``batch_size`` is not a positive number.
    """

    def __init__(self, data, batch_size=32, shuffle=True, embedder=None, **kwargs):
        # Run the base constructor: newer Keras versions expect it, and with no
        # extra kwargs it is a harmless no-op on older ones.
        super().__init__(**kwargs)
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.batch_size = batch_size
        self.data = data
        self.shuffle = shuffle
        self.embedder = embedder
        # Build the initial index order (shuffled if requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last partial batch included)."""
        return int(np.ceil(self.data.num_rows / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair."""
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(batch_indexes)

    def on_epoch_end(self):
        """Reset the row order, reshuffling it when ``shuffle`` is enabled."""
        self.indexes = np.arange(self.data.num_rows)
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Embed the texts and collect the labels for the given row indexes."""
        # Fall back to the global `embd` lazily so existing callers that never
        # pass an embedder keep working exactly as before.
        embed = self.embedder if self.embedder is not None else embd
        chunk = self.data.select(indexes)
        X = embed(chunk['text'])
        y = np.array(chunk['label'])
        return X, y

In [None]:
%%time

# Batch generators over the two splits. The generator defaults are spelled out:
# 32 rows per batch, row order reshuffled every epoch.
train_features = DataGenerator(data['train'], batch_size=32, shuffle=True)
test_features = DataGenerator(data['test'], batch_size=32, shuffle=True)

## Model Specification

In [None]:
# Small classification head trained on the fixed-size embedding produced by
# `embd`: one hidden ReLU layer followed by a 3-way softmax (one unit per label).
sa_model = tf.keras.models.Sequential([
    # The shape must be a tuple: `(768)` is just the int 768, whereas `(768,)`
    # is the 1-tuple Keras expects for a 768-dimensional input vector.
    tf.keras.layers.InputLayer(input_shape=(768,)),
    tf.keras.layers.Dense(768, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax'),
])

In [None]:
# Adam with weight decay; the sparse cross-entropy loss takes integer class ids.
# The metric is registered as 'acc', so its validation counterpart is reported
# as 'val_acc' (the quantity monitored by early stopping during training).
sa_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, weight_decay=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

sa_model.summary()

In [None]:
# Train for at most 30 epochs. Stop early once validation accuracy has failed
# to improve for 3 consecutive epochs, then roll back to the best weights seen.
history = sa_model.fit(
    train_features,
    epochs=30,
    validation_data=test_features,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_acc',
            mode='max',
            patience=3,
            restore_best_weights=True,
            verbose=1,
        ),
    ],
)

## Evaluation Section

In [None]:
%%time
from transformers import pipeline, TextClassificationPipeline

print("Getting Test Prediction")
# Evaluate the classifier that was actually trained above (`sa_model`) on the
# held-out split. The previous code referenced an undefined `tokenized_inputs`
# and scored the un-trained `model` head instead.
# shuffle=False keeps batches in row order so predictions align with the labels.
eval_features = DataGenerator(data['test'], shuffle=False)
pred_labels = np.argmax(sa_model.predict(eval_features), axis=-1)

# The label column is called 'label' (renamed when the dataset was loaded),
# not 'labels'.
actual_labels = np.array(data['test']['label'])

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Class-frequency-weighted F1 / precision / recall, followed by plain accuracy.
weighted_metrics = (
    ("F1-Score", f1_score),
    ("Precision-Score", precision_score),
    ("Recall-Score", recall_score),
)
for metric_name, metric_fn in weighted_metrics:
    print(metric_name, metric_fn(actual_labels, pred_labels, average='weighted'))
print("accuracy_Score", accuracy_score(actual_labels, pred_labels))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 3x3 confusion matrix over the class ids; rows are true labels and columns are
# predicted labels (tf.math.confusion_matrix convention).
conf_matrix = tf.math.confusion_matrix(actual_labels, pred_labels, num_classes=3).numpy()
cmd = ConfusionMatrixDisplay(conf_matrix)
cmd.plot()