In [1]:
!pip install bert-for-tf2
!pip install sentencepiece

## Imports

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
import pandas as pd
from collections import namedtuple
from sklearn import preprocessing
from bert import bert_tokenization
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)

TensorFlow Version: 2.5.0
Hub version:  0.12.0


## Loading Dataset

In [3]:
df_train = pd.read_excel('/content/drive/MyDrive/Datasets/Tamil__hasoc_train.xlsx',names=["ID","Tweets","Labels"])
df_train.dropna(inplace=True)
df_train.reset_index(drop=True, inplace=True)

df_val = pd.read_csv("/content/drive/MyDrive/Datasets/Tamil_hasoc_dev.tsv", sep="\t",names=["ID","Tweets","Labels"])
df_val.dropna(inplace=True)
df_val.reset_index(drop=True, inplace=True)

In [10]:
df_train.Tweets =  df_train.Tweets.apply(preprocessing)
df_val.Tweets =  df_val.Tweets.apply(preprocessing)
# df_test.Tweets =  df_test.Tweets.apply(preprocessing)

## Mapping the labels correctly 

In [12]:
df_train.Labels = df_train.Labels.map({'not': 'NOT', 'OFf': 'OFF','NOT': 'NOT', 'OFF': 'OFF'})

## Label Encoding

In [15]:
unique_labels = list(np.unique(df_train["Labels"]))

train_x = df_train["Tweets"].values
train_y = df_train["Labels"].values

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

train_y = le.fit_transform(train_y)
train_y = tf.keras.utils.to_categorical(train_y, num_classes=len(unique_labels), dtype='float32')

val_x = df_val["Tweets"].values
val_y = df_val["Labels"].values

val_y = le.fit_transform(val_y)
val_y = tf.keras.utils.to_categorical(val_y, num_classes=len(unique_labels), dtype='float32')


print("number of unique labels", len(unique_labels))

number of unique labels 2


## Helper Functions

In [16]:
def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

# Function to create attention masks
def get_masks(tokens, max_seq_length):
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

# Function to create segment ids
def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

# Function to create input_ids, attention_masks, segment_ids for sample
def create_single_input(sentence,MAX_LEN, MAX_SEQ_LEN):
  
  stokens = tokenizer.tokenize(sentence)
  
  stokens = stokens[:MAX_LEN]
  
  stokens = ["[CLS]"] + stokens + ["[SEP]"]
 
  ids = get_ids(stokens, tokenizer, MAX_SEQ_LEN)
  masks = get_masks(stokens, MAX_SEQ_LEN)
  segments = get_segments(stokens, MAX_SEQ_LEN)

  return ids,masks,segments

def create_input_array(sentences, MAX_SEQ_LEN):

  input_ids, input_masks, input_segments = [], [], []

  for sentence in tqdm(sentences,position=0, leave=True):
  
    ids,masks,segments=create_single_input(sentence,MAX_SEQ_LEN-2, MAX_SEQ_LEN)

    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

  return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]

## Downloading the MuRIL model from TFHub

In [17]:
muril_layer = hub.KerasLayer("https://tfhub.dev/google/MuRIL/1", trainable=True)

# Create tokenizer
vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [18]:
max_seq_len = 120
train_x = create_input_array(train_x, max_seq_len)
val_x = create_input_array(val_x, max_seq_len)

100%|██████████| 3999/3999 [00:01<00:00, 2272.06it/s]
100%|██████████| 940/940 [00:00<00:00, 1556.90it/s]


## Defining the F1 metric

In [19]:
from keras import backend as K
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### downloading model

In [21]:
!pip install tf-models-official



## Defining the model

In [22]:
input_word_ids = tf.keras.layers.Input(shape=(120,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(120,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(120,), dtype=tf.int32,
                                    name="segment_ids")
  
outputs = muril_layer(dict(input_word_ids = input_word_ids, input_mask = input_mask, input_type_ids = segment_ids))
x = tf.keras.layers.Dropout(0.2)(outputs["pooled_output"]) # take pooled output layer
final_output = tf.keras.layers.Dense(2, activation="sigmoid", name="dense_output")(x)

model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=final_output)

    
#   optimizer = 
model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  metrics=['accuracy',f1_m])




In [23]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_mask (InputLayer)         [(None, 120)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 120)]        0                                            
__________________________________________________________________________________________________
input_word_ids (InputLayer)     [(None, 120)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'encoder_outputs':  237556225   input_mask[0][0]                 
                                                                 segment_ids[0][0]            

## Making checkpoint

In [28]:
metric = 'val_f1_m'
model_save_path='....'
import keras
callbacks = [keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor=metric,mode='max',save_best_only=True)]

In [25]:
num_epochs = 15

# Get the model object
history = model.fit(train_x, train_y, epochs = num_epochs, batch_size = 50, validation_data = (val_x, val_y),callbacks=callbacks)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [29]:
from sklearn.metrics import classification_report
model2.load_weights(model_save_path)
preds = model.predict(val_x)>0.5
print(classification_report(val_y, preds))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91       465
           1       0.92      0.91      0.92       475

   micro avg       0.91      0.91      0.91       940
   macro avg       0.91      0.91      0.91       940
weighted avg       0.91      0.91      0.91       940
 samples avg       0.91      0.91      0.91       940



In [27]:
# model2 = tf.keras.models.load_model('/tahsoc91', custom_objects={'f1_m':f1_m})