In [1]:
from transformers import BertTokenizer, BertConfig, TFBertModel
from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Token sequence length: passed as max_length to the tokenizer in tokenize()
# and used as the shape of the three Keras input layers. Presumably it also
# corresponds to the "_32" suffix of the cached .npy inputs -- confirm.
MAX_LEN=32

In [3]:
model_name = 'bert-base-uncased'

# Define the BERT tokenizer for the checkpoint named above (input is lower-cased).
# NOTE(review): add_special_tokens / max_length / pad_to_max_length look like
# encode-time options; tokenize() passes them again to encode_plus, so they are
# presumably redundant here -- confirm against the installed transformers version.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True)

In [4]:
# Quick sanity check: show how the tokenizer splits a sample sentence.
tokenizer.tokenize('Testing something new. Hello dear')

['testing', 'something', 'new', '.', 'hello', 'dear']

In [5]:
def tokenize(sentences, tokenizer):
    """Encode sentences into the three integer arrays used as model inputs.

    Each sentence is passed to ``tokenizer.encode_plus`` with special tokens
    added and ``max_length=MAX_LEN`` / ``pad_to_max_length=True``. A tqdm
    progress bar is shown while iterating.

    Args:
        sentences: Iterable of sentences to encode.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            the keys ``input_ids``, ``attention_mask`` and ``token_type_ids``.

    Returns:
        A 3-tuple of int32 NumPy arrays ``(input_ids, attention_masks,
        token_type_ids)`` with one entry per sentence (each row presumably
        has length MAX_LEN thanks to the padding options -- confirm).
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        for text in tqdm(sentences)
    ]

    # Collect each field across all sentences, in the order callers expect.
    fields = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(
        np.asarray([encoding[field] for encoding in encodings], dtype='int32')
        for field in fields
    )

In [6]:
# Build a BERT configuration and load the pretrained TF encoder with it.
# NOTE(review): `dropout` / `attention_dropout` look like DistilBERT option
# names; BertConfig documents `hidden_dropout_prob` and
# `attention_probs_dropout_prob`, so these two kwargs may have no effect on
# the model -- confirm before relying on a 0.2 dropout rate.
config = BertConfig(dropout=0.2, attention_dropout=0.2)
# Do not return per-layer hidden states; only the pooled output is used below.
config.output_hidden_states = False
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

In [7]:
# Load the processed test split and keep only rows whose class is not -1
# (presumably -1 marks unlabelled rows -- confirm).
test = pd.read_csv('../dataset/test_processed.csv')
test_df = test.loc[test['class'].ne(-1)]

In [8]:
def preapre_environment():
    """Enable on-demand GPU memory allocation for TensorFlow (best effort).

    Turns on memory growth for every visible GPU so TensorFlow allocates GPU
    memory as needed instead of reserving it all up front. On a machine
    without GPUs this is a no-op.

    Failures are reported with a warning instead of being silently swallowed:
    TensorFlow rejects the setting once the GPUs have been initialized, and
    the caller should know that the setting was not applied.

    Returns:
        None
    """
    import warnings

    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        # CPU-only environment: nothing to configure.
        return

    try:
        # Memory growth must be configured identically on all GPUs, so apply
        # it to every device rather than only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except (ValueError, RuntimeError) as err:
        # Invalid device, or devices cannot be modified once initialized.
        warnings.warn(f"Could not enable GPU memory growth: {err}")

In [9]:
# NOTE(review): this runs after TFBertModel.from_pretrained (cell 6). If the
# GPU was already initialized by that call, memory growth presumably can no
# longer be changed -- consider running this right after the imports.
preapre_environment()

In [10]:
# Load the cached model inputs for the test set from .npy files
# (presumably produced earlier by tokenize() with MAX_LEN=32 -- confirm).
test_ids = np.load('../dataset/test_ids_32.npy')
test_masks = np.load('../dataset/test_masks_32.npy')
test_segment = np.load('../dataset/test_segment_32.npy')

In [11]:
# Ground-truth labels as a NumPy array, taken from the filtered test frame.
# NOTE(review): assumes the rows line up one-to-one with the cached .npy inputs -- confirm.
y_test = test_df["class"].values

In [12]:
# Three int32 inputs of length MAX_LEN: token ids, attention mask and segment ids.
input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="segment_ids")

# NOTE(review): the tuple unpacking assumes the model call returns exactly
# (sequence output, pooled output) -- verify for the installed transformers version.
last_hidden_state, pooler_output = transformer_model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids)

# Classification head on the pooled output: a 256-unit ReLU layer followed by
# a single sigmoid unit (one probability per example).
x = tf.keras.layers.Dense(256, activation = "relu")(pooler_output)
x = tf.keras.layers.Dense(1, activation = "sigmoid")(x)

model = tf.keras.Model(inputs=[input_ids, input_mask, segment_ids], outputs = x)

# Freeze the first four layers. Per the model.summary() output these are the
# three input layers and tf_bert_model, leaving only the Dense head trainable.
for layer in model.layers[:4]:
    layer.trainable = False

In [13]:
# Restore the weights stored in model_longer.h5 (presumably from a previous
# fine-tuning run of this same architecture -- confirm).
model.load_weights('model_longer.h5')

In [14]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 32)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 32)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 32, 768), (N 109482240   input_ids[0][0]                  
______________________________________________________________________________________________

In [16]:
# Compile so that evaluate() reports binary cross-entropy loss and accuracy.
# `learning_rate` replaces the deprecated `lr` alias of the Adam optimizer.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-6),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [21]:
# Subset of the test inputs and labels whose class is 1.
toxic_ids, toxic_masks, toxic_segment, toxic_y = (
    array[test_df['class'] == 1]
    for array in (test_ids, test_masks, test_segment, y_test)
)

In [23]:
# Subset of the test inputs and labels whose class is 0.
non_toxic_ids, non_toxic_masks, non_toxic_segment, non_toxic_y = (
    array[test_df['class'] == 0]
    for array in (test_ids, test_masks, test_segment, y_test)
)

In [22]:
# Evaluate on the class-1 examples only; the result is [loss, accuracy].
model.evaluate((toxic_ids, toxic_masks, toxic_segment), toxic_y)



[0.6375705003738403, 0.7701425552368164]

In [None]:
# Evaluate on the class-0 examples only; the result is [loss, accuracy].
model.evaluate((non_toxic_ids, non_toxic_masks, non_toxic_segment), non_toxic_y)

 216/1805 [==>...........................] - ETA: 3:37 - loss: 0.2047 - accuracy: 0.9197

In [17]:
# Evaluate on the full filtered test set; the result is [loss, accuracy].
model.evaluate((test_ids, test_masks, test_segment), y_test)

  85/2000 [>.............................] - ETA: 4:21 - loss: 0.2389 - accuracy: 0.9074

KeyboardInterrupt: 