In [11]:
import tensorflow as tf

# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Abort early unless the first GPU is available; otherwise report it.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [12]:
!pip install -q transformers tensorflow_datasets

[K     |████████████████████████████████| 3.5 MB 7.0 MB/s 
[K     |████████████████████████████████| 6.8 MB 47.4 MB/s 
[K     |████████████████████████████████| 67 kB 5.9 MB/s 
[K     |████████████████████████████████| 895 kB 63.7 MB/s 
[K     |████████████████████████████████| 596 kB 66.6 MB/s 
[?25h

In [13]:
# import tensorflow_datasets as tfds
# (ds_train, ds_test), ds_info = tfds.load('imdb_reviews', 
#           split = (tfds.Split.TRAIN, tfds.Split.TEST),
#           as_supervised=True,
#           with_info=True)

import json
from google.colab import drive

# Mount Google Drive so the dataset file stored there can be read from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:

# Read the raw dataset text from the mounted Drive. The encoding is given
# explicitly so the read does not depend on the runtime's locale default.
with open('/content/drive/MyDrive/bert/vul_dataset.json', encoding='utf-8') as f:
  json_data = f.read()


[{'title': 'Multiple Vendor Linux Mountd Vulnerability', 'body': 'NFS servers running certain implementations of mountd, primarily Linux systems. On some systems, the vulnerable NFS server is enabled by default. This vulnerability can be exploited even if the NFS server does not share any file systems. This vulnerability is specifically a buffer overflow in the mount daemons logging code which is supposed to log unauthorized mount attempts.', 'id': 'fd65453815b27d480cd2880a19ea76a3dec91039f1bd7c616d20cb8fc35b422f', 'url': 'http://www.securityfocus.com/bid/121', 'year': 2002, 'impact': 10, 'vulnerabilityType': 'CWE-119', 'isNvdReport': False, 'referencesId': [], 'label': 1}, {'title': '', 'body': 'Buffer overflow in NFS mountd gives root access to remote attackers, mostly in Linux systems.', 'id': 'CVE-1999-0002', 'url': 'https://nvd.nist.gov/vuln/detail/CVE-1999-0002', 'year': 2002, 'impact': 10, 'vulnerabilityType': 'CWE-119', 'isNvdReport': True, 'referencesId': ['fd65453815b27d480cd

In [43]:
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
import numpy as np

# One seed for both the shuffle and the split, so that re-running the notebook
# produces the same train/test partition (and therefore comparable metrics).
SEED = 42

# Parse the dataset text into a list of records and shuffle it in place.
ds = json.loads(json_data)
np.random.default_rng(SEED).shuffle(ds)

# Hold out 20% of the records for evaluation.
ds_train, ds_test = train_test_split(ds, test_size=0.2, random_state=SEED)
print(len(ds_train))
print(len(ds_test))

# Show two records from each split as a sanity check.
for item in ds_train[1:3]:
  print(item)

for item in ds_test[1:3]:
  print(item)

105176
26295
{'title': "trixbox 'index.php' Multple Cross-Site Scripting Vulnerabilities", 'body': "The 'trixbox' product is prone to multiple cross-site scripting vulnerabilities because the application fails to properly sanitize user-supplied input. An attacker may leverage these issues to execute arbitrary script code in the browser of an unsuspecting user in the context of the affected site. This may help the attacker steal cookie-based authentication credentials and launch other attacks.These issues affect trixbox 2.4.2.0; earlier versions may also be vulnerable.", 'id': '1b3d49ee5f11bc23ad1255bf2ce6ed751b17c4ce9beb4f6dcf6265d0350c8872', 'url': 'http://www.securityfocus.com/bid/27460', 'year': 2008, 'impact': 2.9, 'vulnerabilityType': 'CWE-79', 'isNvdReport': False, 'referencesId': [], 'label': 0}
{'title': '', 'body': 'D-Link DGS-1510-28XMP, DGS-1510-28X, DGS-1510-52X, DGS-1510-52, DGS-1510-28P, DGS-1510-28, and DGS-1510-20 Websmart devices with firmware before 1.31.B003 allow at

In [44]:
from transformers import BertTokenizer

# Load the tokenizer that matches the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [16]:
# Fetch the tokenizer's vocabulary mapping and print ten of its tokens
# (positions 5000-5009) to get a feel for what the entries look like.
vocabulary = tokenizer.get_vocab()

vocabulary_tokens = list(vocabulary)
print(vocabulary_tokens[5000:5010])

['knight', 'lap', 'survey', 'ma', '##ow', 'noise', 'billy', '##ium', 'shooting', 'guide']


In [17]:
# Maximum number of tokens per encoded example; BERT accepts up to 512.
max_length = 512

# Batch sizes of 16 or 32 are the usual recommendation for BERT. On this dataset,
# however, the model overfits quite fast, and smaller batches act as a form of regularization.
# Adding another dropout layer is an alternative worth trying.

batch_size = 6

def convert_example_to_feature(review):
  """Encode one text into fixed-length BERT inputs.

  Uses the module-level `tokenizer` to tokenize `review`, add the special
  tokens, truncate anything longer than `max_length` tokens and pad shorter
  inputs up to `max_length`.

  Args:
    review: the text to encode.

  Returns:
    The tokenizer's encoding for `review`. It is requested with the
    'input_ids', 'token_type_ids' and 'attention_mask' entries, each padded
    or truncated to `max_length`.
  """
  return tokenizer.encode_plus(
      review,
      add_special_tokens=True,  # add [CLS], [SEP]
      max_length=max_length,  # longest sequence that can go to BERT
      padding='max_length',  # add [PAD] tokens; replaces the deprecated pad_to_max_length=True
      truncation=True,  # explicitly truncate examples to max_length
      return_attention_mask=True,  # mask so the model does not attend to pad tokens
      return_token_type_ids=True,  # downstream code reads this key, so ask for it explicitly
  )
  
# print('encoded', convert_example_to_feature('test'))
# print('encoded', convert_example_to_feature('token'))
# print('encoded', convert_example_to_feature('tokenizer'))
# print('encoded', convert_example_to_feature('test tokenizer'))
# print('encoded', convert_example_to_feature('testtokenizer'))

# Spot-check the encoder on two test records (indices 1 and 2).
for item in ds_test[1:3]:
  encoded_item = convert_example_to_feature(item['body'])
  print('encoded', encoded_item)

encoded {'input_ids': [101, 1037, 17698, 2058, 12314, 1999, 1996, 3027, 2361, 2862, 1006, 1048, 2015, 1007, 3094, 1999, 2462, 2015, 4473, 6556, 17857, 2000, 6204, 1037, 14920, 1997, 2326, 1998, 1010, 1999, 2070, 3572, 1010, 15389, 15275, 10954, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 



In [45]:
# Adapter between the 4-tuple dataset elements and the (features, label)
# pairs fed to TFBertForSequenceClassification.
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Repackage one example as a (features dict, label) pair.

  Args:
    input_ids: token ids of the example.
    attention_masks: attention mask of the example.
    token_type_ids: token type ids of the example.
    label: the example's label, passed through unchanged.

  Returns:
    A 2-tuple: a dict keyed by "input_ids", "token_type_ids" and
    "attention_mask", followed by `label`.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label

def encode_examples(ds):
  """Encode an iterable of records into a tf.data.Dataset of model inputs.

  Each record is indexed with 'body' (the text, encoded through
  `convert_example_to_feature`) and 'label' (wrapped in a one-element list).

  Args:
    ds: iterable of records supporting item['body'] and item['label'].

  Returns:
    A dataset built from slices of the collected columns and mapped through
    `map_example_to_dict`, so every element is a (features dict, label) pair.
  """
  # Column-wise accumulators; the dataset is assembled from them in one go.
  columns = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
  labels = []

  for example in ds:
    encoded = convert_example_to_feature(example['body'])
    for key, column in columns.items():
      column.append(encoded[key])
    labels.append([example['label']])

  # Tuple order must match the positional parameters of map_example_to_dict.
  slices = (
      columns['input_ids'],
      columns['attention_mask'],
      columns['token_type_ids'],
      labels,
  )
  return tf.data.Dataset.from_tensor_slices(slices).map(map_example_to_dict)


  # encode_examples(ds_train[0:2])

In [46]:
# Training pipeline: encode, shuffle through a 10000-element buffer, then batch.
# (Drop the .shuffle(...) step to feed the examples in their stored order.)
train_examples = encode_examples(ds_train)
ds_train_encoded = train_examples.shuffle(10000).batch(batch_size)

# Evaluation pipeline: encode and batch only.
test_examples = encode_examples(ds_test)
ds_test_encoded = test_examples.batch(batch_size)



In [47]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Learning rates commonly recommended for fine-tuning BERT with Adam: 5e-5, 3e-5, 2e-5.
learning_rate = 2e-5

# A single epoch is used for illustration; more epochs may help as long as the model does not overfit.
number_of_epochs = 1

# Size the classification head from the labels actually present in the data
# instead of relying on the default of two classes. Sparse categorical
# cross-entropy needs every label id to be a valid index into the logits.
# NOTE(review): the recorded training log shows `loss: nan`, which is consistent
# with label ids outside the default head — confirm against the dataset's label values.
labels_seen = {item['label'] for split in (ds_train, ds_test) for item in split}
if any(not isinstance(label, int) or label < 0 for label in labels_seen):
  raise ValueError(
      'labels must be non-negative integers, got: {}'.format(sorted(labels_seen, key=repr)[:10]))
# Never go below 2 so that a 0/1-labelled dataset builds exactly the same model as before.
num_labels = max(2, max(labels_seen) + 1)

# Model initialization: pretrained encoder plus a newly initialized classifier head.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Adam is the recommended optimizer for this classifier.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# Labels are integer class ids rather than one-hot vectors, so use the sparse
# categorical cross-entropy loss and the matching accuracy metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune on the encoded training batches, validating against the encoded
# test batches; the value returned by fit is kept in bert_history.
bert_history = model.fit(
    ds_train_encoded,
    validation_data=ds_test_encoded,
    epochs=number_of_epochs,
)


  752/17530 [>.............................] - ETA: 3:19:02 - loss: nan - accuracy: 0.1434