In [1]:
# Initialization
####################################
!pip install transformers

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Found GPU at: /device:GPU:0
Mon May  1 04:52:15 2023 

In [6]:
# Configs
####################################

### Dataset config

# hackerone-report-20types-weakness.csv
# hackerone-report-20types-bounty_range.csv
# hackerone-report-20types-severity.csv

# weakness clasification
# classes = ['CWE-77', 'CWE-22', 'CWE-310', 'CWE-89', 'CWE-918', 'CWE-639', 'CWE-103', 'CWE-400', 'CWE-287', 'CWE-657', 'CWE-79', 'CWE-284', 'CWE-352', 'CWE-200', 'CWE-840', 'CWE-233', 'CWE-94', 'CWE-601', 'CWE-119', 'other'];
# datasetUrl = 'https://raw.githubusercontent.com/SamanShafigh/vulBERT/main/hackerone-report-20types-weakness.csv';

# # bounty_range clasification
classes = ['undefined', 'low', 'medium', 'high', 'extreme'];
datasetUrl = 'https://raw.githubusercontent.com/SamanShafigh/vulBERT/main/hackerone-report-20types-bounty_range.csv';

# severity clasification
# classes = ['undefined', 'low', 'medium', 'high', 'critical'];
# datasetUrl = 'https://raw.githubusercontent.com/SamanShafigh/vulBERT/main/hackerone-report-20types-severity.csv';


### Training config
epochs = 20


### **Data Preparation**

In [7]:
# Load dataset
####################################

df = pd.read_csv(datasetUrl, sep='\t')
numberOfClasses = len(classes)
print(numberOfClasses)

df.head()
df.info()
df.drop(len(df)-1, inplace=True)
df['type'] = df['type'].astype(int)
df['type'].value_counts()

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


# Define train & val dataset
####################################
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['report'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

print("generate training data")
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer) 
labels = np.zeros((len(df), numberOfClasses))
labels.shape
labels[np.arange(len(df)), df['type'].values] = 1 # one-hot encoded target tensor

# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 
dataset.take(1)

dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

p = 0.8
train_size = int((len(df)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

print("Train size: " + str(train_size))

5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7421 entries, 0 to 7420
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   report  7421 non-null   object
 1   type    7421 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 116.1+ KB
generate training data


0it [00:00, ?it/s]

Train size: 370


### **Model**

In [8]:
# Setup BERT model
####################################
from transformers import TFBertModel
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(numberOfClasses, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)

loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [9]:
# Train model
####################################

hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
sentiment_model.save('top15_vul_dataset_nvd_only_vulBERT_v1')



### **Prediction**

In [None]:
sentiment_model = tf.keras.models.load_model('vulBERT_v1')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes):
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]

In [None]:
input_text = input('Enter movie review here: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(sentiment_model, processed_data=processed_data, classes)
print(f"Predicted Sentiment: {result}")

Enter movie review here: kmMail does not sufficiently sanitize HTML and script code from the body of e-mail messages.  As a result, an attacker may send a malicious message to a user of kmMail that includes arbitrary HTML and script code.This may allow an attacker to steal cookie-based authentication credentials from users of the webmail system.  Other attacks are also possible.
Predicted Sentiment: CWE-79
