In [None]:
!pip install transformers seqeval[gpu]



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, AutoTokenizer, AutoModelForTokenClassification

In [None]:
from torch import cuda

# Use the GPU when torch reports one as available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Load the token table that carries the reference tags (one row per word).
data = pd.read_csv(filepath_or_buffer="ner_formatted.csv")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Subject:,NONE,O
1,,Blind,NONE,B-Vulnerability_Type
2,,SQL,NONE,I-Vulnerability_Type
3,,injection,NONE,I-Vulnerability_Type
4,,in,NONE,O


In [None]:
# Load the second token table (same layout as `data`); it is compared against `data` below.
rule_data = pd.read_csv(filepath_or_buffer="ner_long_formatted.csv")
rule_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Subject:,NONE,O
1,,Blind,NONE,O
2,,SQL,NONE,O
3,,injection,NONE,O
4,,in,NONE,O


In [None]:
# Non-null count per column; a column with a smaller count than the others has missing cells.
data.count()

Sentence #        999
Word          1156426
POS           1156739
Tag           1156739
dtype: int64

In [None]:
# Non-null count per column; a column with a smaller count than the others has missing cells.
rule_data.count()

Sentence #        999
Word          1156426
POS           1156739
Tag           1156739
dtype: int64

In [None]:
# Forward-fill missing cells with the closest non-null value above them, so that every
# token row carries a "Sentence #" (the head() preview shows it only on a sentence's first row).
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in pandas >= 2.1.
# NOTE(review): this fills *every* column, so rows whose `Word` is missing (data.count()
# reports fewer Words than rows) inherit the previous row's word -- confirm this is intended.
data = data.ffill()
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Subject:,NONE,O
1,Sentence: 1,Blind,NONE,B-Vulnerability_Type
2,Sentence: 1,SQL,NONE,I-Vulnerability_Type
3,Sentence: 1,injection,NONE,I-Vulnerability_Type
4,Sentence: 1,in,NONE,O


In [None]:
# Forward-fill missing cells with the closest non-null value above them, so that every
# token row carries a "Sentence #" (the head() preview shows it only on a sentence's first row).
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in pandas >= 2.1.
# NOTE(review): this fills *every* column, so rows whose `Word` is missing (rule_data.count()
# reports fewer Words than rows) inherit the previous row's word -- confirm this is intended.
rule_data = rule_data.ffill()
rule_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Subject:,NONE,O
1,Sentence: 1,Blind,NONE,O
2,Sentence: 1,SQL,NONE,O
3,Sentence: 1,injection,NONE,O
4,Sentence: 1,in,NONE,O


In [None]:
# Broadcast each sentence's full text (its words joined by spaces) onto all of its token rows.
data['sentence'] = (
    data.groupby('Sentence #')['Word']
        .transform(lambda words: ' '.join(words))
)
# Likewise broadcast the sentence's tags as one comma-separated string.
data['word_labels'] = (
    data.groupby('Sentence #')['Tag']
        .transform(lambda tags: ','.join(tags))
)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 1,Subject:,NONE,O,Subject: Blind SQL injection in WordPress Meta...,"O,B-Vulnerability_Type,I-Vulnerability_Type,I-..."
1,Sentence: 1,Blind,NONE,B-Vulnerability_Type,Subject: Blind SQL injection in WordPress Meta...,"O,B-Vulnerability_Type,I-Vulnerability_Type,I-..."
2,Sentence: 1,SQL,NONE,I-Vulnerability_Type,Subject: Blind SQL injection in WordPress Meta...,"O,B-Vulnerability_Type,I-Vulnerability_Type,I-..."
3,Sentence: 1,injection,NONE,I-Vulnerability_Type,Subject: Blind SQL injection in WordPress Meta...,"O,B-Vulnerability_Type,I-Vulnerability_Type,I-..."
4,Sentence: 1,in,NONE,O,Subject: Blind SQL injection in WordPress Meta...,"O,B-Vulnerability_Type,I-Vulnerability_Type,I-..."


In [None]:
# Broadcast each sentence's full text (its words joined by spaces) onto all of its token rows.
rule_data['sentence'] = (
    rule_data.groupby('Sentence #')['Word']
        .transform(lambda words: ' '.join(words))
)
# Likewise broadcast the sentence's tags as one comma-separated string.
rule_data['word_labels'] = (
    rule_data.groupby('Sentence #')['Tag']
        .transform(lambda tags: ','.join(tags))
)
rule_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 1,Subject:,NONE,O,Subject: Blind SQL injection in WordPress Meta...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Sentence: 1,Blind,NONE,O,Subject: Blind SQL injection in WordPress Meta...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,Sentence: 1,SQL,NONE,O,Subject: Blind SQL injection in WordPress Meta...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Sentence: 1,injection,NONE,O,Subject: Blind SQL injection in WordPress Meta...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,Sentence: 1,in,NONE,O,Subject: Blind SQL injection in WordPress Meta...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
# Scan for the longest sentence, measured in word-level labels, printing each new record.
# The running maximum is deliberately NOT called `max`: rebinding that name would shadow
# the builtin max() for every cell executed afterwards in this kernel.
max_words = 0
for sent in rule_data['word_labels'].tolist():
  if (curr := len(sent.split(','))) > max_words:
    max_words = curr
    print(max_words)
max_words

66
3181
3853
6884
7535
7823
17563


17563

In [None]:
# Number the distinct tags in order of first appearance in `data`.
label2id = {tag: idx for idx, tag in enumerate(data.Tag.unique())}
# Reverse lookup (id -> tag); the tags are unique, so inverting the mapping is lossless.
id2label = {idx: tag for tag, idx in label2id.items()}
label2id

{'O': 0,
 'B-Vulnerability_Type': 1,
 'I-Vulnerability_Type': 2,
 'B-Vendor': 3,
 'B-Product': 4,
 'I-Product': 5,
 'B-Version_Number': 6,
 'B-Published_dates': 7,
 'I-Published_dates': 8,
 'B-Steps_to_Reproduce': 9,
 'I-Steps_to_Reproduce': 10,
 'B-Proof_of_Concept': 11,
 'I-Proof_of_Concept': 12,
 'B-CVE': 13,
 'B-Host_Information': 14,
 'I-Host_Information': 15,
 'B-Remote/Local': 16,
 'I-Version_Number': 17,
 'B-Impact': 18,
 'I-Impact': 19,
 'B-Risk': 20,
 'I-Risk': 21,
 'I-Vendor': 22,
 'I-CVE': 23,
 'I-Remote/Local': 24}

In [None]:
# Collapse the per-token rows into one row per distinct (sentence, word_labels) pair,
# renumbering the index from 0.
data = data.loc[:, ["sentence", "word_labels"]].drop_duplicates(ignore_index=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Subject: Blind SQL injection in WordPress Meta...,"O,B-Vulnerability_Type,I-Vulnerability_Type,I-..."
1,1742334 - (CVE-2022-22740) Use-after-free of C...,"O,O,O,B-CVE,O,O,O,O,O,B-CVE,O,O,O,O,O,O,O,O,O,..."
2,Joomla (< 3.6.4) Account Creation/Elevated Pri...,"B-Product,O,B-Version_Number,B-Vulnerability_T..."
3,Teltonika Gateway TRB245 Multiple Vulnerabilit...,"B-Proof_of_Concept,I-Proof_of_Concept,I-Proof_..."
4,OpenWRT code-execution bug puts millions of de...,"B-Product,B-Vulnerability_Type,I-Vulnerability..."


In [None]:
# Collapse the per-token rows into one row per distinct (sentence, word_labels) pair,
# renumbering the index from 0.
rule_data = rule_data.loc[:, ["sentence", "word_labels"]].drop_duplicates(ignore_index=True)
rule_data.head()

Unnamed: 0,sentence,word_labels
0,Subject: Blind SQL injection in WordPress Meta...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,1742334 - (CVE-2022-22740) Use-after-free of C...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,Joomla (< 3.6.4) Account Creation/Elevated Pri...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Teltonika Gateway TRB245 Multiple Vulnerabilit...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,OpenWRT code-execution bug puts millions of de...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
# Fixed sequence length every example is padded/truncated to. The value equals the longest
# sentence (in words) reported by the length-scan cell above.
# NOTE(review): that scan counts words, while `dataset` truncates on word pieces; sentences
# whose words split into several pieces may therefore still be truncated -- confirm.
MAX_LEN = 17563
# The remaining hyperparameters are not referenced by any cell visible in this notebook section.
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 10
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
# Tokenizer used by `dataset` to split words into word pieces and map them to ids.
tokenizer = AutoTokenizer.from_pretrained('jackaduma/SecBERT')

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence word by word, keeping a label for every word piece.

    The sentence is stripped and split on whitespace, `text_labels` is split
    on commas, and the two are paired positionally (any surplus on either
    side is ignored). Each word is tokenized on its own and its label is
    repeated verbatim once per resulting word piece -- so a ``B-`` label is
    carried by every piece of its word, not only the first.

    Args:
        sentence: whitespace-separated words.
        text_labels: comma-separated labels, one per word.
        tokenizer: object exposing ``tokenize(word)`` returning a list of pieces.

    Returns:
        ``(pieces, piece_labels)``: two lists of equal length.
    """
    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    pieces_out = []
    tags_out = []
    for word, tag in zip(words, word_tags):
        pieces = tokenizer.tokenize(word)
        pieces_out += pieces
        # one copy of the word-level tag per word piece
        tags_out += [tag for _ in pieces]

    return pieces_out, tags_out

In [None]:
class dataset(Dataset):
    """
    Map-style dataset that encodes one (sentence, word_labels) row per item.

    Each item is tokenized word by word via `tokenize_and_preserve_labels`,
    wrapped in [CLS]/[SEP], padded with [PAD] or truncated to `max_len`, and
    returned as three `torch.long` tensors of length `max_len`:

    - 'ids':     token ids from `tokenizer.convert_tokens_to_ids`
    - 'mask':    1 for every non-[PAD] token, 0 for [PAD]
    - 'targets': label ids looked up in the module-level `label2id`

    [CLS], [SEP] and [PAD] positions are labelled "O". When a sequence is
    longer than `max_len` it is cut from the right, which also drops [SEP].

    Args:
        dataframe: frame with `sentence` and `word_labels` columns, indexed
            so that `dataframe.sentence[i]` is valid for i in range(len(dataframe)).
        tokenizer: object exposing `tokenize` and `convert_tokens_to_ids`.
        max_len: fixed output length of every tensor.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        # "O" for [CLS] goes first and "O" for [SEP] goes last. The [SEP] label must be
        # appended: list.insert(-1, ...) would place it *before* the final word-piece
        # label, shifting that label onto [SEP].
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # truncate
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # pad
            tokenized_sentence = tokenized_sentence + ['[PAD]' for _ in range(maxlen - len(tokenized_sentence))]
            labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        # step 6: convert labels to ids via the module-level label2id mapping
        label_ids = [label2id[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [None]:
# Wrap both frames in the same dataset class so they are encoded identically.
true_set = dataset(dataframe=data, tokenizer=tokenizer, max_len=MAX_LEN)
rule_set = dataset(dataframe=rule_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# Inspect the first encoded example of true_set (its 'ids', 'mask' and 'targets' tensors).
true_set[0]

{'ids': tensor([   2, 3522,   30,  ...,    0,    0,    0]),
 'mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'targets': tensor([0, 0, 0,  ..., 0, 0, 0])}

In [None]:
# Inspect the first encoded example of rule_set (its 'ids', 'mask' and 'targets' tensors).
rule_set[0]

{'ids': tensor([   2, 3522,   30,  ...,    0,    0,    0]),
 'mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'targets': tensor([0, 0, 0,  ..., 0, 0, 0])}

In [None]:
# One list of label ids per example of true_set.
# NOTE: `labels` is rebound to a flat list of tag strings by a later cell.
labels = [true_set[row_idx]['targets'].tolist() for row_idx in range(len(true_set))]

In [None]:
# One list of label ids per example of rule_set.
# NOTE: `predictions` is rebound to a flat list of tag strings by a later cell.
predictions = [rule_set[row_idx]['targets'].tolist() for row_idx in range(len(rule_set))]

In [None]:
# Flatten true_set into a single list of tag strings (all examples concatenated in order).
# The inner variable is named `label_id` rather than `id` so the builtin id() is not
# shadowed, and the comprehension keeps the loop variables out of the notebook namespace.
labels = [
    id2label[label_id]
    for row_idx in range(len(true_set))
    for label_id in true_set[row_idx]['targets'].tolist()
]

In [None]:
# Print the flat position of every tag in `labels` that starts with 'B-Proof'.
for i, tag in enumerate(labels):
  if not tag.startswith('B-Proof'):
    continue
  print(i)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
13453804
13453805
13453806
13453807
13453808
13453809
13453810
13453811
13453812
13453813
13453814
13453815
13453816
13453817
13453818
13453819
13453820
13453821
13453822
13453823
13453824
13453825
13453826
13453827
13453828
13453829
13453830
13453831
13453832
13453833
13453834
13453835
13453836
13453837
13453838
13453839
13453840
13453841
13453842
13471798
13471799
13471800
13489372
13524127
13524128
13577225
13594054
13594055
13594179
13594180
13594301
13611331
13613083
13613357
13613358
13613359
13613360
13613361
13613362
13613363
13613364
13613365
13613366
13613367
13613368
13613369
13613370
13613371
13613372
13613373
13613374
13613375
13613376
13613377
13613378
13613379
13613380
13613381
13613382
13613383
13613384
13613385
13613386
13613387
13613388
13613389
13613390
13613391
13613392
13613393
13613594
13613595
13613596
13613597
13613598
13613599
13613600
13613601
13613602
13613603
13613604
13613605
13613606
13613607

In [None]:
# Flatten rule_set into a single list of tag strings (all examples concatenated in order).
# The inner variable is named `label_id` rather than `id` so the builtin id() is not
# shadowed, and the comprehension keeps the loop variables out of the notebook namespace.
predictions = [
    id2label[label_id]
    for row_idx in range(len(rule_set))
    for label_id in rule_set[row_idx]['targets'].tolist()
]

In [None]:
# Number of examples (rows of rule_data) in rule_set.
len(rule_set)

999

In [None]:
# Print the flat position of every tag in `predictions` that starts with 'B-Proof'.
for i, tag in enumerate(predictions):
  if not tag.startswith('B-Proof'):
    continue
  print(i)

19235
19236
19237
19238
19239
19240
19241
19242
19243
19244
19245
19246
19247
19248
19249
19250
19251
19315
19316
19317
19318
19319
19320
19321
19322
19323
19324
19325
19326
19327
19328
19329
19330
19331
19840
19841
19842
19843
19844
19845
19846
19847
19848
19849
19850
19851
19852
19853
19854
19855
27638
27639
27640
27641
27642
27643
27644
27645
27646
27647
27648
27649
27650
27651
27652
27653
53270
53271
53272
53273
53274
53531
53985
53986
53987
53988
54612
89140
141285
141286
141287
177093
177094
177095
177096
177097
177098
177099
177100
177101
177102
177103
177104
177105
177106
177107
177108
177109
177176
177177
177178
177179
177180
177181
177182
177183
177184
177185
177186
177187
177188
177189
177190
177191
264214
264215
264216
579924
579925
579926
579927
579928
579929
773379
773380
773381
775247
775248
775249
813410
813411
813412
966560
966561
966562
984396
984397
984398
984399
984400
984401
984402
984403
984404
984405
984406
984407
984408
984409
984410
984411
984412
984413
984414


In [None]:
from seqeval.metrics import classification_report

# seqeval scores lists of tag sequences; here each side is passed as ONE flat sequence
# covering the whole corpus, with `labels` as the reference and `predictions` as the output.
print(classification_report(y_true=[labels], y_pred=[predictions]))

  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

               CVE       0.00      0.00      0.00     28519
  Host_Information       0.00      0.00      0.00      1097
            Impact       0.09      0.02      0.03      1129
           Product       0.00      0.00      0.00     12869
  Proof_of_Concept       0.29      0.05      0.09     19159
   Published_dates       0.00      0.00      0.00      3658
      Remote/Local       0.00      0.00      0.00       433
              Risk       0.00      0.00      0.00      1772
Steps_to_Reproduce       0.36      0.01      0.02      3263
            Vendor       0.00      0.00      0.00      3519
    Version_Number       0.00      0.00      0.00     10709
Vulnerability_Type       0.00      0.00      0.00      7105

         micro avg       0.28      0.01      0.02     93232
         macro avg       0.06      0.01      0.01     93232
      weighted avg       0.07      0.01      0.02     93232

