# Tokenize and Split Dataset

* Read full dataset from csv
* Split into train/test (about 75/25), keeping all rows that share an `id` in the same split
* Tokenize
* Save

### Setup

In [1]:
import os

import pandas as pd
from pandas import DataFrame
from datasets import Dataset
from transformers import AutoTokenizer

In [2]:
# Checkpoint identifier passed to the tokenizer loader further down.
# Other checkpoints that were listed here as commented-out alternatives:
#   'prajjwal1/bert-medium'
#   'prajjwal1/bert-small'
model = 'distilbert-base-uncased'

# Keep only the part after the last '/' ("org/name" -> "name"); it is used
# as the per-model output directory when the datasets are saved.
model_name = model.rpartition('/')[-1]
model_name

'distilbert-base-uncased'

### Read csv

In [3]:
# Load the combined dataset (path is relative to the notebook's working directory).
df: DataFrame = pd.read_csv('dataset/full_dataset_combined.csv')
# Report (rows, columns) as a quick sanity check on what was loaded.
print(f'Dataset shape: {df.shape}')
# Preview the first rows; as the last expression this is rendered as the cell output.
df.head()

Dataset shape: (101734, 11)


Unnamed: 0,id,text,av,ac,pr,ui,s,c,i,a,score
0,CVE-2021-0001,Observable timing discrepancy in Intel(R) IPP ...,L,H,L,N,U,H,N,N,4.7
1,CVE-2021-0002,Improper conditions check in some Intel(R) Eth...,L,L,L,N,U,H,N,H,7.1
2,CVE-2021-0003,Improper conditions check in some Intel(R) Eth...,L,L,L,N,U,H,N,N,5.5
3,CVE-2021-0004,Improper buffer restrictions in the firmware o...,L,L,H,N,U,N,N,H,4.4
4,CVE-2021-0005,Uncaught exception in firmware for Intel(R) Et...,L,L,H,N,U,N,N,H,4.4


In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['id', 'text', 'av', 'ac', 'pr', 'ui', 's', 'c', 'i', 'a', 'score'], dtype='object')

In [5]:
# (rows, columns) of the full, unsplit dataset.
df.shape

(101734, 11)

### Split

In [6]:
# Test share in percent (the split cell below targets a 75 % train ratio).
TEST_SPLIT = 25

# Group the rows by their `id` column so that all rows sharing an id can be
# assigned to the same split. A scalar key is used instead of the one-element
# list ['id']: with a list, newer pandas versions yield 1-tuples as group keys
# when iterating (and older ones emit a FutureWarning about it).
# The result is a GroupBy object, not a DataFrame, so no DataFrame annotation.
df_group = df.groupby('id')

# NOTE: GroupBy.head() returns the first 5 rows of *each* group, so this
# displays close to the whole frame rather than a 5-row preview.
df_group.head()

Unnamed: 0,id,text,av,ac,pr,ui,s,c,i,a,score
0,CVE-2021-0001,Observable timing discrepancy in Intel(R) IPP ...,L,H,L,N,U,H,N,N,4.7
1,CVE-2021-0002,Improper conditions check in some Intel(R) Eth...,L,L,L,N,U,H,N,H,7.1
2,CVE-2021-0003,Improper conditions check in some Intel(R) Eth...,L,L,L,N,U,H,N,N,5.5
3,CVE-2021-0004,Improper buffer restrictions in the firmware o...,L,L,H,N,U,N,N,H,4.4
4,CVE-2021-0005,Uncaught exception in firmware for Intel(R) Et...,L,L,H,N,U,N,N,H,4.4
...,...,...,...,...,...,...,...,...,...,...,...
101729,CVE-2016-9989,IBM Jazz Foundation Reporting Service (JRS) is...,N,L,L,R,C,L,L,N,5.4
101730,CVE-2016-9991,IBM Sterling Order Management is vulnerable to...,N,L,L,R,U,H,H,H,8.0
101731,CVE-2016-9992,IBM Kenexa LCMS Premier on Cloud is vulnerable...,N,L,L,N,U,H,L,N,7.1
101732,CVE-2016-9993,IBM Kenexa LCMS Premier on Cloud is vulnerable...,N,L,L,N,U,H,L,N,7.1


In [7]:
# Target share of rows in the training split, derived from TEST_SPLIT (a
# percentage) instead of a second hard-coded constant: 1 - 25 / 100 == 0.75.
train_fraction = 1 - TEST_SPLIT / 100

train_rows = []
test_rows = []
# Groups are visited in the GroupBy's iteration order. The group key is not
# needed, and it is deliberately not bound to `id`, which would shadow the
# builtin for the rest of the kernel session.
for _, group in df_group:
    # Share of rows assigned to train so far; 0 while nothing has been
    # assigned, so the very first group always goes to train.
    assigned = len(train_rows) + len(test_rows)
    ratio = len(train_rows) / assigned if assigned else 0
    # Route the whole group to one side so rows sharing an id are never
    # spread over both splits.
    target_rows = test_rows if ratio > train_fraction else train_rows
    target_rows.extend(group.values.tolist())


# Rebuild frames with the same column labels as the source frame.
df_train = pd.DataFrame(train_rows, columns=df.columns)
df_test = pd.DataFrame(test_rows, columns=df.columns)
print(df_train.shape)
print(df_test.shape)
df_test.head()

(76300, 11)
(25434, 11)


Unnamed: 0,id,text,av,ac,pr,ui,s,c,i,a,score
0,CVE-2016-0003,Microsoft Edge allows remote attackers to exec...,N,L,N,R,C,H,H,H,9.6
1,CVE-2016-0003,This vulnerability allows remote attackers to ...,N,L,N,R,C,H,H,H,9.6
2,CVE-2016-0011,Microsoft SharePoint Server 2013 SP1 and Share...,N,L,L,R,C,L,L,N,5.4
3,CVE-2016-0016,"Microsoft Windows Vista SP2, Windows Server 20...",L,L,L,N,U,H,H,H,7.8
4,CVE-2016-0021,"Microsoft InfoPath 2007 SP3, 2010 SP2, and 201...",L,L,N,R,U,H,H,H,7.8


In [8]:
# Fraction of all rows that ended up in the training split.
len(df_train) / (len(df_train) + len(df_test))

0.7499950852222462

### Create Hugging Face Datasets and Tokenize

In [9]:
# Wrap both pandas splits as `datasets.Dataset` objects (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test)
)

In [14]:
# Load the tokenizer belonging to the checkpoint named in `model`.
tokenizer = AutoTokenizer.from_pretrained(model)


def tokenize_function(sample):
    """Tokenize the "text" field of `sample`.

    Sequences are truncated to, and padded up to, 512 tokens. The function is
    applied through `Dataset.map(..., batched=True)` below, so `sample` is a
    batch of rows rather than a single row.
    """
    return tokenizer(
        sample["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


# Tokenize the training split first, then the test split.
tokenized_train_set, tokenized_test_set = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

  0%|          | 0/67 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

In [15]:
# Inspect the first tokenized training example: print the length of its
# input_ids, then display the whole record as the cell output.
first_sample = tokenized_train_set[0]
print(len(first_sample['input_ids']))
first_sample

512


{'id': 'CVE-2016-0002',
 'text': 'The Microsoft (1) VBScript 5.7 and 5.8 and (2) JScript 5.7 and 5.8 engines, as used in Internet Explorer 8 through 11 and other products, allow remote attackers to execute arbitrary code via a crafted web site, aka "Scripting Engine Memory Corruption Vulnerability."',
 'av': 'N',
 'ac': 'H',
 'pr': 'N',
 'ui': 'R',
 's': 'U',
 'c': 'H',
 'i': 'H',
 'a': 'H',
 'score': 7.5,
 'input_ids': [101,
  1996,
  7513,
  1006,
  1015,
  1007,
  1058,
  5910,
  23235,
  1019,
  1012,
  1021,
  1998,
  1019,
  1012,
  1022,
  1998,
  1006,
  1016,
  1007,
  1046,
  22483,
  1019,
  1012,
  1021,
  1998,
  1019,
  1012,
  1022,
  5209,
  1010,
  2004,
  2109,
  1999,
  4274,
  10566,
  1022,
  2083,
  2340,
  1998,
  2060,
  3688,
  1010,
  3499,
  6556,
  17857,
  2000,
  15389,
  15275,
  3642,
  3081,
  1037,
  19275,
  4773,
  2609,
  1010,
  9875,
  1000,
  5896,
  2075,
  3194,
  3638,
  7897,
  18130,
  1012,
  1000,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


### Save Datasets

In [16]:
# Per-model output directory. Create it up front so a first run for a new
# `model_name` does not depend on the directory already existing.
output_dir = f'dataset/{model_name}'
os.makedirs(output_dir, exist_ok=True)

# Write both tokenized splits as JSON files under the per-model directory.
tokenized_train_set.to_json(f'{output_dir}/train_descriptions_only.json')
# Last expression: the value returned by this call is shown as the cell output.
tokenized_test_set.to_json(f'{output_dir}/test_descriptions_only.json')

Creating json from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

61254783