## All-in-one notebook to create a tokenized dataset from the 2022 descriptions only

In [16]:
from typing import List

import pandas as pd
import seaborn as sns
from datasets import Dataset
from pandas import DataFrame
from pymongo import MongoClient
from transformers import AutoTokenizer

# Use seaborn's "white" style for any plots drawn in this notebook.
sns.set_style("white")

# Where the source documents live: host, database and collection names.
MONGO_HOST = "127.0.0.1"
MONGO_DATABASE = "nvd"
MONGO_COLLECTION = "nvd_2022"

mongo_client = MongoClient(MONGO_HOST)
mongo_db = mongo_client[MONGO_DATABASE]
mongo_collection = mongo_db[MONGO_COLLECTION]

In [17]:
# The aggregation is assembled from named stages so it reads top to bottom.

# Keep documents whose `cvssv3` value is neither the string 'None' nor empty.
match_scored = {
    '$match': {'cvssv3': {'$nin': ['None', '']}},
}

# Emit one document per element of `description`.
unwind_description = {
    '$unwind': {'path': '$description'},
}

# Drop fields that are not used further down.
# NOTE(review): the inclusion projection that follows already limits the
# output to `_id`, `text` and `cvssv3`, so this stage looks redundant — confirm.
drop_unused_fields = {
    '$project': {
        'year': 0,
        'reference_data': 0,
        'cwe': 0,
        'cvssv2': 0,
        'cpe': 0,
        'references': 0,
    },
}

# Expose `description` under the name `text` and keep `cvssv3`.
select_text_and_vector = {
    '$project': {'text': '$description', 'cvssv3': 1},
}

data_descriptions: List = list(mongo_collection.aggregate([
    match_scored,
    unwind_description,
    drop_unused_fields,
    select_text_and_vector,
]))

In [18]:
# Number of documents collected in `data_descriptions`.
len(data_descriptions)

5641

In [20]:
# Prefixes of the eight metrics, in the order in which they follow the first
# '/'-separated field of the stored `cvssv3` string.
_METRIC_PREFIXES = ('AV:', 'AC:', 'PR:', 'UI:', 'S:', 'C:', 'I:', 'A:')


def _parse_cvss_vector(vector: str) -> List:
    """Split a stored `cvssv3` string into its metric values and score.

    The string is expected to contain at least ten '/'-separated fields: a
    leading field that is not used here, the eight metrics listed in
    `_METRIC_PREFIXES` (in that order), and a `Score:<number>` field.

    Args:
        vector: the raw `cvssv3` value of one document.

    Returns:
        `[av, ac, pr, ui, s, c, i, a, score]`; the metric values are strings
        with their prefix removed, the score is a float.

    Raises:
        ValueError: if the string has fewer than ten fields, or the score
            field cannot be converted to a float.
    """
    fields = vector.split('/')
    if len(fields) < 10:
        # Name the offending value instead of failing with a bare IndexError.
        raise ValueError(f'Unexpected cvssv3 format: {vector!r}')
    metrics = [
        fields[position].replace(prefix, '')
        for position, prefix in enumerate(_METRIC_PREFIXES, start=1)
    ]
    return metrics + [float(fields[9].replace('Score:', ''))]


data = data_descriptions
# Rows are built in a comprehension so that no per-row temporaries (notably
# `id`, which would shadow the builtin for every later cell) are left behind
# at notebook scope.
rows = [
    [doc['_id'], doc['text'], *_parse_cvss_vector(doc['cvssv3'])]
    for doc in data
]

df = pd.DataFrame(rows, columns=['id', 'text', 'av', 'ac', 'pr', 'ui', 's', 'c', 'i', 'a', 'score'])
# NOTE(review): a later cell reads 'dataset/full_dataset_descriptions_2022.csv',
# not the file written here — confirm which path is intended.
df.to_csv('descriptions_2022.csv', index=False)
df.head()

Unnamed: 0,id,text,av,ac,pr,ui,s,c,i,a,score
0,CVE-2022-0001,Non-transparent sharing of branch predictor se...,L,L,L,N,C,H,N,N,6.5
1,CVE-2022-0002,Non-transparent sharing of branch predictor wi...,L,L,L,N,C,H,N,N,6.5
2,CVE-2022-0011,PAN-OS software provides options to exclude sp...,N,L,L,N,U,N,H,N,6.5
3,CVE-2022-0012,An improper link resolution before file access...,L,L,L,N,U,N,H,H,7.1
4,CVE-2022-0013,A file information exposure vulnerability exis...,L,L,L,N,U,H,N,N,5.5


In [21]:
# Checkpoint whose tokenizer is used. Alternative names kept for reference:
#   'prajjwal1/bert-medium'
#   'prajjwal1/bert-small'
model = 'distilbert-base-uncased'

# Keep only what follows the last '/', i.e. the name without any
# organisation prefix; it is used as a directory name when saving.
model_name = model.rpartition('/')[2]
model_name

'distilbert-base-uncased'

In [22]:
# Load the CVSS-annotated descriptions from disk and report the table size.
csv_path = 'dataset/full_dataset_descriptions_2022.csv'
df: DataFrame = pd.read_csv(csv_path)
print(f'Dataset shape: {df.shape}')
df.head()

Dataset shape: (5641, 11)


Unnamed: 0,id,text,av,ac,pr,ui,s,c,i,a,score
0,CVE-2022-0001,Non-transparent sharing of branch predictor se...,L,L,L,N,C,H,N,N,6.5
1,CVE-2022-0002,Non-transparent sharing of branch predictor wi...,L,L,L,N,C,H,N,N,6.5
2,CVE-2022-0011,PAN-OS software provides options to exclude sp...,N,L,L,N,U,N,H,N,6.5
3,CVE-2022-0012,An improper link resolution before file access...,L,L,L,N,U,N,H,H,7.1
4,CVE-2022-0013,A file information exposure vulnerability exis...,L,L,L,N,U,H,N,N,5.5


In [23]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['id', 'text', 'av', 'ac', 'pr', 'ui', 's', 'c', 'i', 'a', 'score'], dtype='object')

In [24]:
# Wrap the DataFrame in a `datasets.Dataset` so it can be mapped over by the tokenizer.
test_dataset = Dataset.from_pandas(df)

In [25]:
# Load the tokenizer for the checkpoint named in `model`.
tokenizer = AutoTokenizer.from_pretrained(model)

def tokenize_function(sample):
    """Tokenize the `text` entry of `sample` to a fixed length of 512.

    The text is handed to the module-level `tokenizer` with padding up to
    `max_length` and truncation enabled, and the tokenizer's result is
    returned unchanged.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(sample["text"], **tokenizer_options)


# Tokenize every description; batched=True passes batches of rows to the function.
tokenized_test_set_2022 = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

In [26]:
# Write the tokenized dataset as JSON into a directory named after the model.
tokenized_test_set_2022.to_json(f'dataset/{model_name}/descriptions_2022.json')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

15325580