# Create the full dataset
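* Fetch the raw CVE descriptions and scraped reference texts from MongoDB
* Split each CVSS v3 string into its individual metrics and save the result as a CSV file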

In [1]:
from pathlib import Path
from typing import List

import pandas as pd
from pymongo import MongoClient
import seaborn as sns

# Handles for the data source: client -> "nvd" database -> "nvd_all" collection.
mongo_client = MongoClient("127.0.0.1")
mongo_db = mongo_client["nvd"]
mongo_collection = mongo_db["nvd_all"]

# Apply seaborn's "white" style to plots drawn in this notebook.
sns.set_style(style="white")

In [2]:
# First stage of both pipelines: discard documents whose `cvssv3` field is the
# literal string 'None' or an empty string.
# NOTE(review): per MongoDB's `$nin` semantics, documents that have no `cvssv3`
# field at all also pass this filter — confirm that none exist in the collection.
has_cvss_stage = {'$match': {'cvssv3': {'$nin': ['None', '']}}}

# --- Pipeline 1: CVE descriptions -------------------------------------------
description_pipeline = [
    has_cvss_stage,
    # Unwind `description` (yields one document per element if it is an array).
    {'$unwind': {'path': '$description'}},
    # Exclude the fields that are not used further on.
    {'$project': dict.fromkeys(
        ['year', 'reference_data', 'cwe', 'cvssv2', 'cpe', 'references'], 0)},
    # Keep `cvssv3` (and the implicit `_id`) and expose the description as `text`.
    {'$project': {'text': '$description', 'cvssv3': 1}},
]
data_descriptions: List = list(mongo_collection.aggregate(description_pipeline))

# --- Pipeline 2: scraped reference texts ------------------------------------
reference_pipeline = [
    has_cvss_stage,
    # Unwind `reference_data` so each reference entry becomes its own document.
    {'$unwind': {'path': '$reference_data'}},
    # Lift the fields of the reference entry to the top level, next to the
    # fields of the enclosing document.
    {'$replaceRoot': {'newRoot': {'$mergeObjects': ['$$ROOT', '$reference_data']}}},
    # Only references that carry a `text_selenium` field are kept.
    {'$match': {'text_selenium': {'$exists': 1}}},
    # Exclude the fields that are not used further on.
    {'$project': dict.fromkeys(
        ['year', 'reference_data', 'cwe', 'cvssv2', 'cpe', 'references',
         'description', 'url', 'name', 'refsource', 'scraped_selenium', 'tags'], 0)},
    # Keep `cvssv3` (and the implicit `_id`) and expose `text_selenium` as `text`.
    {'$project': {'text': '$text_selenium', 'cvssv3': 1}},
]
data_references: List = list(mongo_collection.aggregate(reference_pipeline))

In [3]:
# Number of documents collected in `data_descriptions`.
len(data_descriptions)

5641

In [4]:
# Number of documents collected in `data_references`.
# NOTE(review): the recorded output is 0, i.e. no reference texts end up in the
# dataset — confirm whether `text_selenium` is actually populated in the collection.
len(data_references)

0

In [5]:
# Base-metric keys in the order of the output columns av, ac, pr, ui, s, c, i, a.
CVSS_METRICS = ('AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A')
OUTPUT_CSV = Path('dataset/full_dataset_combined.csv')


def parse_cvss_vector(cvss: str) -> list:
    """Split a stored CVSS v3 string into its eight base metrics and the score.

    The string is expected to consist of '/'-separated ``KEY:value`` components,
    e.g. ``<prefix>/AV:L/AC:L/PR:L/UI:N/S:C/C:H/I:N/A:N/Score:6.5``. Components
    are looked up by key instead of by position, so a reordered or truncated
    string is reported as an error rather than silently yielding mislabelled
    columns. Components without a ':' and unknown keys (such as the leading
    prefix) are ignored.

    Args:
        cvss: the stored CVSS string.

    Returns:
        ``[av, ac, pr, ui, s, c, i, a, score]`` with ``score`` as a float.

    Raises:
        ValueError: if a metric or the score is missing, or the score is not
            a valid number.
    """
    components = {}
    for part in cvss.split('/'):
        key, sep, value = part.partition(':')
        if sep:
            components[key] = value

    missing = [key for key in (*CVSS_METRICS, 'Score') if key not in components]
    if missing:
        raise ValueError(f"CVSS string {cvss!r} is missing {missing}")

    return [components[key] for key in CVSS_METRICS] + [float(components['Score'])]


rows = []
data = data_descriptions + data_references
for row in data:
    # `cve_id` instead of `id`, so the builtin `id()` is not shadowed in the kernel.
    cve_id = row['_id']
    try:
        metrics = parse_cvss_vector(row['cvssv3'])
    except ValueError as err:
        # Name the offending document so a bad record can be located in MongoDB.
        raise ValueError(f"{cve_id}: {err}") from err
    rows.append([cve_id, row['text'], *metrics])

df = pd.DataFrame(rows, columns=['id', 'text', 'av', 'ac', 'pr', 'ui', 's', 'c', 'i', 'a', 'score'])

# Create the output directory on a fresh checkout; `to_csv` does not create it.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)
df.head()

Unnamed: 0,id,text,av,ac,pr,ui,s,c,i,a,score
0,CVE-2022-0001,Non-transparent sharing of branch predictor se...,L,L,L,N,C,H,N,N,6.5
1,CVE-2022-0002,Non-transparent sharing of branch predictor wi...,L,L,L,N,C,H,N,N,6.5
2,CVE-2022-0011,PAN-OS software provides options to exclude sp...,N,L,L,N,U,N,H,N,6.5
3,CVE-2022-0012,An improper link resolution before file access...,L,L,L,N,U,N,H,H,7.1
4,CVE-2022-0013,A file information exposure vulnerability exis...,L,L,L,N,U,H,N,N,5.5


In [6]:
# Sanity check: (rows, columns) of the frame that was written to CSV.
df.shape

(5641, 11)