# ReadMe
The objective of this project is to read the National Vulnerability Database's (NVD) feed of Common Vulnerabilities and Exposures (CVEs) and create a tool
that is used as part of a data pipeline to determine how similar CVEs are to one another.

The first goal of such a pipeline is to clean and tokenize the description data. The second goal is to read and tokenize the data that will be used as training labels.

In [1]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488513 sha256=969086f1c8b3abf02ed31fc806c8c0ce1fbcec2faf9d1d2fcbbc4143f14ef924
  Stored in directory: /home/rsandor/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built 

In [23]:
import os, re
import pyspark
import urllib.request
import zipfile
from pyspark.sql import SparkSession
from pyspark.sql.functions import  col, concat_ws, explode, countDistinct, udf, split
from pyspark.sql.types import StringType, ArrayType


In [24]:
# Directory that holds the downloaded NVD feeds; created on first run.
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# NVD JSON 1.1 feeds: one archive per year from 2002 through 2024, followed
# by the rolling "recent" feed.
fileUrls = [
    f"https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{feed}.json.zip"
    for feed in [*range(2002, 2025), "recent"]
]

# Download and unpack every feed whose extracted JSON is not already on disk.
for url in fileUrls:
    filename = url.split("/")[-1]
    outputfile = os.path.join(data_dir, filename)
    # The existence check assumes the archive "<name>.json.zip" unpacks to a
    # file called "<name>.json", i.e. the archive name minus its ".zip" suffix.
    checkfile = os.path.join(data_dir, os.path.splitext(filename)[0])

    # Skip feeds that an earlier run already extracted.
    if os.path.exists(checkfile):
        continue

    try:
        urllib.request.urlretrieve(url, outputfile)
        print(f"Downloaded: {filename}")

        with zipfile.ZipFile(outputfile, "r") as zip_ref:
            zip_ref.extractall(data_dir)
    except BaseException:
        # A partially extracted JSON would pass the existence check on the
        # next run and be treated as complete, so discard it before
        # re-raising. The file did not exist before this iteration.
        if os.path.exists(checkfile):
            os.remove(checkfile)
        raise
    finally:
        # The archive is only a transport format: remove it whether or not
        # the download/extraction succeeded so no truncated zip is left behind.
        if os.path.exists(outputfile):
            os.remove(outputfile)



In [25]:
# Session-level settings applied to the builder below, in this order.
_session_options = {
    "spark.driver.host": "127.0.0.1",
    "spark.driver.bindAddress": "127.0.0.1",
    "spark.default.parallelism": 8,
}

# Local-mode session named "voltcve"; reuses an existing session if one is
# already running in this process.
_builder = SparkSession.builder.master("local[*]").appName("voltcve")
for _option, _setting in _session_options.items():
    _builder = _builder.config(_option, _setting)
spark = _builder.getOrCreate()
# Load the 2020 feed; the "multiline" option lets the reader parse a JSON
# document that spans many lines.
_reader = spark.read.option("multiline", "true")
cves = _reader.json("data/nvdcve-1.1-2020.json")

# Turn the "CVE_Items" array into one row per CVE entry, exposed as the
# struct column "cves".
_cve_items = explode(col("CVE_Items")).alias("cves")
exploded = cves.select(_cve_items)
exploded.show()

# Keep only the CVE identifier and its description values, then join the
# description values into a single space-separated string.
_cve_id = col("cves.cve.CVE_data_meta.ID").alias("id")
_description_values = col(
    "cves.cve.description.description_data.value"
).alias("description")
descr_df = exploded.select(_cve_id, _description_values).withColumn(
    "description_single", concat_ws(" ", col("description"))
)

# Report how many distinct CVE ids were loaded.
_count_row = descr_df.selectExpr("count(distinct id)").first()
doc_count = _count_row[0]
print("Number of docs: {}".format(doc_count))

                                                                                

+--------------------+
|                cves|
+--------------------+
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
|{{4.0, [{[], [{cp...|
+--------------------+
only showing top 20 rows



[Stage 2:>                                                          (0 + 1) / 1]

Number of docs: 20453


                                                                                

In [27]:
def string_cleaner2(input_str):
    """Normalise one description string into lowercase, de-punctuated text.

    This is the plain-Python implementation; ``string_cleaner`` below is the
    Spark UDF built from it. Keeping the two separate lets the logic be
    called directly on a ``str`` (a UDF call returns a Column, not a string).

    Steps, in order:
      1. Replace each run of ``.``, ``:`` or ``,`` that is followed by
         whitespace (together with that whitespace) with a single space.
      2. Drop one period at the very end of the text.
      3. Delete apostrophes, double quotes, the markers ``(TM)`` and ``(R)``,
         and any remaining parenthesis characters (the text inside other
         parentheses is kept).
      4. Strip leading/trailing whitespace and lowercase the result.

    Args:
        input_str: The raw description text, or None.

    Returns:
        The cleaned text, or None when ``input_str`` is None.
    """
    # Spark hands null cells to Python UDFs as None; re.sub would raise on it.
    if input_str is None:
        return None

    cleaned_text = re.sub(r"[.:,]+\s+", " ", input_str)
    cleaned_text = re.sub(r"\.$", "", cleaned_text)
    cleaned_text = re.sub(r"\'|\(TM\)|\(R\)|\(|\)|\"", "", cleaned_text)
    return cleaned_text.strip().lower()


# Column-level wrapper used in DataFrame expressions.
string_cleaner = udf(string_cleaner2, StringType())


# Quick sanity check of the pure-Python cleaner.
test = "Beautiful day."
print(string_cleaner2(test))

# One row per (id, token). Splitting on whitespace runs rather than a single
# space avoids empty tokens where the cleaner left consecutive spaces (for
# example after deleting "(TM)").
clean_tokens = descr_df.withColumn(
    "tokens",
    explode(split(string_cleaner(col("description_single")), r"\s+")),
)
clean_tokens = clean_tokens.select("id", "tokens")
# An empty description still yields a single "" token; drop it. The
# comparison evaluates to null for null tokens, so those are dropped as well.
clean_tokens = clean_tokens.filter(col("tokens") != "")

# Show the resulting DataFrames
descr_df.show()

clean_tokens.show()


beautiful day


                                                                                

+-------------+--------------------+--------------------+
|           id|         description|  description_single|
+-------------+--------------------+--------------------+
|CVE-2020-0001|[In getProcessRec...|In getProcessReco...|
|CVE-2020-0002|[In ih264d_init_d...|In ih264d_init_de...|
|CVE-2020-0003|[In onCreate of I...|In onCreate of In...|
|CVE-2020-0004|[In generateCrop ...|In generateCrop o...|
|CVE-2020-0005|[In btm_read_remo...|In btm_read_remot...|
|CVE-2020-0006|[In rw_i93_send_c...|In rw_i93_send_cm...|
|CVE-2020-0007|[In flattenString...|In flattenString8...|
|CVE-2020-0008|[In LowEnergyClie...|In LowEnergyClien...|
|CVE-2020-0009|[In calc_vm_may_f...|In calc_vm_may_fl...|
|CVE-2020-0010|[In fpc_ta_get_bu...|In fpc_ta_get_bui...|
|CVE-2020-0011|[In get_auth_resu...|In get_auth_resul...|
|CVE-2020-0012|[In fpc_ta_pn_get...|In fpc_ta_pn_get_...|
|CVE-2020-0014|[It is possible f...|It is possible fo...|
|CVE-2020-0015|[In onCreate of C...|In onCreate of Ce...|
|CVE-2020-0016

[Stage 11:>                                                         (0 + 1) / 1]

+-------------+--------------------+
|           id|              tokens|
+-------------+--------------------+
|CVE-2020-0001|                  in|
|CVE-2020-0001|getprocessrecordl...|
|CVE-2020-0001|                  of|
|CVE-2020-0001|activitymanagerse...|
|CVE-2020-0001|            isolated|
|CVE-2020-0001|                apps|
|CVE-2020-0001|                 are|
|CVE-2020-0001|                 not|
|CVE-2020-0001|             handled|
|CVE-2020-0001|           correctly|
|CVE-2020-0001|                this|
|CVE-2020-0001|               could|
|CVE-2020-0001|                lead|
|CVE-2020-0001|                  to|
|CVE-2020-0001|               local|
|CVE-2020-0001|          escalation|
|CVE-2020-0001|                  of|
|CVE-2020-0001|           privilege|
|CVE-2020-0001|                with|
|CVE-2020-0001|                  no|
+-------------+--------------------+
only showing top 20 rows



                                                                                

In [16]:
# Stop the Spark session created earlier in this notebook.
spark.stop()