# Creating Inputs for Network Visualization in D3.js and Gephi
## Using the NIST National Vulnerability Database

We first import the packages needed to process the NIST National Vulnerability Database (NVD) feed for 2015. The XML file is available here: [NIST NVD Website](https://nvd.nist.gov/download.cfm#CVE_FEED). The packages used are:
    1. re for parsing the xml with regex
    2. csv for exporting to comma-separated values (CSV) files
    3. json for exporting to JSON
    4. datetime for determining week number of year and day number of week from timestamp
    5. (Optional) tqdm for creating a cheap progress bar on an iterable.

In [1]:
import re, csv, json, datetime
from tqdm import *

Define a function that parses the NIST NVD XML file and extracts the relevant information. More fields are extracted than are needed for Gephi or the D3.js visualization. The function returns a dictionary keyed by vulnerability ID (the CVE identifier defined by https://cve.mitre.org/). All additional information is stored in a nested dictionary for that particular vulnerability.

In [2]:
def _tag_text(ln):
    """Return the text between the first '>' and the last '<' on ln, or None."""
    found = re.search(r'>(.*)<', ln)
    return found.group(1) if found else None


def _append_unique(items, value):
    """Append value to the list items unless it is already present."""
    if value not in items:
        items.append(value)


def pull_nvd(fname):
    """Parse an NVD 2.0 XML feed line by line into a dict keyed by CVE id.

    The file is scanned with substring checks and regular expressions rather
    than an XML parser, so every element of interest is expected to sit on a
    single line.

    Args:
        fname: Path to the NVD XML file.

    Returns:
        A dict mapping each CVE id (e.g. 'CVE-2015-0001') to a dict that may
        contain the following keys, depending on what the entry provides:

        * 'vuln_os': list of unique third CPE components (the vendor field of
          'cpe:/part:vendor:product:...'), underscores replaced by spaces.
        * 'vuln_os_plat': list of unique 'vendor product' strings.
        * 'vuln_date': (ISO year, ISO week, ISO weekday) tuple of the
          published date.
        * 'vuln_score': the CVSS score, kept as a string.
        * 'vuln_vector', 'vuln_compl', 'vuln_auth', 'vuln_confid',
          'vuln_integ', 'vuln_avail', 'vuln_summ', 'vuln_source': element
          text of the corresponding tags.
        * 'vuln_link': the href of the last reference seen for the entry.

        The two platform lists are de-duplicated independently and are
        therefore not index-aligned with each other.
    """
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # Tags whose element text is stored verbatim under the given key.
    text_fields = (
        ('<cvss:access-vector>', 'vuln_vector'),
        ('<cvss:access-complexity>', 'vuln_compl'),
        ('<cvss:authentication>', 'vuln_auth'),
        ('<cvss:confidentiality-impact>', 'vuln_confid'),
        ('<cvss:integrity-impact>', 'vuln_integ'),
        ('<cvss:availability-impact>', 'vuln_avail'),
        ('<vuln:summary>', 'vuln_summ'),
        ('<vuln:source>', 'vuln_source'),
    )

    print('Loading NVD Dataset')
    nvd_dict = {}
    entry = None  # dict of the entry currently being filled, if any
    with open(fname) as f:
        for ln in progress(f):
            if 'entry id=' in ln:
                # CVE sequence numbers may have more than four digits, so
                # capture all of them to keep ids from colliding.
                found = re.search(r'CVE-\d{4}-\d{4,}', ln)
                if found is None:
                    # Unrecognised id: do not attribute the following fields
                    # to the previous entry.
                    entry = None
                else:
                    entry = {}
                    nvd_dict[found.group(0)] = entry
                continue

            if entry is None:
                # Field lines outside of a recognised entry are ignored.
                continue

            if 'cpe-lang:fact-ref name=' in ln:
                found = re.search(r'name="([^"]*)"', ln)
                parts = found.group(1).split(':') if found else []
                if len(parts) < 3:
                    continue
                # parts looks like ['cpe', '/o', vendor, product, ...].
                vendor = parts[2].replace('_', ' ')
                product = parts[3].replace('_', ' ') if len(parts) > 3 else ''
                platform = (vendor + ' ' + product).strip()
                # Accumulate across all fact-ref lines of the entry instead
                # of keeping only the last one.
                _append_unique(entry.setdefault('vuln_os', []), vendor)
                _append_unique(entry.setdefault('vuln_os_plat', []), platform)
            elif '<vuln:published-datetime>' in ln:
                # Accept any year, not only 2015.
                found = re.search(r'(\d{4})-(\d{2})-(\d{2})', ln)
                if found:
                    year, month, day = [int(part) for part in found.groups()]
                    published = datetime.date(year, month, day)
                    entry['vuln_date'] = tuple(published.isocalendar())
            elif '<cvss:score>' in ln:
                found = re.search(r'\d+(?:\.\d+)?', ln)
                if found:
                    entry['vuln_score'] = found.group(0)
            elif 'vuln:reference href=' in ln:
                # Stop at the closing quote of href so that later attributes
                # on the same tag are not swallowed into the link.
                found = re.search(r'href="([^"]*)"', ln)
                if found:
                    entry['vuln_link'] = found.group(1)
            else:
                for marker, key in text_fields:
                    if marker in ln:
                        value = _tag_text(ln)
                        if value is not None:
                            entry[key] = value
                        break
    return nvd_dict


Now run the defined function on the xml file extracted to the working directory.

In [3]:
# Parse the XML feed in the working directory into a dict keyed by CVE id;
# every later cell reads from this `nvd` dictionary.
nvd = pull_nvd('./nvdcve-2.0-2015.xml')

Loading NVD Dataset


Create a timeline for 2015 so far by computing, for each date, both the unweighted count of vulnerabilities and the total weighted by the CVSS score provided.

In [None]:
# Create timeline of 2015
# nvd_timeline[0] is the score-weighted series and nvd_timeline[1] the plain
# count series; both hold one row per date and are index-aligned.
nvd_timeline = [[], []]
# Maps each date to its row index so a CVE is folded in without rescanning
# the whole timeline.
slot_by_date = {}
for cve in nvd:
    record = nvd[cve]
    # pull_nvd stores the published date under 'vuln_date' as an
    # (ISO year, ISO week, ISO weekday) triple; skip entries lacking either
    # the date or the score.
    if 'vuln_score' not in record or 'vuln_date' not in record:
        continue
    date_key = tuple(record['vuln_date'])
    if date_key not in slot_by_date:
        slot_by_date[date_key] = len(nvd_timeline[0])
        nvd_timeline[0].append({'Method': 'Weighted', 'Date': date_key, 'Value': 0.0})
        nvd_timeline[1].append({'Method': 'Count', 'Date': date_key, 'Value': 0})
    slot = slot_by_date[date_key]
    nvd_timeline[0][slot]['Value'] += float(record['vuln_score'])
    nvd_timeline[1][slot]['Value'] += 1

# Export to json
# json.dump writes text, so open in text mode (works on Python 2 and 3).
with open('./nvd_timeline.json', 'w') as f:
    json.dump(nvd_timeline, f)

Given the parsed dictionary, we create a dictionary of nodes and edges with the additional week number, day number, and threat score loaded. Then given the dictionary, export to JSON file.

In [None]:
# create Node-Edge JSON
# Nodes are CVEs (type 1) and affected platforms (type 0); each link joins a
# CVE to one of its platforms. 'group' is the ISO weekday and 'week' the ISO
# week taken from the entry's 'vuln_date' triple.
nvd_json = {'nodes': [], 'links': []}
# (name, group, week) of the platform nodes already emitted, so duplicates
# are detected without rescanning the node list.
seen_platforms = set()
for cve in nvd:
    record = nvd[cve]
    if 'vuln_os' not in record or 'vuln_date' not in record:
        continue
    weekday = record['vuln_date'][2]
    week = record['vuln_date'][1]
    # Entries without a CVSS score get a threat of 0.0, matching the default
    # used when building the Gephi edge list.
    threat = float(record.get('vuln_score', 0.0))
    nvd_json['nodes'].append({'name': cve, 'group': weekday, 'week': week, 'threat': threat, 'type': 1})
    for platform in record['vuln_os']:
        platform_key = (platform, weekday, week)
        if platform_key not in seen_platforms:
            seen_platforms.add(platform_key)
            nvd_json['nodes'].append({'name': platform, 'group': weekday, 'week': week, 'threat': 0, 'type': 0})
        nvd_json['links'].append({'source': cve, 'target': platform, 'group': weekday, 'week': week, 'threat': threat})

# Export to json
# json.dump writes text, so open in text mode (works on Python 2 and 3).
with open('./nvd.json', 'w') as f:
    json.dump(nvd_json, f)


Now we build inputs for Gephi. Gephi requires an edge list and (optionally) a node list (if unconnected nodes exist). We create list arrays with header filled in. Then populate each row with either the edge or node. Then write both files to csv files in working directory.

In [None]:
# Build exact edge list
# Each edge joins a vulnerability "type" (the tuple of CVSS vector,
# complexity, authentication, confidentiality and integrity values) to an
# affected platform, carrying the CVSS score as its value.
import sys

cvss_fields = ('vuln_vector', 'vuln_compl', 'vuln_auth', 'vuln_confid', 'vuln_integ')
nvd_edge = [['source', 'target', 'value']]
seen_edges = set()
for cve in nvd:
    record = nvd[cve]
    if 'vuln_os_plat' not in record:
        continue
    # Entries without CVSS data are kept with a 0.0 threat; their missing
    # metric fields are labelled 'UNKNOWN' instead of raising KeyError.
    threat = float(record.get('vuln_score', 0.0))
    cve_type = tuple(record.get(field, 'UNKNOWN') for field in cvss_fields)
    for plat in record['vuln_os_plat']:
        edge_key = (cve_type, plat, threat)
        if edge_key not in seen_edges:
            seen_edges.add(edge_key)
            nvd_edge.append([cve_type, plat, threat])

# # Build exact node list
nvd_node = [['id', 'value', 'type']]
seen_nodes = set()
# Skip the header row of nvd_edge so it does not turn into bogus nodes.
for source, target, value in nvd_edge[1:]:
    for node in ((source, value, 0), (target, 0.0, 1)):
        if node not in seen_nodes:
            seen_nodes.add(node)
            nvd_node.append(list(node))

# Export to CSV for Gephi
# The csv module needs a binary file on Python 2 and a text file opened with
# newline='' on Python 3.
if sys.version_info[0] < 3:
    def open_csv(path):
        return open(path, 'wb')
else:
    def open_csv(path):
        return open(path, 'w', newline='')

with open_csv('./nvd_edge.csv') as csvfile:
    writeme = csv.writer(csvfile, delimiter=',')
    writeme.writerows(nvd_edge)

with open_csv('./nvd_node.csv') as csvfile:
    writeme = csv.writer(csvfile, delimiter=',')
    writeme.writerows(nvd_node)