# Project Review 3 - Multiple Regression of  CVSS v3 Metrics

### Goals
To predict how accurate **CVSS v3** metrics are at determining vulnerability base scores.

In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import glob
import random
from pandas.io.json import json_normalize
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#### First I'm looking at one .json file to determine which which columns are needed.

In [9]:
filename = '../../../ipynb/data/nvd/nvdcve-1.1-2017.json'
df = pd.read_json(filename)
df = json_normalize(df['CVE_Items'])
df.head()

Unnamed: 0,publishedDate,lastModifiedDate,cve.data_type,cve.data_format,cve.data_version,cve.CVE_data_meta.ID,cve.CVE_data_meta.ASSIGNER,cve.problemtype.problemtype_data,cve.references.reference_data,cve.description.description_data,...,impact.baseMetricV2.cvssV2.availabilityImpact,impact.baseMetricV2.cvssV2.baseScore,impact.baseMetricV2.severity,impact.baseMetricV2.exploitabilityScore,impact.baseMetricV2.impactScore,impact.baseMetricV2.obtainAllPrivilege,impact.baseMetricV2.obtainUserPrivilege,impact.baseMetricV2.obtainOtherPrivilege,impact.baseMetricV2.userInteractionRequired,impact.baseMetricV2.acInsufInfo
0,2017-03-17T00:59Z,2019-10-03T00:03Z,CVE,MITRE,4.0,CVE-2017-0001,cve@mitre.org,"[{'description': [{'lang': 'en', 'value': 'NVD...",[{'url': 'http://www.securityfocus.com/bid/960...,"[{'lang': 'en', 'value': 'The Graphics Device ...",...,COMPLETE,7.2,HIGH,3.9,10.0,False,False,False,False,
1,2017-01-10T21:59Z,2019-10-03T00:03Z,CVE,MITRE,4.0,CVE-2017-0002,cve@mitre.org,"[{'description': [{'lang': 'en', 'value': 'NVD...",[{'url': 'http://www.securityfocus.com/bid/952...,"[{'lang': 'en', 'value': 'Microsoft Edge allow...",...,PARTIAL,6.8,MEDIUM,8.6,6.4,False,False,False,True,
2,2017-01-10T21:59Z,2018-10-12T22:15Z,CVE,MITRE,4.0,CVE-2017-0003,cve@mitre.org,"[{'description': [{'lang': 'en', 'value': 'CWE...",[{'url': 'http://fortiguard.com/advisory/FG-VD...,"[{'lang': 'en', 'value': 'Microsoft Word 2016 ...",...,COMPLETE,9.3,HIGH,8.6,10.0,False,False,False,True,
3,2017-01-10T21:59Z,2018-10-12T22:15Z,CVE,MITRE,4.0,CVE-2017-0004,cve@mitre.org,"[{'description': [{'lang': 'en', 'value': 'CWE...",[{'url': 'http://www.securityfocus.com/bid/953...,"[{'lang': 'en', 'value': 'The Local Security A...",...,COMPLETE,7.8,HIGH,10.0,6.9,False,False,False,False,True
4,2017-03-17T00:59Z,2019-10-03T00:03Z,CVE,MITRE,4.0,CVE-2017-0005,cve@mitre.org,"[{'description': [{'lang': 'en', 'value': 'CWE...",[{'url': 'http://www.securityfocus.com/bid/960...,"[{'lang': 'en', 'value': 'The Graphics Device ...",...,COMPLETE,6.9,MEDIUM,3.4,10.0,False,False,False,False,


In [10]:
for col in df.columns: 
    print(col)

publishedDate
lastModifiedDate
cve.data_type
cve.data_format
cve.data_version
cve.CVE_data_meta.ID
cve.CVE_data_meta.ASSIGNER
cve.problemtype.problemtype_data
cve.references.reference_data
cve.description.description_data
configurations.CVE_data_version
configurations.nodes
impact.baseMetricV3.cvssV3.version
impact.baseMetricV3.cvssV3.vectorString
impact.baseMetricV3.cvssV3.attackVector
impact.baseMetricV3.cvssV3.attackComplexity
impact.baseMetricV3.cvssV3.privilegesRequired
impact.baseMetricV3.cvssV3.userInteraction
impact.baseMetricV3.cvssV3.scope
impact.baseMetricV3.cvssV3.confidentialityImpact
impact.baseMetricV3.cvssV3.integrityImpact
impact.baseMetricV3.cvssV3.availabilityImpact
impact.baseMetricV3.cvssV3.baseScore
impact.baseMetricV3.cvssV3.baseSeverity
impact.baseMetricV3.exploitabilityScore
impact.baseMetricV3.impactScore
impact.baseMetricV2.cvssV2.version
impact.baseMetricV2.cvssV2.vectorString
impact.baseMetricV2.cvssV2.accessVector
impact.baseMetricV2.cvssV2.accessComplexit

#### Only the columns with .baseMetricV3... are needed.

In [11]:
df = df.iloc[:,[14,15,16,17,18,19,20,21,22,23,24,25]]
df.columns = ['attackVector',
              'attackComplexity',
              'privilegesRequired',
              'userInteraction',
              'scope', 
              'confidentialityImpact',
              'integrityImpact', 
              'availabilityImpact',
              'baseScore', 
              'baseSeverity',
              'exploitabilityScore',
              'impactScore']
df.head()

Unnamed: 0,attackVector,attackComplexity,privilegesRequired,userInteraction,scope,confidentialityImpact,integrityImpact,availabilityImpact,baseScore,baseSeverity,exploitabilityScore,impactScore
0,LOCAL,LOW,LOW,NONE,UNCHANGED,HIGH,HIGH,HIGH,7.8,HIGH,1.8,5.9
1,NETWORK,LOW,NONE,REQUIRED,UNCHANGED,HIGH,HIGH,HIGH,8.8,HIGH,2.8,5.9
2,LOCAL,LOW,NONE,REQUIRED,UNCHANGED,HIGH,HIGH,HIGH,7.8,HIGH,1.8,5.9
3,NETWORK,LOW,NONE,NONE,UNCHANGED,NONE,NONE,HIGH,7.5,HIGH,3.9,3.6
4,LOCAL,HIGH,LOW,NONE,UNCHANGED,HIGH,HIGH,HIGH,7.0,HIGH,1.0,5.9


In [12]:
del df

#### Now we know which columns are needed, we can read in all the data. These functions read in the .json files and encode the columns.

In [13]:
def import_encode_nvd():
    path = '../../../ipynb/data/nvd/'
    all_nvds = glob.glob(path+'*.json')
    l = []
    
    for filename in all_nvds:
        nvd_df = pd.read_json(filename)
        nvd_df = json_normalize(nvd_df['CVE_Items'])
        
        features = nvd_df.iloc[:, [14,15,16,17,18,19,20,21,22,23,24,25]]
        features.columns = ['attackVector', 
                            'attackComplexity', 
                            'privilegesRequired', 
                            'userInteraction', 
                            'scope', 
                            'confidentialityImpact', 
                            'integrityImpact', 
                            'availabilityImpact', 
                            'baseScore', 
                            'baseSeverity', 
                            'exploitabilityScore', 
                            'impactScore']
        features = features.dropna()
        features = encode_col(features, features.columns)
        l.append(features)
    
    return pd.concat(l, ignore_index=True, sort=False)


labelEncoder = LabelEncoder()

def encode_col(df, columns):
    for col in columns:
        if col != 'baseScore':
            labelEncoder.fit(df[col])
            df[col] = labelEncoder.transform(df[col])
        
    return df

In [14]:
v3_metrics = import_encode_nvd()
v3_metrics.head()

Unnamed: 0,attackVector,attackComplexity,privilegesRequired,userInteraction,scope,confidentialityImpact,integrityImpact,availabilityImpact,baseScore,baseSeverity,exploitabilityScore,impactScore
0,1,1,1,0,1,0,0,0,7.8,1,15,13
1,2,1,2,1,1,0,0,0,8.8,1,21,13
2,1,1,2,1,1,0,0,0,7.8,1,15,13
3,2,1,2,0,1,2,2,0,7.5,1,23,4
4,1,0,1,0,1,0,0,0,7.0,1,7,13


#### Split the data:
* 70% as training data.
* 30% as testing data.

In [15]:
X = v3_metrics.drop('baseScore', axis=1)
y = v3_metrics[['baseScore']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

#### Select a Linear Regression estimator and train it using the training data.

In [16]:
model = LinearRegression();
model.fit(X_train, y_train);

#### Evaluating the accuracy of the model.

In [17]:
y_pred = model.predict(X_test)

# root mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# r-squared
r2 = r2_score(y_test, y_pred)

* The root mean squared error measures the average distance between the predicted values and the actual values. The smaller the value, the closer the fit.
* The r-squared value is a mesure of how close the data points are to the fitted line.

In [18]:
print('root mean squared error =', rmse)
print('r-squared =', r2)

root mean squared error = 1.1319711302169158
r-squared = 0.7041150221249005


* The root mean squared error tells us that, with the given metrics, the model can (on average) predict the base score within about plus/minus 1.3 units of the actual base score.
* About 70.6% of the data fits the regression model.

#### Let's use the model to make some predictions.

In [19]:
# lets make 15 predictions and see how close the model is
for i in range(15):
    index = random.randint(0,X.shape[0])
    metrics = [X.iloc[index].values]
    pred = model.predict(metrics)[0][0]
    actual = y.iloc[index].values[0]
    
    print('Predicted:', pred, '\tActual:', actual)

Predicted: 9.347999714810149 	Actual: 10.0
Predicted: 4.530681263600278 	Actual: 5.3
Predicted: 8.03353838645325 	Actual: 8.6
Predicted: 7.049717542386005 	Actual: 6.5
Predicted: 7.142002353743116 	Actual: 7.5
Predicted: 11.318849431298947 	Actual: 10.0
Predicted: 6.747964115071224 	Actual: 5.4
Predicted: 5.802346874418686 	Actual: 6.1
Predicted: 5.034869800051405 	Actual: 5.5
Predicted: 8.59598784851952 	Actual: 8.6
Predicted: 10.616338885555063 	Actual: 10.0
Predicted: 8.75117552757007 	Actual: 8.6
Predicted: 6.844035113941137 	Actual: 6.7
Predicted: 8.439135295908507 	Actual: 10.0
Predicted: 2.934686005470011 	Actual: 4.7


In [20]:
X = sm.add_constant(X)
model = sm.OLS(y,X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              baseScore   R-squared:                       0.705
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                 2.511e+04
Date:                Tue, 03 Dec 2019   Prob (F-statistic):               0.00
Time:                        02:54:53   Log-Likelihood:            -1.7835e+05
No. Observations:              115818   AIC:                         3.567e+05
Df Residuals:                  115806   BIC:                         3.568e+05
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     6.52

  return ptp(axis=axis, out=out, **kwargs)
