In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


%matplotlib inline
import warnings 
# NOTE(review): with no category or module given, this hides every warning issued
# after this point, so deprecation or data-quality warnings will not show up below.
warnings.filterwarnings('ignore')

In [2]:
# Load the CVE dataset from the CSV file `cve_clean` (path is relative to the working directory)
df = pd.read_csv('cve_clean')
# Show (rows, columns) as the cell output
df.shape

(88776, 13)

In [3]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,cve_id,mod_date,pub_date,cvss,cwe_code,cwe_name,summary,access_authentication,access_complexity,access_vector,impact_availability,impact_confidentiality,impact_integrity
0,CVE-2019-2211,2019-11-14 21:36:00,2019-11-13 18:15:00,7.8,89,Improper Neutralization of Special Elements u...,In createProjectionMapForQuery of TvProvider.j...,NONE,LOW,NETWORK,NONE,COMPLETE,NONE
1,CVE-2019-2212,2019-11-14 21:30:00,2019-11-13 18:15:00,4.9,200,Information Exposure,"In poisson_distribution of random, there is an...",NONE,LOW,LOCAL,NONE,COMPLETE,NONE
2,CVE-2019-2213,2019-11-14 21:24:00,2019-11-13 18:15:00,6.9,416,Use After Free,"In binder_free_transaction of binder.c, there ...",NONE,MEDIUM,LOCAL,COMPLETE,COMPLETE,COMPLETE
3,CVE-2019-2214,2019-11-14 21:19:00,2019-11-13 18:15:00,7.2,269,Improper Privilege Management,"In binder_transaction of binder.c, there is a ...",NONE,LOW,LOCAL,COMPLETE,COMPLETE,COMPLETE
4,CVE-2019-18793,2019-11-14 21:14:00,2019-11-13 20:15:00,4.3,79,Improper Neutralization of Input During Web P...,Parallels Plesk Panel 9.5 allows XSS in target...,NONE,MEDIUM,NETWORK,NONE,NONE,PARTIAL


In [4]:
# List every column's dtype and non-null count to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88776 entries, 0 to 88775
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cve_id                  88776 non-null  object 
 1   mod_date                88776 non-null  object 
 2   pub_date                88776 non-null  object 
 3   cvss                    88776 non-null  float64
 4   cwe_code                88776 non-null  int64  
 5   cwe_name                88776 non-null  object 
 6   summary                 88776 non-null  object 
 7   access_authentication   88776 non-null  object 
 8   access_complexity       88776 non-null  object 
 9   access_vector           88776 non-null  object 
 10  impact_availability     88776 non-null  object 
 11  impact_confidentiality  88776 non-null  object 
 12  impact_integrity        88776 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 8.8+ MB


In [5]:
# Hold out 25% of the rows as the final test set (df_test); the fixed random_state makes the split repeatable
df_train, df_test = train_test_split(df, test_size=0.25, random_state=0)

Feature Engineering

In [6]:
# One-hot encode the six categorical access/impact metrics of the training rows;
# the CVSS score column is the regression target.
X = pd.get_dummies(
    df_train[['access_authentication', 'access_vector', 'access_complexity',
              'impact_availability', 'impact_confidentiality', 'impact_integrity']],
    prefix=['access_authentication', 'access_vector', 'access_complexity',
            'impact_availability', 'impact_confidentiality', 'impact_integrity'],
)
y = df_train['cvss'].to_numpy()

In [7]:
# Sanity check: X and y must have the same number of rows
print(X.shape)
print(y.shape)

(66582, 18)
(66582,)


### Building the Model
To build the model, split the df_train dataset into x_train and x_val sets, then train the model (a random forest regressor) on x_train and validate its predictions on x_val.

In [8]:
from sklearn.model_selection import train_test_split

# Carve a 35% validation set out of the training data; the fixed random_state keeps the split repeatable
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.35, random_state=0)


In [9]:
# Confirm that features and targets of each split have matching row counts
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

(43278, 18) (43278,)
(23304, 18) (23304,)


In [10]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters apart from random_state, which is fixed so refits are repeatable
model = RandomForestRegressor(random_state=1)
model.fit(x_train, y_train)

RandomForestRegressor(random_state=1)

In [11]:
# Predict CVSS scores for the validation rows
pred = model.predict(x_val)

In [12]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the validation set, in the same units as the cvss target
rmse = np.sqrt(mean_squared_error(y_val, pred))
round(rmse, 3)

0.03

### Testing the Model on Unseen Data (df_test)

In [13]:
def transform_data(data, frame, single=True):
    '''One-hot encode the metric columns of `data` and align the result to `frame`.

    data = the records to encode: a dict of scalar values for one record
           (single=True), or a DataFrame / dict of columns for several records
           (single=False). It must contain the six access_*/impact_* columns.
    frame = the encoded frame that holds the required features; its columns
            define the columns (and their order) of the returned frame.
    single = True when `data` is one record given as a dict of scalars, which
             needs an explicit one-row index to become a DataFrame.
             Defaults to True, the same default `predict` uses.

    Returns a DataFrame with exactly the columns of `frame`. Dummy columns
    that do not occur in `data` are filled with 0, and dummy columns that
    `frame` does not have are dropped.
    '''
    metric_columns = ['access_authentication', 'access_vector', 'access_complexity',
                      'impact_availability', 'impact_confidentiality', 'impact_integrity']

    if single:
        # A dict of scalars carries no row index, so pandas must be given one.
        records = pd.DataFrame(data, index=range(1))
    else:
        records = pd.DataFrame(data)

    encoded = pd.get_dummies(data=records[metric_columns], prefix=metric_columns)
    # Only the encoded side is needed, so reindex it onto frame's columns
    # rather than align(), which would also build a realigned copy of `frame`.
    return encoded.reindex(columns=frame.columns, fill_value=0)

def predict(data, single=True):
    '''Return the model's prediction(s) for `data`.

    data = the raw record(s), in the form accepted by `transform_data`
    single = forwarded to `transform_data`; True (the default) for one record

    `data` is encoded with `transform_data` against the notebook-level frame X
    and the encoded rows are passed to the notebook-level `model`.
    '''
    features = transform_data(data, X, single=single)
    return model.predict(features)

In [14]:
# Encode the held-out rows against the training columns. The frame is passed directly:
# a to_dict() round trip is unnecessary and makes the row order depend on how pandas
# rebuilds a frame from nested dicts, while y_test below relies on df_test's own row order.
X_test = transform_data(df_test, X, single=False)

y_test = df_test.cvss.values

In [15]:
# Row counts must match, and X_test should have the same number of columns as the training frame X
print(X_test.shape, y_test.shape)

(22194, 18) (22194,)


In [16]:
# Predict CVSS scores for the held-out test rows
test_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the held-out test set; this rebinds `rmse` from the validation cell
rmse = np.sqrt(mean_squared_error(y_test, test_pred))
round(rmse, 3)

0.044

### Testing on a Single Record

In [18]:
# Take one row of the full dataset by position; columns from position 7 onward are the
# six access_*/impact_* metric fields.
# NOTE(review): the row is picked from `df`, not `df_test`, so it may belong to the training split.
metrics_1 = df.iloc[49898,7:].to_dict()
metrics_1

{'access_authentication': 'NONE',
 'access_complexity': 'LOW',
 'access_vector': 'NETWORK',
 'impact_availability': 'PARTIAL',
 'impact_confidentiality': 'NONE',
 'impact_integrity': 'NONE'}

In [19]:
# Score the single record; predict() defaults to single=True
predict(metrics_1)

array([5.])

In [20]:
# A second record, written out by hand, in the same shape as metrics_1
metrics_2 = {
    'access_authentication': 'MULTIPLE',
    'access_complexity': 'MEDIUM',
    'access_vector': 'NETWORK',
    'impact_availability': 'PARTIAL',
    'impact_confidentiality': 'COMPLETE',
    'impact_integrity': 'NONE',
}


In [21]:
# Score the hand-written record; predict() defaults to single=True
predict(metrics_2)

array([6.687])

In [22]:
# Actual score of the row used for metrics_1 (column position 3 is `cvss`), to compare with its prediction
df.iloc[49898,3]

5.0

Saving the Model and Feature Frame

In [23]:
import pickle

# Persist the encoded training frame together with the fitted model in a single pickle file;
# transform_data uses X to align new records to the training columns.
with open('model.bin', 'wb') as model_file:
    pickle.dump((X, model), model_file)