In [6]:
# Mount Google Drive in the Colab runtime; later cells read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

In [15]:
# Read the merged CVE dataset from the mounted Drive folder and show its dimensions
dataset_path = '/content/drive/MyDrive/Hamoye Internship/Team_algorithm/CVSS_metrics_prediction/cve_merged.csv'
df = pd.read_csv(dataset_path)
df.shape

(241979, 18)

In [9]:
# Preview the first rows to check the column layout
df.head()

Unnamed: 0,cve_id,mod_date,pub_date,cvss,cwe_code,cwe_name,summary,access_authentication,access_complexity,access_vector,impact_availability,impact_confidentiality,impact_integrity,vendor,vulnerable_product,Pub_Year,Mod_Year,Qualitative_cvss
0,CVE-2019-2211,2019-11-14 21:36:00,2019-11-13 18:15:00,7.8,89,Improper Neutralization of Special Elements u...,In createProjectionMapForQuery of TvProvider.j...,NONE,LOW,NETWORK,NONE,COMPLETE,NONE,google,android,2019,2019,High
1,CVE-2019-2212,2019-11-14 21:30:00,2019-11-13 18:15:00,4.9,200,Information Exposure,"In poisson_distribution of random, there is an...",NONE,LOW,LOCAL,NONE,COMPLETE,NONE,google,android,2019,2019,Medium
2,CVE-2019-2213,2019-11-14 21:24:00,2019-11-13 18:15:00,6.9,416,Use After Free,"In binder_free_transaction of binder.c, there ...",NONE,MEDIUM,LOCAL,COMPLETE,COMPLETE,COMPLETE,google,android,2019,2019,Medium
3,CVE-2019-2214,2019-11-14 21:19:00,2019-11-13 18:15:00,7.2,269,Improper Privilege Management,"In binder_transaction of binder.c, there is a ...",NONE,LOW,LOCAL,COMPLETE,COMPLETE,COMPLETE,google,android,2019,2019,High
4,CVE-2019-18793,2019-11-14 21:14:00,2019-11-13 20:15:00,4.3,79,Improper Neutralization of Input During Web P...,Parallels Plesk Panel 9.5 allows XSS in target...,NONE,MEDIUM,NETWORK,NONE,NONE,PARTIAL,parallels,parallels_plesk_panel,2019,2019,Medium


In [16]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# keep=False would discard *every* copy of a duplicated record, losing the
# record entirely instead of de-duplicating it. Reassigning (rather than
# inplace=True) keeps the lineage of `df` explicit.
df = df.drop_duplicates(keep='first')

In [10]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241979 entries, 0 to 241978
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   cve_id                  241979 non-null  object 
 1   mod_date                241979 non-null  object 
 2   pub_date                241979 non-null  object 
 3   cvss                    241979 non-null  float64
 4   cwe_code                241979 non-null  int64  
 5   cwe_name                241979 non-null  object 
 6   summary                 241979 non-null  object 
 7   access_authentication   241979 non-null  object 
 8   access_complexity       241979 non-null  object 
 9   access_vector           241979 non-null  object 
 10  impact_availability     241979 non-null  object 
 11  impact_confidentiality  241979 non-null  object 
 12  impact_integrity        241979 non-null  object 
 13  vendor                  241979 non-null  object 
 14  vulnerable_product  

In [17]:
# Hold out 25% of the rows as df_test; the remaining 75% become df_train (fixed seed)
df_train, df_test = train_test_split(df, test_size=0.25, random_state=0)

Feature Engineering

In [18]:
# One-hot encode the six metric columns of df_train; the target is the cvss column
metric_columns = ['access_authentication', 'access_vector', 'access_complexity',
                  'impact_availability', 'impact_confidentiality', 'impact_integrity']
X = pd.get_dummies(data=df_train[metric_columns], prefix=metric_columns)
y = df_train.cvss.values

In [19]:
# Shapes of the encoded feature matrix and of the target vector
print(X.shape)
print(y.shape)

(181484, 18)
(181484,)


### Building the Model
To build the model, split the df_train dataset into x_train and x_val sets, then train the model (a random forest regressor) on x_train and validate its predictions on x_val.

In [20]:
from sklearn.model_selection import train_test_split

# Split the encoded training data: 65% for fitting, 35% for validation (fixed seed)
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.35, random_state=0
)


In [21]:
# Shapes of the fitting and validation partitions
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

(117964, 18) (117964,)
(63520, 18) (63520,)


In [22]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest regressor on the training partition; random_state is fixed so the fit is repeatable
model = RandomForestRegressor(random_state=1)
model.fit(x_train, y_train)

RandomForestRegressor(random_state=1)

In [47]:
# Names of the one-hot feature columns the model was fitted on
x_train.columns

Index(['access_authentication_MULTIPLE', 'access_authentication_NONE',
       'access_authentication_SINGLE', 'access_vector_ADJACENT_NETWORK',
       'access_vector_LOCAL', 'access_vector_NETWORK',
       'access_complexity_HIGH', 'access_complexity_LOW',
       'access_complexity_MEDIUM', 'impact_availability_COMPLETE',
       'impact_availability_NONE', 'impact_availability_PARTIAL',
       'impact_confidentiality_COMPLETE', 'impact_confidentiality_NONE',
       'impact_confidentiality_PARTIAL', 'impact_integrity_COMPLETE',
       'impact_integrity_NONE', 'impact_integrity_PARTIAL'],
      dtype='object')

In [23]:
# Predictions for the validation partition
pred = model.predict(x_val)

In [24]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the validation predictions, rounded for display
val_mse = mean_squared_error(y_val, pred)
rmse = np.sqrt(val_mse)
round(rmse, 3)

0.021

### Testing the Model on Unseen Data (df_test)

In [25]:
def transform_data(data, frame, single=True):
    '''One-hot encode ``data`` and align the result to the columns of ``frame``.

    data   = the records to encode. With single=True this is a dict of scalar
             values describing one record; with single=False it is anything
             ``pd.DataFrame`` accepts for several records. It must contain the
             six metric columns listed in ``metric_columns`` below; other
             columns are ignored.
    frame  = the dataset that contains the required features; only its column
             labels are used, to align the encoded data.
    single = True when ``data`` describes exactly one record (the default).

    Returns a DataFrame with exactly the columns of ``frame``, in the same
    order. Dummy columns that ``data`` does not produce are filled with 0;
    dummy columns that ``frame`` does not have are dropped.
    '''
    metric_columns = ['access_authentication', 'access_vector', 'access_complexity',
                      'impact_availability', 'impact_confidentiality', 'impact_integrity']

    if single:
        # A dict of scalars needs an explicit one-row index to become a DataFrame.
        x = pd.DataFrame(data, index=range(1))
    else:
        x = pd.DataFrame(data)

    x = pd.get_dummies(data=x[metric_columns], prefix=metric_columns)
    # join='left' on axis=1 keeps frame's columns only; the aligned copy of
    # frame itself is not needed, so it is discarded.
    _, x = frame.align(x, join='left', axis=1, fill_value=0)
    return x

def predict(data, single=True):
    '''Encode ``data`` against the training feature matrix ``X`` and return the model's predictions.

    data   = the record(s) to score, in the form accepted by transform_data
    single = True when ``data`` is one record (the default), False for many
    '''
    encoded = transform_data(data, X, single=single)
    return model.predict(encoded)

In [26]:
# Encode the held-out rows with the same feature layout as X. df_test is passed
# directly: converting it to a dict first only copies every cell through Python
# objects before transform_data rebuilds the same DataFrame.
X_test = transform_data(df_test, X, single=False)

y_test = df_test.cvss.values

In [27]:
# Shapes of the encoded test features and the test targets
print(X_test.shape, y_test.shape)

(60495, 18) (60495,)


In [28]:
# Predictions for the held-out test rows
test_pred = model.predict(X_test)

In [31]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out test rows, and its square root rounded for display
test_mse = mean_squared_error(y_test, test_pred)
print("mse:",test_mse)
rmse = np.sqrt(test_mse)
print("rmse:",round(rmse, 3))

mse: 0.000541452417724019
rmse: 0.023


### Testing on single data

In [36]:
# Keep only the two descriptive text columns and the six metric columns.
# These are the columns at positions 5-12 of the full dataset, but they are
# selected by name so that re-running this cell on the already reduced `df`
# still picks the same columns instead of silently shifting the slice.
selected_columns = pd.Index(['cwe_name', 'summary',
                             'access_authentication', 'access_complexity', 'access_vector',
                             'impact_availability', 'impact_confidentiality', 'impact_integrity'])
df = df.loc[:,selected_columns]
df.head()

Unnamed: 0,cwe_name,summary,access_authentication,access_complexity,access_vector,impact_availability,impact_confidentiality,impact_integrity
0,Improper Neutralization of Special Elements u...,In createProjectionMapForQuery of TvProvider.j...,NONE,LOW,NETWORK,NONE,COMPLETE,NONE
1,Information Exposure,"In poisson_distribution of random, there is an...",NONE,LOW,LOCAL,NONE,COMPLETE,NONE
2,Use After Free,"In binder_free_transaction of binder.c, there ...",NONE,MEDIUM,LOCAL,COMPLETE,COMPLETE,COMPLETE
3,Improper Privilege Management,"In binder_transaction of binder.c, there is a ...",NONE,LOW,LOCAL,COMPLETE,COMPLETE,COMPLETE
4,Improper Neutralization of Input During Web P...,Parallels Plesk Panel 9.5 allows XSS in target...,NONE,MEDIUM,NETWORK,NONE,NONE,PARTIAL


In [38]:
# Take one row's values from column position 2 onward (the six metric columns of the reduced frame) as a dict
metrics_1 = df.iloc[49898,2:].to_dict()
metrics_1

{'access_authentication': 'NONE',
 'access_complexity': 'LOW',
 'access_vector': 'NETWORK',
 'impact_availability': 'PARTIAL',
 'impact_confidentiality': 'PARTIAL',
 'impact_integrity': 'PARTIAL'}

In [39]:
# Score the single record taken from the dataset
predict(metrics_1)

array([7.5])

In [42]:
# Hand-written metric combination used for a single-record prediction
metrics_2 = dict(
    access_authentication='MULTIPLE',
    access_complexity='MEDIUM',
    access_vector='NETWORK',
    impact_availability='PARTIAL',
    impact_confidentiality='COMPLETE',
    impact_integrity='NONE',
)


In [44]:
# Score the hand-written record
predict(metrics_2)

array([5.242])

In [43]:
# Spot check: column position 3 of the reduced frame is access_complexity, for the same row used to build metrics_1
df.iloc[49898,3]

'LOW'

Saving and Reloading the Model

In [40]:
import os
import pickle

# save model: switch into the project folder on Drive, then pickle the fitted model there
model_dir = "/content/drive/MyDrive/Hamoye Internship/Team_algorithm/CVSS_Prediction"
os.chdir(model_dir)
with open('cvss_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [41]:
#load model
# NOTE: pickle.load can execute code embedded in the file, so only load pickles you created yourself.
model_path = '/content/drive/MyDrive/Hamoye Internship/Team_algorithm/CVSS_Prediction/cvss_model.pkl'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [45]:
# Re-score the hand-written record with the reloaded model; it should match the prediction made before saving
predict(metrics_2)

array([5.242])