This script performs clustering on the scraped data of Java CVE descriptions done [here](https://github.com/Semiu/java-codesecurity/blob/main/data-scraping-curating/Java%20CVEs%20from%20the%20National%20Vulnerabilities%20Database.ipynb).

It aims to identify core categories of Java CVEs as found in the National Vulnerabilities Database.

In [37]:
#Import the libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from sklearn import metrics
from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt


In [38]:
#Function to extract the text (CVE descriptions)
def extract_text_corpus():
    
    cve_text_corpus = []
    
    java_cve_text = pd.read_csv("C:\\Users\\Semiu\\Documents\\java-codesecurity\\JavaVulData\\JavaVulData.csv")
    
    for cve in java_cve_text['vulnerability_description']:
        cve_text_corpus.append(cve)
        
    return cve_text_corpus

In [39]:
#Initialize a vectorizer from the TFIDF
vectorizer = TfidfVectorizer(stop_words='english')

#The vectorized corpus takes the extract_text_corpus as an argument
vectorized_corpus = vectorizer.fit_transform(extract_text_corpus())

In [40]:
vectorized_corpus = vectorized_corpus.reshape(-1,1)

In [41]:
#The K-Means algorithm for clustering
#no_of_clusters = 10 #Arbitrarily chosen to test

distortions = []
inertias = []
mapping1 = {}
mapping2 = {}

K = range(1, 10)
 
for k in K:
    
    model = KMeans(n_clusters = k, init = 'k-means++', max_iter = 100, n_init = 1)
    
    #model = KMeans(n_clusters=k).fit(vectorized_corpus)
    
    #Fit the model with the viectorized corpus
    model.fit(vectorized_corpus)
    
    distortions.append(sum(np.min(cdist(vectorized_corpus, model.cluster_centers_,'euclidean'), axis=1)) / vectorized_corpus.shape[0])
    
    inertias.append(model.inertia_)
 
    mapping1[k] = sum(np.min(cdist(vectorized_corpus, model.cluster_centers_,'euclidean'), axis=1)) / vectorized_corpus.shape[0]
    
    mapping2[k] = model.inertia_

ValueError: XA must be a 2-dimensional array.

In [None]:

for key, val in mapping1.items():
    print(f'{key} : {val}')


In [None]:

plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method using Distortion')
plt.show()


In [None]:

for key, val in mapping2.items():
    print(f'{key} : {val}')


In [None]:

plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()


In [17]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [18]:
for i in range(no_of_clusters):
    print("Cluster %d:" %i)
    
    for ind in order_centroids[i, :10]:
        print('%s' %terms[ind])

Cluster 0:
jre
jdk
earlier
update
sdk
sun
environment
runtime
java
aka
Cluster 1:
oracle
affect
se
vectors
unspecified
unknown
related
confidentiality
vulnerability
integrity
Cluster 2:
jenkins
xml
main
src
entity
external
plugins
java
exists
xxe
Cluster 3:
service
denial
cause
crash
sap
server
java
attackers
remote
allows
Cluster 4:
se
java
cvss
embedded
sandboxed
vulnerability
attacks
code
successful
access
Cluster 5:
xss
cross
scripting
site
script
html
inject
web
arbitrary
sun
Cluster 6:
cve
2014
7u51
se
vulnerability
affect
different
oracle
vectors
related
Cluster 7:
java
server
arbitrary
remote
allows
attackers
execute
code
application
sun
Cluster 8:
earlier
update
java
se
environment
runtime
related
jre
affect
oracle
Cluster 9:
apache
serialized
code
object
arbitrary
execute
deserialization
crafted
execution
java
