In [1]:
import os
import pandas as pd
from pyspark.ml.feature import StringIndexer, OneHotEncoder
import plotly.express as px
import json

# Step up one directory so that settings.json (opened by relative path further
# down) is found in the working directory; presumably this is also what lets
# the project-local `Utils` / `Service` imports resolve -- TODO confirm.
# Guarded so that re-running this cell does not climb another level: an
# unconditional chdir moves one directory further up on every execution.
if not os.path.isfile('settings.json'):
    os.chdir('../')


In [2]:

from Utils.SparkSessionManager import SparkSessionManager
from Service.DataReader import DataReader
from Service.VMAnalyzer import VMAnalyzer

In [3]:
# Create the Spark session and a reader bound to it.
spark = SparkSessionManager.create_session()
dt_reader = DataReader(spark)

# Load the notebook settings. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's locale default (which differs on Windows).
with open('settings.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# The 'Local' section supplies the two values passed on to read_data.
local_settings = data['Local']
filepath = local_settings['filepath']
input_file = local_settings['input']

# read_data returns a pair, unpacked here and handed to the analyzer together
# with the session.
csv_file, vmSchema = dt_reader.read_data(filepath, input_file)
dt_analyzer = VMAnalyzer(spark, csv_file, vmSchema)


In [4]:
# Frame held by the analyzer; the range, mean and mode summaries below read it.
dataFrame = dt_analyzer.get_df()
# Result of the analyzer's pre-processing pipeline; the correlation, chart and
# clustering cells further down read this one instead.
processed_df = dt_analyzer.pre_processing_pipeline()

In [5]:
# Score columns handed to calculate_range; the recorded output lists a min_* and
# max_* value for each of them.
columns = [
    "baseScore",
    "impactScore",
    "exploitabilityScore",
]
dt_analyzer.calculate_range(dataFrame, columns)

+-------------+-------------+---------------+---------------+-----------------------+-----------------------+
|min_baseScore|max_baseScore|min_impactScore|max_impactScore|min_exploitabilityScore|max_exploitabilityScore|
+-------------+-------------+---------------+---------------+-----------------------+-----------------------+
|          2.1|         10.0|            1.4|            6.0|                    0.2|                    3.9|
+-------------+-------------+---------------+---------------+-----------------------+-----------------------+



In [6]:
# Mean base score grouped by year, with the result columns labelled
# "Year" / "Base Score".
# NOTE(review): the trailing positional True is defined by
# calculate_mean_category, which is not visible here; presumably it requests
# ordering by the category column -- confirm.
mean_df_base_score = dt_analyzer.calculate_mean_category(
    dataFrame,
    "year",
    "baseScore",
    "Year",
    "Base Score",
    True,
)

# Layout and styling settings passed to the line chart.
options = dict(
    xAxisLabel="<span style='letter-spacing: 1.3px;'>Year</span>",
    yAxisLabel="<span style='letter-spacing: 1.3px;'>Base Score</span>",
    width=700,
    height=700,
    title="Yearly Average Base Score",
    format="",
    font_size=16,
    font_color="black",
    value_color="black",
    value_size=12,
)
dt_analyzer.show_line_chart(mean_df_base_score, "Year", "Base Score", options)

+----+------------------+
|Year|        Base Score|
+----+------------------+
|2011|               7.5|
|2012| 5.925000190734863|
|2013| 6.920000171661377|
|2014|               6.8|
|2015|7.6238096214476085|
|2016| 7.256000061035156|
|2017| 7.692817796001118|
|2018|7.4189874311036705|
|2019| 7.172727346420288|
|2020|   7.0690729423864|
|2021| 7.066640901047653|
+----+------------------+



In [7]:

# Mode and mean of the base score per attack vector, both labelled
# "Attack Vector" / "Base Score".
# Note: this rebinds `mean_df_base_score`, replacing the per-year means that the
# yearly line-chart cell stored under the same name.
mode_df_base_score = dt_analyzer.calculate_mode_category(
    dataFrame, "attackVector", "baseScore", "Attack Vector", "Base Score"
)
mean_df_base_score = dt_analyzer.calculate_mean_category(
    dataFrame, "attackVector", "baseScore", "Attack Vector", "Base Score"
)

# Layout and styling settings passed to the grouped mean/mode bar chart.
options = dict(
    xAxisLabel="<span style='letter-spacing: 1.3px;'>Attack Vector</span>",
    yAxisLabel="<span style='letter-spacing: 1.3px;'>Base Score</span>",
    width=700,
    height=700,
    title="Distribution of Mean and Mode by Attack Vector",
    format="",
    font_size=16,
    font_color="black",
    value_color="black",
    value_size=12,
)
dt_analyzer.show_go_bar_chart_mean_mode(
    mode_df_base_score,
    mean_df_base_score,
    "Attack Vector",
    "Base Score",
    options,
)



+----------------+----------+
|   Attack Vector|Base Score|
+----------------+----------+
|           LOCAL|       7.8|
|         NETWORK|       7.5|
|        PHYSICAL|       6.8|
|ADJACENT NETWORK|       8.8|
+----------------+----------+

+----------------+------------------+
|   Attack Vector|        Base Score|
+----------------+------------------+
|           LOCAL|  6.64129519082938|
|         NETWORK| 7.353532108513199|
|        PHYSICAL| 5.635227303613316|
|ADJACENT NETWORK|7.1443182826042175|
+----------------+------------------+



In [8]:
# Settings for the exploitability-vs-base-score scatter plot.
# NOTE(review): the axis keys here are 'xAxis' / 'yAxis', whereas every other
# options dict in this notebook uses 'xAxisLabel' / 'yAxisLabel'. Kept as-is
# because show_correlation_scatter_plot is not visible here -- confirm which
# keys it actually reads.
options = dict(
    xAxis="<span style='letter-spacing: 1.3px;'>Exploitability Score</span>",
    yAxis="<span style='letter-spacing: 1.3px;'>Base Score</span>",
    width=700,
    height=700,
    title="Correlation: Base Score and Exploitability Score",
    format="",
    font_size=16,
    font_color="black",
)
dt_analyzer.show_correlation_scatter_plot(
    processed_df,
    "exploitabilityScore",
    "baseScore",
    options,
)

In [9]:
# Settings for the stacked bar chart; the recorded output lists a count for each
# (attackVector, baseSeverity) pair.
options = dict(
    xAxisLabel="<span style='letter-spacing: 1.3px;'>Attack Vector</span>",
    yAxisLabel="<span style='letter-spacing: 1.3px;'>Base Severity Count</span>",
    width=700,
    height=1000,
    title="Base Severity Count by Attack Vector",
    format="",
    font_size=16,
    font_color="black",
    value_color="black",
    value_size=12,
)
dt_analyzer.show_correlation_stacked_bar_chart(
    processed_df,
    "attackVector",
    "baseSeverity",
    options,
)

+----------------+------------+-----+
|    attackVector|baseSeverity|count|
+----------------+------------+-----+
|ADJACENT_NETWORK|    CRITICAL|    1|
|         NETWORK|         LOW|   23|
|           LOCAL|        HIGH|  509|
|ADJACENT_NETWORK|        HIGH|   41|
|        PHYSICAL|         LOW|    9|
|           LOCAL|         LOW|   50|
|        PHYSICAL|      MEDIUM|   72|
|         NETWORK|        HIGH| 1079|
|         NETWORK|      MEDIUM| 1076|
|ADJACENT_NETWORK|         LOW|    2|
|         NETWORK|    CRITICAL|  540|
|           LOCAL|    CRITICAL|    1|
|           LOCAL|      MEDIUM|  382|
|        PHYSICAL|        HIGH|    7|
|ADJACENT_NETWORK|      MEDIUM|   44|
+----------------+------------+-----+



In [10]:
# Settings for the bubble chart of exploitabilityScore against baseScore.
# size_col, color_col and key_label appear only in this options dict;
# presumably they pick the bubble-size column, the colour column and the legend
# title -- TODO confirm against show_bubble_chart.
options = dict(
    xAxisLabel="<span style='letter-spacing: 1.3px;'>Exploitability Score</span>",
    yAxisLabel="<span style='letter-spacing: 1.3px;'>Base Score</span>",
    width=700,
    height=600,
    title="Base Metrics Data and Impact Score Strength",
    format="",
    font_size=16,
    font_color="black",
    value_color="black",
    value_size=12,
    size_col="impactScore",
    color_col="attackVector",
    key_label="Attack Vector",
)
dt_analyzer.show_bubble_chart(processed_df, "exploitabilityScore", "baseScore", options)

In [11]:
# Pairwise check between two of the numeric impact columns.
dt_analyzer.check_correlation(
    processed_df,
    "ConfidentialityImpactNumeric",
    "IntegrityImpactNumeric",
)

# Pair each numeric impact column with the label used for it in the matrix.
column_display_info = [
    {"column_name": column_name, "display_name": display_name}
    for column_name, display_name in (
        ("ConfidentialityImpactNumeric", "Confidentiality Impact"),
        ("IntegrityImpactNumeric", "Integrity Impact"),
        ("AvailabilityImpactNumeric", "Availability Impact"),
    )
]
correlation_df = dt_analyzer.calculate_correlation(processed_df, column_display_info)

# Layout and styling settings passed to the heatmap.
options = dict(
    width=700,
    height=600,
    title="Correlation Matrix: Impact Score Sub-Metrics",
    format="",
    font_size=16,
    font_color="black",
    value_color="black",
    value_size=13,
)
dt_analyzer.show_heatmap(correlation_df, options)

0.7572366837482335
Correlation matrix:
                        Confidentiality Impact  Integrity Impact  \
Confidentiality Impact                1.000000          0.757237   
Integrity Impact                      0.757237          1.000000   
Availability Impact                   0.497927          0.606729   

                        Availability Impact  
Confidentiality Impact             0.497927  
Integrity Impact                   0.606729  
Availability Impact                1.000000  


In [12]:
# Score columns handed to the clustering pipeline.
# NOTE(review): in the recorded output every row of the result table carries
# the same silhouette value in all three columns -- confirm whether
# clustering_pipeline is meant to report one score per run or one per column.
columns_clustering = [
    'baseScore',
    'exploitabilityScore',
    'impactScore',
]
dt_analyzer.clustering_pipeline(processed_df, columns_clustering)

Silhouette Score: 0.5954636946770762
Silhouette Score: 0.6636337328878196
Silhouette Score: 0.6065886446507088
Silhouette Score: 0.671160517874625
Silhouette Score: 0.6679948308040315
Silhouette Score: 0.7607532089871422
   baseScore  exploitabilityScore  impactScore
0   0.595464             0.595464     0.595464
1   0.663634             0.663634     0.663634
2   0.606589             0.606589     0.606589
3   0.671161             0.671161     0.671161
4   0.667995             0.667995     0.667995
5   0.760753             0.760753     0.760753
