In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pickle as pkl
import pandas as pd
import re
from datastore import DataStore
import numpy as np

In [139]:
def loadPickle(file_path):
    with open(file_path, 'rb') as file:
        data = pkl.load(file)
    return data

output = 'output_data'
features = f'{output}_features.pickle'
votes = f'{output}_votes.pickle'

data_features = loadPickle(features)
data_votes = loadPickle(votes)

df_features = pd.DataFrame(data_features)
df_votes = pd.DataFrame(data_votes)

ds = DataStore()
ds.load()
cve_dataset = ds.get_data()

In [140]:
def get_cvelist(text):
    pattern = r"^\*\*CVEs\*\*: (.*)$"
    match = re.search(pattern, text, re.MULTILINE)
    if match:
        cve_list = match.group(1).split(', ')
        if cve_list == ['']:
            return None
        else:   
            return list(set(cve_list))
    else:
        return None

def num_vulns(cve_list):
    if cve_list is None:
        return 0
    else:
        return len(cve_list)
    
def process_cve_list(cve_list, cve_dataset):
    if cve_list is None:
        return {
            'cvss_score': 0,
            'epss_score': 0,
            'epss_percentile': 0,
            'num_cwes': 0,
            'num_cpes': 0,
            'classification': None
        }
    
    cvss_score = 0
    epss_score = 0
    epss_percentile = 0
    num_cwes = 0
    num_cpes = 0
    classification = None

    for cve in cve_list:
        cve = cve.lower()
        impact = cve_dataset.get(cve, {}).get('impact', {})
        epss = cve_dataset.get(cve, {}).get('epss', {})
        cwes = cve_dataset.get(cve, {}).get('cwes', [])
        cpes = cve_dataset.get(cve, {}).get('cpes', [])
        classification_data = cve_dataset.get(cve, {}).get('classification', None)

        cvss_score_candidate = impact.get('cvss_score', 0)
        epss_score_candidate = epss.get('epss_score', 0)

        if cvss_score < cvss_score_candidate:
            cvss_score = cvss_score_candidate
            
        if epss_score < epss_score_candidate:
            epss_score = epss_score_candidate
            epss_percentile = epss.get('epss_percentile', 0)
            num_cwes = len(set(cwes))
            num_cpes = len(set(cpes))
            classification = classification_data

        remote_code_execution = classification.get('remote code execution', 0)
        privilege_escalation = classification.get('privilege escalation', 0)
        information_disclosure = classification.get('information disclosure', 0)
        denial_of_service = classification.get('denial of service', 0)
        buffer_overflow = classification.get('buffer overflow', 0)
        cross_site_request_forgery = classification.get('cross site request forgery', 0)
        sql_injection = classification.get('sql injection', 0)
        cross_site_scripting = classification.get('cross site scripting', 0)

    return {
        'cvss_score': cvss_score,
        'epss_score': epss_score,
        'epss_percentile': epss_percentile,
        'num_cwes': num_cwes,
        'num_cpes': num_cpes,
        'remote_code_execution': remote_code_execution,
        'privilege_escalation': privilege_escalation,
        'information_disclosure': information_disclosure,
        'denial_of_service': denial_of_service,
        'buffer_overflow': buffer_overflow,
        'cross_site_request_forgery': cross_site_request_forgery,
        'sql_injection': sql_injection,
        'cross_site_scripting': cross_site_scripting
    }

def merge_features(df, cve_dataset):
    df_updates = df['cve_list'].apply(lambda cves: pd.Series(process_cve_list(cves, cve_dataset)))
    
    df[['cvss_score', 'epss_score', 'epss_percentile', 'num_cwes', 'num_cpes', 'remote_code_execution', 'privilege_escalation', 'information_disclosure', 
     'denial_of_service', 'buffer_overflow', 'cross_site_request_forgery', 'sql_injection', 'cross_site_scripting']] = df_updates[
        ['cvss_score', 'epss_score', 'epss_percentile', 'num_cwes', 'num_cpes', 'remote_code_execution', 'privilege_escalation', 'information_disclosure', 
        'denial_of_service', 'buffer_overflow', 'cross_site_request_forgery', 'sql_injection', 'cross_site_scripting']
    ]

    return df

def merge_tables(df_features, df_votes):
    df = pd.merge(df_features, df_votes, left_on='id', right_on='id', how='left')
    
    return df

def get_cvelist(text):
    pattern = r"^\*\*CVEs\*\*: (.*)$"
    match = re.search(pattern, text, re.MULTILINE)
    if match:
        cve_list = match.group(1).split(', ')
        if cve_list == ['']:
            return None
        else:   
            return list(set(cve_list))
    else:
        return None

def set_mitigation(text):
    pattern = '\n'
    mitigation = text.split(pattern)[0]
    return mitigation

In [141]:
df_features["cve_list"] = df_features["description"].apply(get_cvelist)
df_features["num_vulns"] = df_features["cve_list"].apply(num_vulns)
df_features['mitigation'] = df_features['mitigation'].apply(set_mitigation)

In [142]:
df_features

Unnamed: 0,id,title,date,description,severity,vuln_id_from_tool,mitigation,epss_score,epss_percentile,cve,cve_list,num_vulns
0,13,Apache Hadoop 'Secure Mode' Disabled_150.164.2...,2025-04-03,**Name**: Apache Hadoop 'Secure Mode' Disabled...,High,1.3.6.1.4.1.25623.1.0.108173,Mitigation,,,,,0
1,20,IPMI 'No Auth' Access Mode Enabled (IPMI Proto...,2025-04-03,**Name**: IPMI 'No Auth' Access Mode Enabled (...,High,1.3.6.1.4.1.25623.1.0.103837,Workaround,,,,,0
2,5,IPMI 'No Auth' Access Mode Enabled (IPMI Proto...,2025-04-03,**Name**: IPMI 'No Auth' Access Mode Enabled (...,High,1.3.6.1.4.1.25623.1.0.103837,Workaround,,,,,0
3,25,IPMI Cipher Suite 0 (Cipher Zero) Authenticati...,2025-04-03,**Name**: IPMI Cipher Suite 0 (Cipher Zero) Au...,High,1.3.6.1.4.1.25623.1.0.103840,VendorFix,0.60674,0.98133,CVE-2013-4782,"[CVE-2014-2955, CVE-2013-4782, CVE-2013-4784, ...",4
4,2,IPMI Cipher Suite 0 (Cipher Zero) Authenticati...,2025-04-03,**Name**: IPMI Cipher Suite 0 (Cipher Zero) Au...,High,1.3.6.1.4.1.25623.1.0.103840,VendorFix,0.60674,0.98133,CVE-2013-4782,"[CVE-2014-2955, CVE-2013-4782, CVE-2013-4784, ...",4
...,...,...,...,...,...,...,...,...,...,...,...,...
995,533,robot.txt / robots.txt Exists on the Web Serve...,2025-04-03,**Name**: robot.txt / robots.txt exists on the...,Info,1.3.6.1.4.1.25623.1.0.10302,Mitigation,,,,,0
996,813,robot.txt / robots.txt Exists on the Web Serve...,2025-04-03,**Name**: robot.txt / robots.txt exists on the...,Info,1.3.6.1.4.1.25623.1.0.10302,Mitigation,,,,,0
997,669,robot.txt / robots.txt Exists on the Web Serve...,2025-04-03,**Name**: robot.txt / robots.txt exists on the...,Info,1.3.6.1.4.1.25623.1.0.10302,Mitigation,,,,,0
998,489,robot.txt / robots.txt Exists on the Web Serve...,2025-04-03,**Name**: robot.txt / robots.txt exists on the...,Info,1.3.6.1.4.1.25623.1.0.10302,Mitigation,,,,,0


In [143]:
df_features = merge_features(df_features, cve_dataset)
df = merge_tables(df_features, df_votes)

In [146]:
for i in df['description']:
    print(i)
    

**Name**: Apache Hadoop 'Secure Mode' Disabled
**Host**: 150.164.203.14
**Hostname**: None
**Port**: 5020/tcp
**NVT**: None
**Threat**: High
**Severity**: 10.0
**QOD**: None
**Description**: None
**Name**: IPMI 'No Auth' Access Mode Enabled (IPMI Protocol)
**Host**: 150.164.203.180
**Hostname**: aspartato.speed.dcc.ufmg.br
**Port**: 623/udp
**NVT**: None
**Threat**: High
**Severity**: 10.0
**QOD**: None
**Description**: The remote IPMI service has the 'No Auth' access mode enabled.

**Name**: IPMI 'No Auth' Access Mode Enabled (IPMI Protocol)
**Host**: 150.164.203.199
**Hostname**: eris.speed.dcc.ufmg.br
**Port**: 623/udp
**NVT**: None
**Threat**: High
**Severity**: 10.0
**QOD**: None
**Description**: The remote IPMI service has the 'No Auth' access mode enabled.

**Name**: IPMI Cipher Suite 0 (Cipher Zero) Authentication Bypass Vulnerability (IPMI Protocol)
**Host**: 150.164.203.162
**Hostname**: None
**Port**: 623/udp
**NVT**: None
**CVEs**: CVE-2013-4782, CVE-2013-4783, CVE-2013-478

In [123]:
df_os = df[df['title'].apply(lambda x: 'Operating System (OS) End of Life (EOL) Detection' in str(x))]
df_keyex = df[df['title'].apply(lambda x: 'Weak Key Exchange (KEX) Algorithm(s) Supported (SSH)' in str(x))]
df_weakalgo = df[df['title'].apply(lambda x: 'Weak Host Key Algorithm(s) (SSH)' in str(x))]

df_merged = pd.concat([df_os, df_keyex, df_weakalgo], ignore_index=True)

In [124]:
df_merged

Unnamed: 0,id,title,date,description,severity,vuln_id_from_tool,mitigation,epss_score,epss_percentile,cve,cve_list,num_vulns,cvss_score,num_cwes,num_cpes,remote_code_execution,privilege_escalation,information_disclosure,denial_of_service,buffer_overflow,cross_site_request_forgery,sql_injection,cross_site_scripting,user_id,vote_class,timestamp
0,7,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,,,
1,11,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,,,
2,9,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,,,
3,26,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,,,
4,23,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,1.0,Critical,2025-04-02T18:27:02.044634+00:00
5,12,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,1.0,Critical,2025-04-02T18:27:04.096219+00:00
6,17,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,1.0,Critical,2025-04-02T18:27:00.749725+00:00
7,4,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,1.0,Critical,2025-04-02T18:27:05.616643+00:00
8,15,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,,,
9,8,Operating System (OS) End of Life (EOL) Detect...,2025-04-02,**Name**: Operating System (OS) End of Life (E...,High,1.3.6.1.4.1.25623.1.0.103674,Mitigation,0.0,0.0,,,0,0.0,0.0,0.0,,,,,,,,,1.0,Critical,2025-04-02T18:27:06.740227+00:00


In [125]:
df_drop = df_merged.drop(columns=['date', 'id', 'title', 'cve', 'cve_list', 'description', 'severity', 'user_id', 'timestamp'])

In [126]:
df_drop.columns

Index(['vuln_id_from_tool', 'mitigation', 'epss_score', 'epss_percentile',
       'num_vulns', 'cvss_score', 'num_cwes', 'num_cpes',
       'remote_code_execution', 'privilege_escalation',
       'information_disclosure', 'denial_of_service', 'buffer_overflow',
       'cross_site_request_forgery', 'sql_injection', 'cross_site_scripting',
       'vote_class'],
      dtype='object')

In [127]:
df = df_drop.copy()

In [128]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [129]:
le = LabelEncoder()
df['vote_class_encoded'] = le.fit_transform(df['vote_class'].astype(str))

# Converter colunas categóricas para tipo 'category'
df['vuln_id_from_tool'] = df['vuln_id_from_tool'].astype('category')
df['mitigation'] = df['mitigation'].astype('category')

# Separar features e target
X = df.drop(columns=['vote_class', 'vote_class_encoded'])
y = df['vote_class_encoded']

In [130]:
mask_train = df['vote_class'].notna()
X_train, y_train = X[mask_train], y[mask_train]

In [131]:
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', enable_categorical=True)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [132]:
mask_pred = df['vote_class'].isna()
df.loc[mask_pred, 'vote_class_encoded'] = model.predict(X[mask_pred])

In [133]:
df['vote_class_predicted'] = le.inverse_transform(df['vote_class_encoded'].astype(int))

In [134]:
df_resultado = pd.concat([df_merged, df], axis=1)
df_final = df_resultado[['title', 'vote_class', 'vote_class_predicted']]


In [135]:
df_final

Unnamed: 0,title,vote_class,vote_class.1,vote_class_predicted
0,Operating System (OS) End of Life (EOL) Detect...,,,Critical
1,Operating System (OS) End of Life (EOL) Detect...,,,Critical
2,Operating System (OS) End of Life (EOL) Detect...,,,Critical
3,Operating System (OS) End of Life (EOL) Detect...,,,Critical
4,Operating System (OS) End of Life (EOL) Detect...,Critical,Critical,Critical
5,Operating System (OS) End of Life (EOL) Detect...,Critical,Critical,Critical
6,Operating System (OS) End of Life (EOL) Detect...,Critical,Critical,Critical
7,Operating System (OS) End of Life (EOL) Detect...,Critical,Critical,Critical
8,Operating System (OS) End of Life (EOL) Detect...,,,Critical
9,Operating System (OS) End of Life (EOL) Detect...,Critical,Critical,Critical
