In [1]:
!pip install sklearn-crfsuite


Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords

In [4]:
# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
states = ["B-ENV", "I-ENV", "B-ATTACK", "I-ATTACK", "B-ATTACK_VECTOR", "I-ATTACK_VECTOR", "B-PRE_REQ", "I-PRE_REQ", "B-OUTCOME", "I-OUTCOME", "O"]
n_states = len(states)
n_states

11

In [7]:
id2state = {}
state2id = {}

for i, state in enumerate(states):
    id2state[i] = state
    state2id[state] = i

id2state, state2id

({0: 'B-ENV',
  1: 'I-ENV',
  2: 'B-ATTACK',
  3: 'I-ATTACK',
  4: 'B-ATTACK_VECTOR',
  5: 'I-ATTACK_VECTOR',
  6: 'B-PRE_REQ',
  7: 'I-PRE_REQ',
  8: 'B-OUTCOME',
  9: 'I-OUTCOME',
  10: 'O'},
 {'B-ENV': 0,
  'I-ENV': 1,
  'B-ATTACK': 2,
  'I-ATTACK': 3,
  'B-ATTACK_VECTOR': 4,
  'I-ATTACK_VECTOR': 5,
  'B-PRE_REQ': 6,
  'I-PRE_REQ': 7,
  'B-OUTCOME': 8,
  'I-OUTCOME': 9,
  'O': 10})

In [8]:
import json
with open("/kaggle/input/cve-1-0/labeled_dataset.json", "r") as f: dataset = json.load(f)

In [9]:
import re

def trim_non_alphanumeric(s):
    return re.sub(r'^\W+|\W+$', '', s)

In [13]:
X = []
y = []
for V in dataset:
    try:
        annotated_tokens = V["labeled_description"]
        tokens = []
        labels = []
        for token in annotated_tokens:    
            if (token[0] == ''):
                continue
            if (len(token) != 2):
                continue
            t = trim_non_alphanumeric(token[0])
            if (t == ''):
                continue
            if (token[1] == 'B-ATTACK_TYPE' or token[1] == 'I-ATTACK_TYPE' or token[1] == 'B-ATTACK_DESCRIPTION' or token[1] == 'I-ATTACK_DESCRIPTION'):
                continue
            tokens.append(t)
            labels.append(token[1])
        X.append(tokens)
        y.append(labels)
    except Exception as e:
            pass

In [14]:
print(len(X))

2269


In [15]:
X[0], y[0]

(['A',
  'cross-site',
  'scripting',
  'XSS',
  'vulnerability',
  'in',
  'ONLYOFFICE',
  'Document',
  'Server',
  'Example',
  'before',
  'v7.0.0',
  'allows',
  'remote',
  'attackers',
  'inject',
  'arbitrary',
  'HTML',
  'or',
  'JavaScript',
  'through',
  'example/editor'],
 ['O',
  'B-ATTACK',
  'I-ATTACK',
  'I-ATTACK',
  'O',
  'O',
  'B-ENV',
  'I-ENV',
  'I-ENV',
  'I-ENV',
  'I-ENV',
  'I-ENV',
  'O',
  'B-ATTACK_VECTOR',
  'I-ATTACK_VECTOR',
  'O',
  'B-OUTCOME',
  'I-OUTCOME',
  'I-OUTCOME',
  'I-OUTCOME',
  'O',
  'B-ATTACK_VECTOR'])

In [16]:
def is_multiword_token(token):
    # Check for hyphens, underscores, or camel case
    return bool(re.search(r'[-_]|[a-z]+[A-Z]', token))

In [17]:
def is_url(token):
    # Regular expression to match both web URLs and file paths
    url_pattern = re.compile(
        r'^(https?://|ftp://|file://|www\.)|'  # Match protocol or www
        r'([a-zA-Z]:[\\/])|'                   # Match Windows drive letter (e.g., C:\)
        r'([\\/][^/\\]+[\\/])'                 # Match file paths with slashes (Unix, Windows)
    )
    return bool(url_pattern.match(token))

In [18]:
def is_version_number(token):
    # Regular expression to match version numbers
    version_pattern = re.compile(r'^v?\d+(\.\d+)*$')
    return bool(version_pattern.match(token))

In [19]:
def token_to_features(tokens, i):
    token = tokens[i]
    features = {
        'bias': 1.0,
        'token.lower()': token.lower(),
        'token[-3:]': token[-3:],
        'token[-2:]': token[-2:],
        'token.isupper()': token.isupper(),
        'token.istitle()': token.istitle(),
        'token.isdigit()': token.isdigit(),
        'token.is_version_number()': is_version_number(token),
        'token.is_stop_word()': token.lower() in stop_words,
        'token.is_url()': is_url(token),
        'token.is_multiword()': is_multiword_token(token),
    }
    if i > 0:
        prev_token = tokens[i-1][0]
        features.update({
            '-1:token.lower()': prev_token.lower(),
            '-1:token.istitle()': prev_token.istitle(),
            '-1:token.isupper()': prev_token.isupper(),
            '-1:token.is_version_number()': is_version_number(prev_token),
            '-1:token.is_stop_word()': prev_token.lower() in stop_words,
            '-1:token.is_url()': is_url(prev_token),
            '-1:token.is_multiword()': is_multiword_token(prev_token),
        })
    else:
        features['BOS'] = True

    if i < len(tokens)-1:
        next_token = tokens[i+1]
        features.update({
            '+1:token.lower()': next_token.lower(),
            '+1:token.istitle()': next_token.istitle(),
            '+1:token.isupper()': next_token.isupper(),
            '+1:token.is_version_number()': is_version_number(next_token),
            '+1:token.is_stop_word()': next_token.lower() in stop_words,
            '+1:token.is_url()': is_url(next_token),
            '+1:token.is_multiword()': is_multiword_token(next_token),
        })
    else:
        features['EOS'] = True

    return features

In [20]:
def tokens_to_features(tokens):
    return [token_to_features(tokens, i) for i in range(len(tokens))]

In [21]:
idx = 5
print(tokens[idx])
print(token_to_features(tokens, idx))

2018.011.20063
{'bias': 1.0, 'token.lower()': '2018.011.20063', 'token[-3:]': '063', 'token[-2:]': '63', 'token.isupper()': False, 'token.istitle()': False, 'token.isdigit()': False, 'token.is_version_number()': True, 'token.is_stop_word()': False, 'token.is_url()': False, 'token.is_multiword()': False, '-1:token.lower()': 'v', '-1:token.istitle()': False, '-1:token.isupper()': False, '-1:token.is_version_number()': False, '-1:token.is_stop_word()': False, '-1:token.is_url()': False, '-1:token.is_multiword()': False, '+1:token.lower()': 'and', '+1:token.istitle()': False, '+1:token.isupper()': False, '+1:token.is_version_number()': False, '+1:token.is_stop_word()': True, '+1:token.is_url()': False, '+1:token.is_multiword()': False}


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train = [tokens_to_features(tokens) for tokens in X_train]

In [25]:
assert len(X_train[0]) == len(y_train[0])

In [26]:
X_test = [tokens_to_features(tokens) for tokens in X_test]

In [27]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [28]:
len(X_train)

1815

In [29]:
crf.fit(X_train, y_train)

In [41]:
labels = list(crf.classes_)
labels.remove('O')
labels


['B-ENV',
 'I-ENV',
 'B-ATTACK',
 'B-ATTACK_VECTOR',
 'I-ATTACK_VECTOR',
 'I-ATTACK',
 'B-OUTCOME',
 'I-OUTCOME',
 'B-PRE_REQ',
 'I-PRE_REQ']

In [42]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.7608691759716834

In [32]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                 precision    recall  f1-score   support

       B-ATTACK      0.768     0.671     0.716       480
       I-ATTACK      0.627     0.677     0.651      1005
B-ATTACK_VECTOR      0.765     0.640     0.697       509
I-ATTACK_VECTOR      0.666     0.525     0.587       902
          B-ENV      0.665     0.720     0.691       760
          I-ENV      0.884     0.871     0.877      3623
      B-OUTCOME      0.716     0.685     0.700       375
      I-OUTCOME      0.851     0.785     0.817      1452
      B-PRE_REQ      0.667     0.599     0.631       157
      I-PRE_REQ      0.606     0.694     0.647       761

      micro avg      0.772     0.751     0.761     10024
      macro avg      0.721     0.687     0.702     10024
   weighted avg      0.775     0.751     0.761     10024



In [38]:
import joblib

In [47]:
joblib.dump(crf, "CRF_FOR_SEQUENCE_CLASSIFICATION1.sav")

['CRF_FOR_SEQUENCE_CLASSIFICATION1.sav']

In [33]:
sample_text = "Opencats v0.9.7 was discovered to contain a SQL injection vulnerability via the importID parameter in the Import viewerrors function."
sample_text = sample_text.split(" ")

In [36]:
labels = crf.predict([sample_text])

In [37]:
labels

array([['B-ATTACK', 'I-ATTACK', 'I-ATTACK', 'I-ATTACK', 'I-ATTACK',
        'I-ATTACK', 'I-ATTACK', 'I-ATTACK', 'I-ATTACK', 'I-ATTACK',
        'I-ATTACK', 'I-ATTACK', 'I-ATTACK', 'I-ATTACK', 'I-ATTACK',
        'I-ATTACK', 'I-ATTACK', 'I-ATTACK', 'I-ATTACK']], dtype=object)

In [125]:
print(crf.predict([X[0]]))

[['I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION'
  'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION'
  'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION'
  'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION'
  'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION'
  'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION'
  'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION' 'I-ATTACK_DESCRIPTION'
  'I-ATTACK_DESCRIPTION']]


In [48]:
crf2 = joblib.load('/kaggle/working/CRF_FOR_SEQUENCE_CLASSIFICATION_BEST.sav')

In [49]:
print(crf2.predict([tokens_to_features(sample_text)]))

[['B-ENV' 'I-ENV' 'O' 'O' 'O' 'O' 'O' 'B-ATTACK' 'I-ATTACK' 'O' 'O' 'O'
  'O' 'O' 'O' 'O' 'O' 'O' 'O']]
