In [1]:
import tensorflow as tf

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords


In [4]:
stop_words = set(stopwords.words('english'))

In [5]:
states = ["B-ENV", "I-ENV", "B-ATTACK", "I-ATTACK", "B-ATTACK_VECTOR", "I-ATTACK_VECTOR", "B-PRE_REQ", "I-PRE_REQ", "B-OUTCOME", "I-OUTCOME", "O"]
n_states = len(states)
n_states

11

In [6]:
id2state = {}
state2id = {}

for i, state in enumerate(states):
    id2state[i] = state
    state2id[state] = i

id2state, state2id

({0: 'B-ENV',
  1: 'I-ENV',
  2: 'B-ATTACK',
  3: 'I-ATTACK',
  4: 'B-ATTACK_VECTOR',
  5: 'I-ATTACK_VECTOR',
  6: 'B-PRE_REQ',
  7: 'I-PRE_REQ',
  8: 'B-OUTCOME',
  9: 'I-OUTCOME',
  10: 'O'},
 {'B-ENV': 0,
  'I-ENV': 1,
  'B-ATTACK': 2,
  'I-ATTACK': 3,
  'B-ATTACK_VECTOR': 4,
  'I-ATTACK_VECTOR': 5,
  'B-PRE_REQ': 6,
  'I-PRE_REQ': 7,
  'B-OUTCOME': 8,
  'I-OUTCOME': 9,
  'O': 10})

In [7]:
import json
with open("/kaggle/input/cve-1-0/labeled_dataset.json", "r") as f: dataset = json.load(f)

In [8]:
import re

def trim_non_alphanumeric(s):
    return re.sub(r'^\W+|\W+$', '', s)

In [9]:
X = []
y = []
for V in dataset:
    try:
        annotated_tokens = V["labeled_description"]
        tokens = ["<BOS>"]
        labels = [state2id["O"]]
        for token in annotated_tokens:    
            if (token[0] == ''):
                continue
            if (len(token) != 2):
                continue
            t = trim_non_alphanumeric(token[0])
            if (t == ''):
                continue
            if (token[1] == 'B-ATTACK_TYPE' or token[1] == 'I-ATTACK_TYPE' or token[1] == 'B-ATTACK_DESCRIPTION' or token[1] == 'I-ATTACK_DESCRIPTION'):
                continue
            tokens.append(t)
            labels.append(state2id[token[1]])
        X.append(tokens)
        y.append(labels)
    except Exception as e:
            pass

In [10]:
def is_multiword_token(token):
        # Check for hyphens, underscores, or camel case
    return bool(re.search(r'[-_]|[a-z]+[A-Z]', token))

def is_version_number(token):
    # Regular expression to match version numbers
    version_pattern = re.compile(r'^v?\d+(\.\d+)*$')
    return bool(version_pattern.match(token))
        
def is_url(token):
        # Regular expression to match both web URLs and file paths
    url_pattern = re.compile(
            r'^(https?://|ftp://|file://|www\.)|'  # Match protocol or www
            r'([a-zA-Z]:[\\/])|'                   # Match Windows drive letter (e.g., C:\)
            r'([\\/][^/\\]+[\\/])'                 # Match file paths with slashes (Unix, Windows)
    )
    return bool(url_pattern.match(token))
    

In [11]:
# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove*.zip

In [12]:
import numpy as np
from numpy.linalg import norm
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [13]:
def load_glove():
    # Load pre-trained GloVe embeddings
    glove_input_file = 'glove.6B.300d.txt'
    word2vec_output_file = 'glove.6B.300d.word2vec'
    glove2word2vec(glove_input_file, word2vec_output_file)
    return KeyedVectors.load_word2vec_format(word2vec_output_file)

In [14]:
glove_model = load_glove()
reference_vector = glove_model['the']

  glove2word2vec(glove_input_file, word2vec_output_file)


In [15]:
def token_to_float(token):
    try:
        token_vector = glove_model[token.lower()]
        # Compute cosine similarity with reference vector
        similarity = np.dot(token_vector, reference_vector) / (norm(token_vector) * norm(reference_vector))
        # Convert similarity from [-1, 1] to [0, 1] range
        return (similarity + 1) / 2
    except KeyError:
        # Return default value if token not in vocabulary
        return 0.5

In [16]:
# rm  /kaggle/working/glove.6B.zip /kaggle/working/glove.6B.100d.txt /kaggle/working/glove.6B.200d.txt /kaggle/working/glove.6B.50d.txt

In [17]:
def get_features(token):
    return [1.0,
        token_to_float(token),
        token_to_float(token[-3:] if len(token) >= 3 else token),
        token_to_float(token[-2:] if len(token) >= 2 else token),
        float(token.isupper()),
        float(token.istitle()),
        float(token.isdigit()),
        float(is_version_number(token)),
        float(token.lower() in stop_words),
        float(is_url(token)),
        float(is_multiword_token(token)),
        float(token=="<BOS>")
            
    ]
    
    

In [18]:
def token_to_features(tokens):
    return [get_features(token) for token in tokens]

In [19]:
X = [token_to_features(tokens) for tokens in X]

In [20]:
n_features = len(X[0][0])
n_features

12

In [21]:
def pad_data(X, y):
    max_len = max(len(seq) for seq in X)
    pad = [-100.0 for i in range(n_features)]
    for i in range(len(X)):
        m = len(X[i])
        X[i].extend([pad for i in range(max_len - m)])
        y[i].extend([-100 for i in range(max_len - m)])    

In [22]:
pad_data(X, y)

In [23]:
len(X[0]), len(y[0])

(400, 400)

In [24]:
assert len(X[0]) == len(y[0])

In [25]:
import pickle

In [26]:
with open("/kaggle/working/X.pkl", "wb") as f: pickle.dump(X, f, pickle.HIGHEST_PROTOCOL)
with open("/kaggle/working/y.pkl", "wb") as f: pickle.dump(y, f, pickle.HIGHEST_PROTOCOL)

In [71]:
class MEMM(tf.keras.Model):
    def __init__(self, n_states, n_features, state_ids, outside_id):
        super().__init__()
        self.n_states = n_states
        self.state_ids = state_ids
        self.outside_id = outside_id
        self.n_features = n_features
        self.W = tf.Variable(tf.random.uniform([1, self.n_features + 2]), trainable=True, name="weights", dtype=tf.float32)
        
    @tf.function
    def get_probability(self, x, curr_state):
        logits = tf.fill([self.n_states], 0.0)
        for i, prev_state in enumerate(self.state_ids):
            x_new = tf.concat([x, tf.expand_dims(tf.constant(prev_state, dtype=tf.float32), axis=0)], axis=0)
            x_new = tf.concat([x_new, tf.expand_dims(tf.cast(curr_state, dtype=tf.float32), axis=0)], axis=0)
            logits = tf.tensor_scatter_nd_update(
                logits,
                [[i]],
                [tf.squeeze(tf.matmul(tf.expand_dims(x_new, axis=0), tf.transpose(self.W)), axis=0)[0]]
                
            )
        return tf.nn.softmax(logits)
        
    @tf.function
    def get_unpad_length(self, x):
        pad_value = tf.fill([self.n_features], -100.0)
        T = tf.shape(x)[0]
        t = T - 1
        found = tf.constant(False)
        T_actual = T
        
        def condition(t, found, T_actual):
            return tf.logical_and(t >= 0, tf.logical_not(found))
        
        def body(t, found, T_actual):
            is_padding = tf.reduce_all(tf.math.equal(x[t], pad_value))
            
            # Update found and T_actual only if we find padding
            new_found = tf.logical_not(is_padding)
            new_T_actual = tf.cond(
                new_found,
                lambda: t + 1,
                lambda: T_actual
            )
            return t - 1, new_found, new_T_actual
        
        # Run the while loop
        _, _, final_T_actual = tf.while_loop(
            condition,
            body,
            loop_vars=[t, found, T_actual],
        )
        return final_T_actual

    @tf.function
    def neg_log_loss(self, x, y, **kwargs):
        T = self.get_unpad_length(x)
        loss = 0.0
        for t in range(1, T):
            prob = self.get_probability(x[t], y[t])
            loss -= tf.math.log(prob[y[t-1]])

        return loss
        
    @tf.function
    def compute_loss(self, x, y, **kwargs):
        individual_losses = tf.map_fn(
            lambda data: self.neg_log_loss(data[0], data[1]),
            (x, y),
            fn_output_signature=tf.float32
        )
        return tf.reduce_sum(individual_losses)

    @tf.function
    def viterbi(self, x):
        T = self.get_unpad_length(x)
        # print(T, "<<<", flush=True)
        N = self.n_states
        viterbi_matrix = tf.zeros([T, N], dtype=tf.float32)
        backpointer = tf.zeros([T, N], dtype=tf.int32)
        viterbi_matrix = tf.tensor_scatter_nd_update(
            viterbi_matrix,
            [[0]],
            [tf.fill([N], 0.0)]
        )
        viterbi_matrix = tf.tensor_scatter_nd_update(
            viterbi_matrix,
            [[0, self.outside_id]],
            [0.0]
        )
        for t in range(1, T):
            for s in range(N):
                transition = tf.math.log(self.get_probability(x[t], s))
                probs = (viterbi_matrix[t - 1] + transition)

                best_prev_state = tf.argmax(probs)
                viterbi_matrix = tf.tensor_scatter_nd_update(
                    viterbi_matrix,
                    [[t, s]],
                    [probs[best_prev_state]]
                )
                backpointer = tf.tensor_scatter_nd_update(
                    backpointer,
                    [[t, s]],
                    [best_prev_state]
                )

        best_path = tf.zeros(T, dtype=tf.int32)
        best_path = tf.tensor_scatter_nd_update(
            best_path,
            [[T-1]],
            [tf.argmax(viterbi_matrix[T-1])]
        )
        
        for t in range(T-2, 0, -1):
            best_path = tf.tensor_scatter_nd_update(
                best_path,
                [[t]],
                [backpointer[t+1, best_path[t+1]]]
            )
            
        return best_path
    
    @tf.function
    def call(self, inputs):
        return tf.map_fn(self.viterbi, inputs, fn_output_signature=tf.int32)
        
    @tf.function
    def train_step(self, data):
        batch_observations, batch_true_states = data
        with tf.GradientTape() as tape:
            loss = self.compute_loss(x=batch_observations, y = batch_true_states)
        # Compute gradients
        gradients = tape.gradient(loss, [self.W])
        # Apply gradients
        self.optimizer.apply_gradients(zip(gradients,[self.W]))
        
        return {"loss": loss}

In [40]:
temp = MEMM(n_states, n_features, list(id2state.keys()), state2id["O"])

In [41]:
print(temp.viterbi(tf.Variable(X[0], dtype=tf.float32)))

tf.Tensor([ 0 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10  0], shape=(23,), dtype=int32)


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [72]:
memm = MEMM(n_states, n_features, list(id2state.keys()), state2id["O"])

In [73]:
memm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

In [74]:
memm.summary()

In [55]:
# XX_train = [X_train[i][:10] for i in range(5)]
# yy_train = [y_train[i][:10] for i in range(5)]

In [56]:
# XX_val = [X_val[i][:10] for i in range(5)]
# yy_val = [y_val[i][:10] for i in range(5)]

In [75]:
tf.config.run_functions_eagerly(False)

In [76]:
# memm.fit(tf.Variable(XX_train), tf.Variable(yy_train), epochs=20, validation_data=(tf.Variable(XX_val), tf.Variable(yy_val)))

In [None]:
memm.fit(tf.Variable(X_train), tf.Variable(y_train), epochs=20, validation_data=(tf.Variable(X_val), tf.Variable(y_val)))

Epoch 1/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1s/step - loss: 2984.5718 - val_loss: 2990.1157
Epoch 2/20
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 1s/step - loss: 2980.0420 - val_loss: 2992.0234
Epoch 3/20
[1m33/52[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m22s[0m 1s/step - loss: 3049.9458