# Mapping CVE Entries to the MITRE ATT&CK Framework

In [None]:
from IPython.display import clear_output

In [None]:
# Install the Hugging Face transformers package used for the tokenizer and model.
! pip install transformers
# Start from an empty working directory (destructive: removes everything under /content).
! rm -rf /content/*
# Download the ATT&CK Enterprise v13.1 techniques spreadsheet and give it a shorter name.
! wget https://attack.mitre.org/docs/enterprise-attack-v13.1/enterprise-attack-v13.1-techniques.xlsx
! mv /content/enterprise-attack-v13.1-techniques.xlsx /content/enterprise-techniques.xlsx
# Fetch four files by Google Drive ID.
# NOTE(review): presumably cve.csv and emb1/emb2/emb3.npy, which later cells read -- confirm.
! gdown 105bU7r9ICrYGJ8CWdrq2gQ3BGxej2yYc
! gdown 1mYLUQNI3jRGkABVCwFvhx95QcX6YOkEL
! gdown 1-snCfXjRyGh_Kuu_QKAPhTEEa_Wt-BHH
! gdown 1lmCSmhS2h1EdQznMX-HuxHSkkkF0Yo0g
# Clear the command output printed above from the cell.
clear_output()

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [None]:
# Read the ATT&CK technique table and the CVE table; the CVE file's unnamed
# first column is given the name "CVE".
enterpriseTechniquesDF = pd.read_excel("/content/enterprise-techniques.xlsx")
cveDF = pd.read_csv("/content/cve.csv").rename(columns={"Unnamed: 0": "CVE"})

# Discard rows that have no text in the column that gets embedded later.
enterpriseTechniquesDF = enterpriseTechniquesDF.dropna(subset=["description"])
cveDF = cveDF.dropna(subset=["summary"])

In [None]:
class EmbeddingDataset(Dataset):
    """Pair tokenized inputs with their attention masks for a ``DataLoader``.

    Args:
        input_ids: Sequence holding one entry of token ids per sample.
        attention_mask: Sequence holding one attention mask per sample, in
            the same order as ``input_ids``.

    Raises:
        ValueError: If the two sequences have different lengths.
    """

    def __init__(self, input_ids, attention_mask):
        # Validate once, up front, with a real exception: an ``assert`` is
        # removed when Python runs with -O and would only fire on ``len()``.
        if len(input_ids) != len(attention_mask):
            raise ValueError(
                "input_ids and attention_mask must have the same length "
                f"(got {len(input_ids)} and {len(attention_mask)})"
            )
        self.input_ids = input_ids
        self.attention_mask = attention_mask

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.attention_mask[idx]


In [None]:
class EmbeddingHelper:
    """Generate fixed-size text embeddings with a pretrained transformer model.

    The tokenizer and model are loaded from ``model_path`` at construction.
    The model is moved to ``device`` each time embeddings are generated.

    Args:
        device: Device string on which the model is run (e.g. "cpu", "cuda").
        model_path: Identifier or path passed to ``from_pretrained``.
        batch_size: Number of texts tokenized and encoded per batch.
    """

    def __init__(self, device="cpu", model_path="ehsanaghaei/SecureBERT", batch_size=16):
        self.device = device
        self.cpu = "cpu"
        self.batch_size = batch_size

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path)

    def GenerateEmbeddings(self, inputs):
        """Return one embedding per input text, in input order.

        Each text is tokenized without special tokens and truncated or padded
        to exactly 128 token ids. The model's first output is then averaged
        over the token axis.

        Args:
            inputs: List of strings to embed.

        Returns:
            A list with one CPU tensor per input text.
        """
        token_ids = []
        masks = []
        # Tokenize in slices of ``batch_size``; stepping the range covers a
        # final partial batch without computing the batch count by hand.
        for start in tqdm(range(0, len(inputs), self.batch_size)):
            encoding = self.tokenizer.batch_encode_plus(
                inputs[start:start + self.batch_size],
                add_special_tokens=False,
                max_length=128,
                truncation=True,
                padding="max_length",
                return_attention_mask=True,
                return_tensors="pt",
            )
            # Extending a list with a 2-D tensor appends one row per sample.
            token_ids.extend(encoding["input_ids"])
            masks.extend(encoding["attention_mask"])

        dataloader = DataLoader(
            EmbeddingDataset(token_ids, masks),
            batch_size=self.batch_size,
            shuffle=False,  # keep outputs aligned with the input order
        )

        self.model.to(self.device).eval()

        embeddings = []
        # Inference only: without no_grad every returned embedding would keep
        # its autograd graph (and all intermediate activations) alive, so
        # memory would grow with the number of inputs.
        with torch.no_grad():
            for batch_ids, batch_mask in tqdm(dataloader):
                batch_ids = batch_ids.to(self.device)
                batch_mask = batch_mask.to(self.device)

                token_states = self.model(batch_ids, batch_mask)[0]
                # NOTE: the mean runs over all 128 positions, padding included.
                # Left unchanged so results stay comparable with embeddings
                # produced by earlier runs.
                pooled = torch.mean(token_states, 1).to(self.cpu)
                embeddings.extend(pooled)

        return embeddings

In [None]:
# Fix the NumPy and PyTorch (CPU and CUDA) random seeds.
seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Model configuration; run on the GPU when PyTorch reports one.
model_path = "ehsanaghaei/SecureBERT"
batch_size = 16
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

helper = EmbeddingHelper(device=device, model_path=model_path, batch_size=batch_size)

# Embed every ATT&CK technique description and store the result on its row.
enterpriseTechniqueDescriptions = enterpriseTechniquesDF["description"].values.tolist()
enterpriseTechniqueEmbeddings = helper.GenerateEmbeddings(enterpriseTechniqueDescriptions)
enterpriseTechniquesDF["embeddings"] = enterpriseTechniqueEmbeddings

# Do the same for every CVE summary.
cveDescriptions = cveDF["summary"].values.tolist()
cveEmbeddings = helper.GenerateEmbeddings(cveDescriptions)
cveDF["embeddings"] = cveEmbeddings

Due to limited memory, the data had to be split into three parts, and the embeddings for each part were generated separately.

In [None]:
# Load the three saved embedding arrays from disk.
emb1, emb2, emb3 = (
    np.load(path)
    for path in ("/content/emb1.npy", "/content/emb2.npy", "/content/emb3.npy")
)

In [None]:
# Stack the three parts row-wise; the combined array replaces emb1.
emb1 = np.concatenate((emb1, emb2, emb3), axis=0)

In [None]:
# Plain Python list of the CVE summary values, in DataFrame row order.
cveSummary = cveDF["summary"].values.tolist()

In [None]:
# Load the pretrained tokenizer and model identified by "ehsanaghaei/SecureBERT".
# Both are module-level objects; the model is not moved to another device here.
tokenizer = AutoTokenizer.from_pretrained("ehsanaghaei/SecureBERT")
model = AutoModel.from_pretrained("ehsanaghaei/SecureBERT")

In [None]:
def embed(sample):
    """Embed a single text with the module-level ``tokenizer`` and ``model``.

    The text is tokenized without special tokens and truncated or padded to
    exactly 128 token ids. The model's first output is averaged over the
    token axis (padding positions included).

    Args:
        sample: The text to embed.

    Returns:
        A tensor holding the averaged embedding, with a leading batch
        dimension of size 1.
    """
    encoding = tokenizer.encode_plus(
        sample,
        add_special_tokens=False,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        token_states = model(**encoding)[0]
    return torch.mean(token_states, 1)

In [None]:
# Cosine-similarity module that reduces over dimension 0, i.e. it compares
# two 1-D vectors and yields a single value.
CosineSimilarity = nn.CosineSimilarity(dim = 0)

In [None]:
# For every CVE summary, find the row of emb1 whose embedding has the highest
# cosine similarity to the summary's embedding.
mitre = []      # per CVE: row index into emb1 of the best match
mitre_sim = []  # per CVE: cosine similarity of that match (0-dim tensor)

# Convert the stored embeddings to a tensor once, and compare against every
# row of emb1 instead of a hard-coded row count.
technique_matrix = torch.as_tensor(emb1, dtype=torch.float32)

# Inference only, so no autograd graph is needed for the embed() calls.
with torch.no_grad():
    for summary in tqdm(cveSummary):
        # embed() returns a batch of one; take the single embedding vector.
        cve_vector = embed(summary).detach()[0].to(torch.float32)

        # One batched call scores the CVE against all rows at once.
        similarities = nn.functional.cosine_similarity(
            technique_matrix, cve_vector.unsqueeze(0), dim=1
        )

        # argmax returns the first maximal index, matching the strict ">"
        # comparison of a left-to-right scan.
        best_idx = int(torch.argmax(similarities))
        mitre.append(best_idx)
        mitre_sim.append(similarities[best_idx])

In [None]:
# Print the first matches (at most 100) for manual inspection: the CVE id,
# its summary, the matched technique description and the similarity score.
cve_ids = cveDF["CVE"].values
technique_descriptions = enterpriseTechniquesDF["description"].values

# Bound the preview by the number of results so a small dataset does not
# raise IndexError.
for i in range(min(100, len(mitre))):
    print("Sample", i)
    print(cve_ids[i])
    print(cveSummary[i])
    print()
    print(technique_descriptions[mitre[i]])
    print()
    print(mitre_sim[i])
    print()
    print("-" * 100)