In [1]:
# To avoid warnings
import warnings
import datasets
warnings.simplefilter(action='ignore', category=FutureWarning)
datasets.utils.logging.set_verbosity(datasets.utils.logging.ERROR)
datasets.utils.logging.enable_progress_bar()

import transformers
transformers.utils.logging.set_verbosity(transformers.utils.logging.ERROR)
transformers.utils.logging.enable_progress_bar()

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [22]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    torch.backends.cudnn.benchmark = True

### Import big unlabeled dataset

#### Choose which dataset

In [5]:
dataset = "Cyberlab"

In [6]:
import pandas as pd
inference_corpus = pd.read_csv(f"../Dataset/Inference/reduced_sessions.csv")
print(f"Inference corpus contains {inference_corpus.shape[0]} examples")
inference_corpus.head(2)

Inference corpus contains 771477 examples


Unnamed: 0,session_id,sessions,indexes_statements_context,order_id,indexes_words_context
0,0,enable ; system ; shell ; sh ; cat /proc/mount...,"[17, 16, 15, 14]",1,"[39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 5..."
1,0,tftp ; wget ; /bin/busybox SAEMW ; dd bs=52 co...,"[0, 1, 2, 3]",2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]"


### Obtain tokens

In [7]:
inference_corpus["tokenized_session"] = inference_corpus["sessions"].progress_apply(lambda sessions: sessions.split(" "))
inference_corpus.drop("sessions", axis = 1, inplace = True)
inference_corpus.head(2)

100%|██████████| 771477/771477 [00:07<00:00, 107143.92it/s]


Unnamed: 0,session_id,indexes_statements_context,order_id,indexes_words_context,tokenized_session
0,0,"[17, 16, 15, 14]",1,"[39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 5...","[enable, ;, system, ;, shell, ;, sh, ;, cat, /..."
1,0,"[0, 1, 2, 3]",2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[tftp, ;, wget, ;, /bin/busybox, SAEMW, ;, dd,..."


### Convert Pandas dataframe to Huggingface dataset

In [8]:
from datasets import Dataset
inference_dataset = Dataset.from_pandas(inference_corpus)
inference_dataset

Dataset({
    features: ['session_id', 'indexes_statements_context', 'order_id', 'indexes_words_context', 'tokenized_session'],
    num_rows: 771477
})

### Read labels

In [9]:
with open("../Dataset/Training/Supervised/labels.txt", "r") as f:
    labels = [el.strip() for el in f.readlines() if el.strip()!=""]

In [10]:
" - ".join(labels)

'Execution - Discovery - Persistence - Harmless - Defense Evasion - Impact - Other'

### Create features

In [11]:
from datasets import Features, Sequence, Value, ClassLabel
features = Features(
    {
        'tokenized_session': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
        'indexes_statements_context':  Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
        'indexes_words_context':  Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
        'session_id': Value(dtype='int32', id=None),
        'order_id': Value(dtype='int32', id=None),
    }
)

In [12]:
import ast
import multiprocessing
# load the dataset and copy the features > "tokenized_session": ast.literal_eval(ex["tokenized_session"]),
def process(ex):
    return {"tokenized_session": ex["tokenized_session"], 
            "session_id": int(ex["session_id"]),
            "order_id": int(ex["order_id"]),
            "indexes_statements_context": ast.literal_eval(ex["indexes_statements_context"]),
            "indexes_words_context": ast.literal_eval(ex["indexes_words_context"])
           }
inference_dataset = inference_dataset.map(process, features=features, num_proc= multiprocessing.cpu_count() - 1)

Map (num_proc=63):   0%|          | 0/771477 [00:00<?, ? examples/s]

#### Create mapping from classes to classes_id

In [13]:
id2label = {i: label for i, label in enumerate(labels)}
label2id = {v: k for k, v in id2label.items()}
id2label

{0: 'Execution',
 1: 'Discovery',
 2: 'Persistence',
 3: 'Harmless',
 4: 'Defense Evasion',
 5: 'Impact',
 6: 'Other'}

### Loading the tokenizer

In [14]:
from transformers import AutoTokenizer
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

### Tokenize dataset
#### First give it a shot with larger max size (1024)

In [15]:
def mask_context_tokens(context_words, word_ids, input_ids):
    tokens_to_keep = []
    previous_word = None
    for word_id, input_id in zip(word_ids, input_ids):
        if word_id is None:
            # Special token
            tokens_to_keep.append(-100)
        elif word_id in context_words:
            # Part of context words I don't want to label
            tokens_to_keep.append(-100)
        elif word_id == previous_word:
            # Same word as previous token
            tokens_to_keep.append(-100)
        else:
            # Start of a new word!
            previous_word = word_id
            tokens_to_keep.append(input_id)
    return tokens_to_keep

In [16]:
def tokenize_and_align_labels(examples, max_length):
    tokenized_inputs = tokenizer(
        examples["tokenized_session"], truncation=True, is_split_into_words=True, max_length = max_length
    )
    indexes_words_context = examples["indexes_words_context"]
    context_tokens = []
    for it in range(len(examples["tokenized_session"])):
        context_words = indexes_words_context[it]
        word_ids = tokenized_inputs.word_ids(it)
        input_ids = tokenized_inputs.input_ids[it]
        context_tokens.append(mask_context_tokens(context_words, word_ids, input_ids))
    # Later on, DataCollator will not complain and will pad as input_ids > that is why we kept the name
    tokenized_inputs["labels"] = context_tokens
    return tokenized_inputs

In [17]:
tokenized_datasets = inference_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=inference_dataset.column_names, 
    fn_kwargs = {"max_length" : 1024},
    num_proc= multiprocessing.cpu_count() - 1
)

Map (num_proc=63):   0%|          | 0/771477 [00:00<?, ? examples/s]

#### How many sessions got truncated?

In [18]:
from tqdm import tqdm
import re
truncated_sessions = 0 
truncated_tokens = 0
all_tokens = 0
for it in tqdm(range(inference_dataset.shape[0])):
    tokenized_session = tokenized_datasets[it]["input_ids"]
    n_tokens_session = len(tokenized_session)
    if n_tokens_session >= tokenizer.model_max_length:
        truncated_sessions += 1
        truncated_tokens += (n_tokens_session - tokenizer.model_max_length) 
    all_tokens += n_tokens_session
print(f"Number of truncated sessions: {truncated_sessions} ({truncated_sessions/inference_dataset.shape[0] * 100:.2f} %)")
print(f"Number of truncated tokens: {truncated_tokens} ({truncated_tokens/all_tokens * 100:.2f} %)")

100%|██████████| 771477/771477 [02:06<00:00, 6096.43it/s]

Number of truncated sessions: 9 (0.00 %)
Number of truncated tokens: 1025 (0.00 %)





### Now real tokenization:

In [19]:
tokenized_datasets = inference_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=['indexes_statements_context', 'order_id', 'indexes_words_context', 'tokenized_session'], 
    fn_kwargs = {"max_length" : tokenizer.model_max_length},
    num_proc= multiprocessing.cpu_count() - 1
)

Map (num_proc=63):   0%|          | 0/771477 [00:00<?, ? examples/s]

### Now create DataCollator

#### Among others, DataCollator pad all sessions to same length

In [20]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### Create PyTorch DataLoader 
#### Necessary, since using model that does not have a AutoModelForTokenClassification

In [21]:
from torch.utils.data import DataLoader
inference_dataloader = DataLoader(
    tokenized_datasets, collate_fn=data_collator, batch_size=8
)

### Load the model now

In [22]:
from transformers import AutoModelForTokenClassification
model_name = "../Training/Trained_Model/CodeBERT"
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    config = model_name + "config.json"
).to(device)

### Start the inference

In [23]:
%%time 

from tqdm import tqdm
model.eval()
predictions_list = []
logits_list = []
session_ids_list = []

for local_batch in tqdm(inference_dataloader):
    with torch.no_grad():
        batch = local_batch.to(device)
        session_id = batch["session_id"]
        session_id = session_id.reshape(session_id.shape[0], 1)
        context_tokens = batch["labels"]
        del batch["labels"]
        del batch["session_id"]
        outputs = model(**batch)

    input_ids = batch.input_ids
    flatten_input_ids = input_ids.reshape(input_ids.shape[0] * input_ids.shape[1])
    flatten_context_tokens = context_tokens.reshape(context_tokens.shape[0] * context_tokens.shape[1])
    """
    Only keeping indexes which are not:
    - Special tokens (e.g., start token, end token, ...)
    - Context tokens (i.e., the ones we addes for the mitigation policy)
    """
    mask = flatten_context_tokens != -100
    #print(f"At the beginning, session id has shape: {session_id.shape}")
    expanded_session_id = session_id.expand(session_id.shape[0], input_ids.shape[1]).reshape(session_id.shape[0]*input_ids.shape[1])
    #print(f"After the expansion, session id has shape: {expanded_session_id.shape}")
    masked_session_id = expanded_session_id[mask]
    logits = outputs.logits
    flatten_logits = logits.reshape(logits.shape[0] * logits.shape[1], logits.shape[2])
    predicted_logits = flatten_logits[mask].max(dim=-1).values
    predictions = flatten_logits[mask].argmax(dim=-1)
    #print(f"After masking, session id has shape: {masked_session_id.shape}")
    #print(f"Notice that also the predictions has shape: {predictions.shape}")    
    predictions_list += list(predictions.detach().cpu().clone().numpy())
    logits_list += list(predicted_logits.detach().cpu().clone().numpy())
    session_ids_list += list(masked_session_id.detach().cpu().clone().numpy())

 54%|█████▍    | 52103/96435 [23:21<22:17, 33.14it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|██████████| 96435/96435 [44:49<00:00, 35.86it/s]

CPU times: user 44min 42s, sys: 22 s, total: 45min 4s
Wall time: 44min 49s





#### Convert to arrays

In [24]:
import numpy as np
predictions, logits, sessions_ids = np.array(predictions_list), np.array(logits_list), np.array(session_ids_list)

### Import thresholds
#### we trained on 5 seeds -> average

In [25]:
df_thresholds = pd.read_csv("../Training/Trained_Model/thresholds.csv").rename({"Unnamed: 0":"Class"}, axis = 1)
df_thresholds.set_index('Class', inplace=True)
df_thresholds.head(2)

Unnamed: 0_level_0,mean,lower_interval,upper_interval
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Execution,0.570615,0.509742,0.631488
Discovery,0.604162,0.496116,0.712208


#### Import min max scalers

In [26]:
df_min_max = pd.read_csv("../Training/Trained_Model/min_max_scaler.csv")
min_training_logits = df_min_max["min_training_logits"].iloc[0]
max_training_logits = df_min_max["max_training_logits"].iloc[0]

In [27]:
df_model_output = pd.DataFrame(list(zip(predictions, sessions_ids, logits)), columns =['Predictions', 'Session_ids', 'Logits'])
print(f"We made {df_model_output.shape[0]} predictions")
df_model_output.head(2)

We made 28151440 predictions


Unnamed: 0,Predictions,Session_ids,Logits
0,1,0,7.045496
1,1,0,6.97562


In [28]:
def check_threshold(prediction, logit):
    normalized_logit = (logit - min_training_logits)/ max_training_logits
    if normalized_logit >= df_thresholds.loc[id2label[prediction]]["mean"]: 
        return int(prediction)
    else:
        return 10
df_model_output["new_predictions"] = df_model_output.progress_apply(lambda row: check_threshold(row.Predictions, row.Logits), axis = 1) 
df_model_output.head()

100%|██████████| 28151440/28151440 [20:16<00:00, 23135.83it/s]


Unnamed: 0,Predictions,Session_ids,Logits,new_predictions
0,1,0,7.045496,1
1,1,0,6.97562,1
2,1,0,7.038234,1
3,1,0,6.960073,1
4,1,0,7.041942,1


### Export results

In [29]:
import csv

In [30]:
df_model_output.to_csv(f"./predictions.csv", index=False, quoting=csv.QUOTE_ALL)

In [31]:
inference_corpus[["session_id", "order_id", "tokenized_session"]].to_csv(f"./to_join_with_predictions.csv", index=False, quoting=csv.QUOTE_ALL)