# This notebook solves a problem of Token Classification
## Referring to: https://huggingface.co/course/chapter7/2?fw=pt

### Now, use the hyper-parameters from the seeds before to train the "best" model 
#### For this model, we will use only a training set (stopping condition will be average stopping epoch from 5 seeds)

In [1]:
import numpy as np
# Number of epochs the final model is trained for.
stopping_epochs = 47 #Empirically found with grid-search
# Echo the value so the message recorded in this cell's output is reproduced on a fresh run.
print(f"Final model will keep training up to epoch: {stopping_epochs}")

Final model will keep training up to epoch: 47


In [2]:
# Keep the notebook output readable: hide FutureWarnings and everything below
# ERROR from the Hugging Face libraries, but leave their progress bars on.
import warnings
import datasets


def _errors_only_with_progress(hf_logging):
    """Set one Hugging Face logging module to ERROR level and enable its progress bars."""
    hf_logging.set_verbosity(hf_logging.ERROR)
    hf_logging.enable_progress_bar()


warnings.simplefilter(action='ignore', category=FutureWarning)
_errors_only_with_progress(datasets.utils.logging)

# Imported only after the warning filter is installed, as in the original cell order.
import transformers
_errors_only_with_progress(transformers.utils.logging)

### First import the dataset

In [3]:
import pandas as pd
# Folder with the supervised training data; the trailing slash lets file names be appended directly.
PATH = "../Dataset/Training/Supervised/"
csv_path = PATH + "token_classification.csv"
dataset = pd.read_csv(csv_path)
# Preview the first two rows.
dataset.head(2)

Unnamed: 0,tokenized_session,tokens_labels,session_id,order_id,indexes_statements_context,indexes_words_context
0,"['which', 'awk', ';', 'echo', '6z18a0jzqrz1', ...","['Discovery', 'Discovery', 'Discovery', 'Disco...",135,1,[],[]
1,"['ps', '-x', ';', 'ps', '-x', ';', 'cat', '/pr...","['Discovery', 'Discovery', 'Discovery', 'Disco...",188,1,[],[]


#### Shuffle the dataset, but use only ONE partition (training)
##### We will use the session_ids here

In [4]:
# Fixed seed so the shuffle of session ids is repeatable.
seed = 1
# One entry per distinct session_id, in shuffled order (frac=1 keeps all of them).
unique_session_ids = dataset["session_id"].drop_duplicates()
shuffled_indexes = unique_session_ids.sample(frac=1, random_state=seed)
print(f"Training set: {shuffled_indexes.shape[0]} (entire dataset)")

Training set: 359 (entire dataset)


#### Create partitions
##### Back to the original sessions (keeping subsessions with the same session_id together)

In [5]:
# Keep every row whose session_id was selected, so rows sharing a session_id
# always end up in the same partition.
in_training = dataset['session_id'].isin(shuffled_indexes)
train_df = dataset[in_training]
print(f"Training df: {train_df.shape[0]}")

Training df: 597


#### Now create huggingface dataset

In [6]:
from datasets import DatasetDict, Dataset
# Build the Hugging Face dataset from the frame's columns only.
# preserve_index=False drops the pandas index at conversion time, so the cell no
# longer depends on an '__index_level_0__' column having been generated (removing
# that column by name fails when the frame carries a default RangeIndex).
labeled_dataset = Dataset.from_pandas(train_df, preserve_index=False)
labeled_dataset

Dataset({
    features: ['tokenized_session', 'tokens_labels', 'session_id', 'order_id', 'indexes_statements_context', 'indexes_words_context'],
    num_rows: 597
})

### Read labels

In [7]:
# Read one label name per line, ignoring blank lines and surrounding whitespace.
with open("../Dataset/Training/Supervised/labels.txt", "r") as labels_file:
    stripped_lines = (line.strip() for line in labels_file)
    labels = [name for name in stripped_lines if name != ""]

In [8]:
# Show the label names read from labels.txt; their list order fixes the class ids used below.
labels

['Execution',
 'Discovery',
 'Persistence',
 'Harmless',
 'Defense Evasion',
 'Impact',
 'Other']

### Create features

In [9]:
from datasets import Features, Sequence, Value, ClassLabel
def _int32_sequence():
    """Return a fresh variable-length list-of-int32 column type."""
    return Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)


def _int32_scalar():
    """Return a fresh int32 scalar column type."""
    return Value(dtype='int32', id=None)


# Target schema for the parsed dataset. Key order is kept as-is because it
# determines the column order of the mapped dataset.
features = Features(
    {
        # Tokens of the session, one string per word.
        'tokenized_session': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
        # One class label per token; class ids follow the order of `labels`.
        'tokens_labels': Sequence(feature = ClassLabel(num_classes=len(labels), names=labels)),
        'indexes_statements_context': _int32_sequence(),
        'indexes_words_context': _int32_sequence(),
        'session_id': _int32_scalar(),
        'order_id': _int32_scalar(),
    }
)

In [10]:
import ast
# The CSV stores the list columns as strings: parse them back into Python lists and cast the ids to int
def process(ex):
    """Convert one raw CSV row into typed Python values.

    The list-like columns are stored as their string representation and are
    parsed with `ast.literal_eval`; the two id columns are cast to `int`.

    Args:
        ex: mapping with the six dataset columns.

    Returns:
        A new dict with the same six keys and the parsed values.
    """
    token_columns = ("tokenized_session", "tokens_labels")
    id_columns = ("session_id", "order_id")
    context_columns = ("indexes_statements_context", "indexes_words_context")

    parsed = {name: ast.literal_eval(ex[name]) for name in token_columns}
    parsed.update((name, int(ex[name])) for name in id_columns)
    parsed.update((name, ast.literal_eval(ex[name])) for name in context_columns)
    return parsed
# Parse every row with `process` and cast the result to the schema in `features`.
# NOTE(review): this rebinds `labeled_dataset`, so re-running this cell alone presumably
# fails (the columns are no longer strings) — re-run the cell that builds it first; confirm.
labeled_dataset = labeled_dataset.map(process, features=features)

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

In [11]:
# Show the dataset after parsing and casting.
labeled_dataset

Dataset({
    features: ['tokenized_session', 'tokens_labels', 'indexes_statements_context', 'indexes_words_context', 'session_id', 'order_id'],
    num_rows: 597
})

### Let's load a model now
#### Tokenizer first

In [13]:
from transformers import AutoTokenizer

# Hub checkpoint whose tokenizer is loaded below.
model_checkpoint = "microsoft/codebert-base"
# add_prefix_space=True: presumably needed because the inputs are later passed
# pre-split into words (is_split_into_words=True) — confirm for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

### Expand labels to new tokens

#### This version labels all subtokens with the original word's label
##### Notice that we also pass the context indexes: those tokens are **not** going to be labeled

In [14]:
def align_labels_with_tokens(labels, word_ids, indexes_context_words):
    """Expand word-level labels to one label per sub-token.

    Args:
        labels: label of each original word, indexed by word id.
        word_ids: for each sub-token, the id of the word it came from, or
            None for a special token.
        indexes_context_words: word ids that belong to the context and must
            not be labeled.

    Returns:
        A list as long as `word_ids`: -100 for special tokens and context
        words, otherwise the label of the originating word (every sub-token
        of a word receives that word's label).
    """
    ignored = -100

    def label_of(word_id):
        # Special tokens and context words are both masked out.
        if word_id is None or word_id in indexes_context_words:
            return ignored
        return labels[word_id]

    return [label_of(word_id) for word_id in word_ids]

### Do it for the entire dataset now

In [15]:
def tokenize_and_align_labels(examples, max_length = tokenizer.model_max_length):
    """Tokenize a batch of pre-split sessions and attach sub-token labels.

    Args:
        examples: batch (dict of columns) with "tokenized_session",
            "tokens_labels" and "indexes_words_context".
        max_length: maximum sequence length; longer inputs are truncated.
            The default is read from the tokenizer when the function is defined.

    Returns:
        The tokenizer output with an extra "labels" entry holding, for each
        example, the labels aligned to its sub-tokens.
    """
    encoded = tokenizer(
        examples["tokenized_session"],
        truncation=True,
        is_split_into_words=True,
        max_length=max_length,
    )
    word_level_labels = examples["tokens_labels"]
    context_indexes = examples["indexes_words_context"]

    encoded["labels"] = [
        align_labels_with_tokens(
            example_labels,
            encoded.word_ids(position),
            context_indexes[position],
        )
        for position, example_labels in enumerate(word_level_labels)
    ]
    return encoded

In [16]:
# Tokenize the whole dataset in batches. The original columns are removed, so
# only what `tokenize_and_align_labels` returns remains.
tokenized_datasets = labeled_dataset.map(
    tokenize_and_align_labels,
    fn_kwargs = {"max_length" : tokenizer.model_max_length},
    batched=True,
    remove_columns=labeled_dataset.column_names,
)

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

### Now create DataCollator

In [17]:
from transformers import DataCollatorForTokenClassification

# Collator used as the DataLoader's collate_fn to assemble each batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### Finally, define the model

In [18]:
# Recover the class names from the dataset schema (the ClassLabel inside the
# 'tokens_labels' sequence), so the ids match the ones used in the data.
classes = labeled_dataset.features["tokens_labels"]
label_names = classes.feature.names

In [19]:
# Two-way mapping between class ids and class names, passed to the model config.
id2label = dict(enumerate(label_names))
label2id = {name: class_id for class_id, name in id2label.items()}

In [20]:
# Show the id -> name mapping.
id2label

{0: 'Execution',
 1: 'Discovery',
 2: 'Persistence',
 3: 'Harmless',
 4: 'Defense Evasion',
 5: 'Impact',
 6: 'Other'}

In [21]:
# Show the name -> id mapping.
label2id

{'Execution': 0,
 'Discovery': 1,
 'Persistence': 2,
 'Harmless': 3,
 'Defense Evasion': 4,
 'Impact': 5,
 'Other': 6}

### Create PyTorch DataLoader 
#### Necessary, since we want to customize our training loop

In [22]:
import torch
# Pick the training device. `device` is always defined: without the CPU
# fallback, later cells that call `.to(device)` raise NameError on machines
# without CUDA.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    # Let cuDNN auto-tune its kernels.
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

In [23]:
from torch.utils.data import DataLoader

# Training batches of 8 examples, reshuffled every epoch and assembled by `data_collator`.
# NOTE(review): no random seed is set in the visible cells, so the batch order
# presumably differs between runs — confirm whether that is intended.
dataloader = DataLoader(
    tokenized_datasets,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

In [24]:
from transformers import AutoModelForTokenClassification

# Local checkpoint directory to start from (relative to the notebook).
model_name = "./Finetuned_model/codebert-base_bash_finetuned/tokenizer_pretrained_epochs_5_padded_256/"
# NOTE(review): model_name already ends with "/", so this path contains "//" — confirm that is harmless on the target OS.
config_model = f"{model_name}/config.json"

# Load the checkpoint as a token-classification model labelled with
# id2label / label2id, then move it to `device`.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    config = config_model
).to(device)

In [25]:
from torch.optim import AdamW

# AdamW over all model parameters; 1e-6 is the initial learning rate that the scheduler may later reduce.
optimizer = AdamW(model.parameters(), lr=1e-6)

### Define parameters for the learning-rate scheduler

In [27]:
# Patience passed to ReduceLROnPlateau: epochs without improvement tolerated before the learning rate is reduced.
patience_lr_scheduler = 2

In [28]:
#from transformers import get_scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Train for the fixed epoch budget defined in the first cell. The variable is
# `stopping_epochs`; the previous `stopping_epoch` was undefined and raised NameError.
num_train_epochs = stopping_epochs
num_update_steps_per_epoch = len(dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Reduce the learning rate when the monitored value ('min' mode: lower is better)
# has not improved for more than `patience_lr_scheduler` epochs; never go below 1e-8.
lr_scheduler = ReduceLROnPlateau(optimizer, 'min', patience = patience_lr_scheduler, verbose = True, min_lr = 1e-8)

#### Again, we do not want warnings

In [29]:
import transformers
# Same transformers logging configuration as in the setup cell, applied again before training.
transformers.utils.logging.set_verbosity(transformers.utils.logging.ERROR)
transformers.utils.logging.enable_progress_bar()

In [30]:
%%time
import numpy as np
from tqdm import tqdm
from copy import copy
import torch
import evaluate

# Fine-tune for the fixed number of epochs; this loop has no validation pass.
for epoch in range(num_train_epochs):
    # "\r" keeps the progress message on a single, overwritten line.
    print(f"Epoch {epoch}...", end = "\r")
    # Per-batch training losses of this epoch (averaged for the scheduler below).
    batch_loss_training = []
    # Training
    model.train()

    for local_batch in dataloader:
        # Move the collated batch to the training device.
        batch = local_batch.to(device)
        outputs = model(**batch)
        batch_loss_training.append(outputs.loss.item())
        outputs.loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
    # Step the plateau scheduler on this epoch's mean training loss.
    lr_scheduler.step(np.mean(batch_loss_training))

Epoch 46...

### Save model

##### Model

In [31]:
# `os` was never imported in the notebook, so this cell raised NameError on a fresh kernel.
import os

# Create the output folder if it does not exist, then save the fine-tuned model into it.
os.makedirs("./Trained_Model", exist_ok = True)
model.save_pretrained("./Trained_Model/CodeBERT")