In [16]:
# Initial coverage: 14.73665% Final coverage: 14.78238%
import json
import logging
import os
import pickle
import random
import sys
import traceback
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import tqdm
from optimum.bettertransformer import BetterTransformer
from torch import optim
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import RobertaConfig, RobertaForMaskedLM

from rl.dqn import DQN, ReplayMemory
from rl.env import FuzzingEnv
from rl.fuzzing_action import FuzzingAction
from rl.tokenizer import ASTTokenizer
from utils.logging import setup_logging

# Interpreter / logging setup. The raised recursion limit is presumably needed for
# deeply nested ASTs handled elsewhere in the project -- TODO confirm.
sys.setrecursionlimit(20_000)
setup_logging()

# Upper bound on the length of one AST fragment sequence.
MAX_FRAGMENT_SEQ_LEN = 512

# Special vocabulary tokens, looked up in `token_to_id` when building model inputs.
PAD_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN, UNK_TOKEN = (
    "<pad>",
    "<s>",
    "</s>",
    "<mask>",
    "<unk>",
)

In [17]:
import pickle
import tqdm

# Load the pickled fragment corpus and vocabulary.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open("../ASTBERTa/frag_data_new.pkl", "rb") as f:
    frag_data = pickle.load(f)

with open("../ASTBERTa/vocab_data_new.pkl", "rb") as f:
    vocab_data = pickle.load(f)

# Pull the individual tables out of the two containers in one step each.
frag_seqs, frag_id_to_type, frag_id_to_frag, frag_type_to_id = (
    frag_data[key]
    for key in ("frag_seqs", "frag_id_to_type", "frag_id_to_frag", "frag_type_to_id")
)

vocab, token_to_id, id_to_token, special_token_ids = (
    vocab_data[key]
    for key in ("vocab", "token_to_id", "id_to_token", "special_token_ids")
)

In [18]:
# Run on the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Encoder hyper-parameters. They presumably have to match the checkpoint loaded
# below, since its state dict is loaded into a model built from this config.
vocab_size = len(vocab)  # size of vocabulary
intermediate_size = 3072  # width of the feed-forward ("intermediate") layer
hidden_size = 768  # embedding / hidden-state dimension

num_hidden_layers = 6
num_attention_heads = 12
dropout = 0.1

config = RobertaConfig(
    vocab_size=vocab_size,
    hidden_size=hidden_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
    hidden_dropout_prob=dropout,
    max_position_embeddings=MAX_FRAGMENT_SEQ_LEN + 2,
)


# Load the ASTBERTa model
tokenizer = ASTTokenizer(vocab, token_to_id, MAX_FRAGMENT_SEQ_LEN, device)
# map_location lets a checkpoint that was saved from a GPU be loaded on the CPU
# fallback selected above; without it torch.load tries to restore the tensors to
# their original device.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
pretrained_model = torch.load(
    "../ASTBERTa/models/2023-06-13T18:35:.286473/model_18500.pt",
    map_location=device,
)
# pretrained_model = torch.load("../ASTBERTa/models/final/model_27500.pt")


# Unwrap the underlying module if the checkpoint was saved as a DataParallel wrapper.
if isinstance(pretrained_model, torch.nn.DataParallel):
    pretrained_model = pretrained_model.module

# ast_net = RobertaForMaskedLM.from_pretrained(
#     "../ASTBERTa/models/new/checkpoint-35000"
# ).to(device)

# Build a fresh masked-LM model from `config` and fill it with the checkpoint's weights.
ast_net = RobertaForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=None,
    state_dict=pretrained_model.state_dict(),
    config=config,
).to(device)

ast_net = BetterTransformer.transform(ast_net)

In [19]:
from js_ast.fragmentise import hash_frag


def tokenize(frag_seq):
    """Turn one fragment sequence into a ``(1, L)`` tensor of token ids.

    Each fragment is looked up in ``token_to_id`` by ``hash_frag(frag)``. On a
    miss, the lookup is retried with the hash of a fragment that keeps only the
    original's ``"type"`` entry; if that misses too, the original hash is
    printed and the ``UNK_TOKEN`` id is used.

    Collection stops once ``MAX_FRAGMENT_SEQ_LEN`` ids (including the leading
    ``CLS_TOKEN``) have been gathered; ``SEP_TOKEN`` is appended only when there
    is still room. Finally a random start index in ``[1, len - 1]`` is drawn and
    the result is ``CLS_TOKEN`` followed by the ids from that index onwards
    (at most ``MAX_FRAGMENT_SEQ_LEN - 1`` of them).

    Args:
        frag_seq: Iterable of fragments. A fragment must support
            ``frag["type"]`` whenever its own hash is not in the vocabulary
            (presumably AST-node dicts -- confirm against the data producer).

    Returns:
        A ``torch.long`` tensor with a leading batch dimension of 1. The output
        is not deterministic because it depends on the ``random`` module.
    """
    cls_id = token_to_id[CLS_TOKEN]
    ids: list[int] = [cls_id]

    for fragment in frag_seq:
        key = hash_frag(fragment)
        if key not in token_to_id:
            # Out-of-vocabulary fragment: retry with its type-only form.
            type_only: dict[str, str] = {"type": fragment["type"]}
            type_only_key = hash_frag(type_only)
            if type_only_key in token_to_id:
                key = type_only_key
            else:
                print(f"UNK_TOKEN: {key}")
                key = UNK_TOKEN
        ids.append(token_to_id[key])

        if len(ids) >= MAX_FRAGMENT_SEQ_LEN:
            break

    # Close the sequence only if it was not cut off at the length limit.
    if len(ids) < MAX_FRAGMENT_SEQ_LEN:
        ids.append(token_to_id[SEP_TOKEN])

    # Keep a randomly positioned tail of the sequence, re-prefixed with CLS.
    start = random.randint(1, len(ids) - 1)
    window = ids[start : start + MAX_FRAGMENT_SEQ_LEN - 1]

    return torch.tensor([[cls_id, *window]], dtype=torch.long)

In [20]:
# Masked-LM check: for every fragment sequence, mask roughly 5% of the token
# positions and measure the cross-entropy of the model's predictions there.
#
# NOTE: despite its name, `acc` collects loss values (lower is better), not an
# accuracy. It is emptied every 100 iterations after the progress bar is
# refreshed, so once the loop ends it only holds the trailing window.
# NOTE(review): special tokens such as the leading CLS can be masked as well;
# `special_token_ids` is loaded above but not used here -- confirm intent.
acc = []

# Built once instead of on every iteration. Positions labelled -100 are skipped
# by the loss (PyTorch's default ignore_index).
criterion = nn.CrossEntropyLoss()

# Pure evaluation: nothing is back-propagated, so do not record an autograd graph.
with torch.no_grad():
    for i, seq in (bar := tqdm.tqdm(enumerate(frag_seqs), total=len(frag_seqs))):
        labels = tokenize(seq).to(device)
        inputs = labels.clone()
        attention_mask = torch.ones_like(inputs, device=device)

        # Draw the mask on the same device as the tensors it is used to index.
        probability_matrix = torch.full(labels.shape, 0.05, device=device)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        if not masked_indices.any():
            # No position was masked, so every label would be -100 and the
            # mean loss would be NaN; skip the forward pass for this sample.
            continue

        labels[~masked_indices] = -100
        inputs[masked_indices] = token_to_id[MASK_TOKEN]

        out = ast_net(input_ids=inputs, attention_mask=attention_mask)
        loss = criterion(out.logits.view(-1, vocab_size), labels.view(-1))

        # Safety net for any remaining NaN loss.
        if torch.isnan(loss):
            continue

        acc.append(loss.item())
        if i % 100 == 0:
            # Show the mean loss of the current window, then start a new one.
            bar.set_postfix({"acc": sum(acc) / len(acc)})
            acc = []

  9%|▉         | 1290/14017 [00:07<01:16, 167.08it/s, acc=12.3]


KeyboardInterrupt: 

In [None]:
# Mean of the values currently held in `acc`. The evaluation loop empties
# `acc` every 100 iterations, so this covers only the most recent window and
# the list can be empty (e.g. when the loop stops right after a reset).
if acc:
    print(sum(acc) / len(acc))
else:
    print("acc is empty: no losses recorded since the last reset")

0.2131926783871443


The model has 32,108,800 trainable parameters
RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(20000, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-3): 4 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, 