# Preparing the environment
Load the OpTrans tokenizer and encoder, select the compute device, and set the path to the case-study file.

In [1]:
import torch
import json
from optrans import tokenize_function
from transformers import AutoModel, AutoTokenizer

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The tokenizer and the encoder are both loaded from the same checkpoint id.
MODEL_ID = "sandspeare/optrans"

# NOTE: trust_remote_code=True runs Python code shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

encoder = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
encoder = encoder.to(device)

# JSON file with the case-study functions, read by the encoding cell.
file = "./CaseStudy/casestudy.json"

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Encode the binary code

In [2]:

def _encode_padded(tokens):
    """Pad one `tokenize_function` result and return the encoder output for it.

    The single sample is wrapped in a list, padded to a multiple of 8,
    converted to PyTorch tensors, moved to `device`, and fed to `encoder`.
    """
    batch = tokenizer.pad(
        [tokens],
        padding=True,
        pad_to_multiple_of=8,
        return_tensors="pt",
        verbose=False,
    )
    return encoder(**batch.to(device))


# Load the case-study entries; the keys used below are "O0", "O0_inline" and "O3".
with open(file) as fp:
    data = json.load(fp)

print(data)

# Inference only: no gradients are needed for the embeddings.
with torch.no_grad():
    # Tokenize all three variants first ...
    asm_O0 = tokenize_function(tokenizer, data["O0"])
    asm_O0_inline = tokenize_function(tokenizer, data["O0_inline"])
    asm_O3 = tokenize_function(tokenizer, data["O3"])

    # ... then encode them in the same order.
    asm_embedding_O0 = _encode_padded(asm_O0)
    asm_embedding_O0_inline = _encode_padded(asm_O0_inline)
    asm_embedding_O3 = _encode_padded(asm_O3)

for embedding in (asm_embedding_O0, asm_embedding_O0_inline, asm_embedding_O3):
    print(embedding)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'O0': {'$loc_1$': ['$op_m_call$', '$function$0', '<', '$8.arg$0', '>'], '$loc_2$': ['$op_m_mov$', '$op_m_icall$', 'cs', '[', 'ds', '(', '[', 'ds', '[', 'ds', '(', '$8.arg$0', '$op_m_add$', '0x68', ')', ']', ']', '$op_m_add$', '0x68', ')', ']', '<', '[', 'ds', '(', '$8.arg$0', '$op_m_add$', '0x68', ')', ']', ',', '$op_m_xdu$', '$4.arg$1', ',', '[', 'ds', '(', '$8.arg$0', '$op_m_add$', '0x48', ')', ']', ',', '[', 'ds', '(', '$8.arg$0', '$op_m_add$', '0x50', ')', ']', '>', '$8.result$'], '$loc_3$': ['$ret$']}, 'O0_inline': {'$loc_1$': ['$op_m_mov$', '$8.arg$0', '$8.stack$5'], '$loc_2$': ['$op_m_add$', '$8.arg$0', '0x88', '$8.stack$3'], '$loc_3$': ['$op_m_ldx$', 'ds', '(', '$8.arg$0', '$op_m_add$', '0x80', ')', '$8.stack$4'], '$loc_4$': ['$op_m_low.4$', '$op_m_icall$', 'cs', '[', 'ds', '(', '[', 'ds', '[', 'ds', '(', '$8.arg$0', '$op_m_add$', '0x68', ')', ']', ']', '$op_m_add$', '0x10', ')', ']', '<', '[', 'ds', '(', '$8.arg$0', '$op_m_add$', '0x68', ')', ']', '>', '$4.stack$0'], '$loc_5$

# Perform similarity comparison

In [3]:
# Divisor applied to the raw similarity scores before the softmax (a softmax
# temperature). NOTE(review): presumably matches the value used when the model
# was trained — confirm before changing it.
TEMPERATURE = 0.07

# Similarity of each unoptimized variant against the O3 function. Each
# embedding is a 2-D tensor (required by torch.mm); squeeze() collapses the
# matrix product to a 0-dim scalar tensor.
sim_O0vsO3 = torch.mm(asm_embedding_O0, asm_embedding_O3.T).squeeze() / TEMPERATURE
sim_O0_inlinevsO3 = torch.mm(asm_embedding_O0_inline, asm_embedding_O3.T).squeeze() / TEMPERATURE

# torch.stack joins the two scalar tensors into a shape-(2,) tensor while
# keeping their dtype and device, instead of rebuilding a tensor from a
# Python list of tensors.
category_scores = torch.stack([sim_O0vsO3, sim_O0_inlinevsO3])

# Normalize the two scores into probabilities that sum to 1.
probabilities = torch.nn.functional.softmax(category_scores, dim=0).tolist()

print("Probability: O0 vs O3 = ", round(probabilities[0], 3))
print("Probability: O0_inline vs O3 = ", round(probabilities[1], 3))



Probability: O0 vs O3 =  0.079
Probability: O0_inline vs O3 =  0.921
