# Demo

## Prepare the Data

In [1]:
from config import args
from pretrain.dataloader import BERT4ETHDataloader
from models.model import BERT4ETH
from pretrain.trainer import BERT4ETHTrainer
import pickle as pkl
from pretrain.vocab import FreqVocab

# Date tag used as a suffix for the sequence and vocab file names below.
args.bizdate = "2024"


Epoch #: 200
Vocab #: 3000000
Hidden #: 64
Max Length: 100
ckpt_dir: outputs/cpkt_local
learning_rate: 0.0001
Max predictions per seq: 80


In [2]:
# Build the vocabulary from the pickled account -> sequence mapping and
# persist it next to the data so later cells can reload it.
vocab = FreqVocab()
print("===========Load Sequence===========")
seq_file_name = args.data_dir + "eoa2seq_" + args.bizdate + ".pkl"
with open(seq_file_name, "rb") as seq_file:
    eoa2seq = pkl.load(seq_file)

print("number of target user account:", len(eoa2seq))

# Feed every sequence into the vocab, then build the mapping.
vocab.update(eoa2seq)
vocab.generate_vocab()

# Report the vocabulary size and write the vocab object to disk.
token_count = len(vocab.vocab_words)
print("token_size:{}".format(token_count))
vocab_file_name = args.data_dir + args.vocab_filename + "." + args.bizdate
print('vocab pickle file: ' + vocab_file_name)
with open(vocab_file_name, 'wb') as output_file:
    pkl.dump(vocab, output_file, protocol=2)

number of target user account: 592414
token_size:2381818
vocab pickle file: inter_data/vocab.2024


## Finetune

In [4]:
from pretrain.dataloader import FineTuneLoader
from models.model import FineTuneModel
from pretrain.trainer import PhishAccountTrainer
import pickle as pkl

In [5]:

# Settings for the fine-tuning run, set on the shared `args` object.
args.bizdate = '2024'
args.num_epochs = 5
args.lr = 3e-4

# Reload the vocab and the account sequences saved under args.data_dir.
vocab_file_name = args.data_dir + args.vocab_filename + "." + args.bizdate
seq_file_name = args.data_dir + "eoa2seq_" + args.bizdate + ".pkl"

with open(vocab_file_name, "rb") as vocab_file:
    vocab = pkl.load(vocab_file)
with open(seq_file_name, "rb") as seq_file:
    eoa2seq = pkl.load(seq_file)

# dataloader
dataloader = FineTuneLoader(args, vocab, eoa2seq)
train_loader = dataloader.get_train_loader()

# model
model = FineTuneModel(args)

# trainer
trainer = PhishAccountTrainer(args, vocab, model, train_loader)
trainer.train()

model.pretrain_model.embedding.token_embed.weight torch.Size([3000000, 64]) torch.float32
model.pretrain_model.embedding.value_embed.weight torch.Size([15, 64]) torch.float32
model.pretrain_model.embedding.count_embed.weight torch.Size([15, 64]) torch.float32
model.pretrain_model.embedding.position_embed.weight torch.Size([100, 64]) torch.float32
model.pretrain_model.embedding.io_embed.weight torch.Size([3, 64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.0.weight torch.Size([64, 64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.0.bias torch.Size([64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.1.weight torch.Size([64, 64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.1.bias torch.Size([64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.2.weight torch.Size([64, 64]) torch.float32
model.pretrain_model.transformer

Epoch 1, Step 853, loss 0.003531 : 100%|██████████| 853/853 [02:57<00:00,  4.81it/s]


Saving model to: outputs/cpkt_local_phish/epoch_1.pth


Epoch 2, Step 1706, loss 0.004285 : 100%|██████████| 853/853 [02:44<00:00,  5.19it/s]
Epoch 3, Step 2559, loss 0.000323 : 100%|██████████| 853/853 [02:52<00:00,  4.95it/s]
Epoch 4, Step 3412, loss 0.001419 : 100%|██████████| 853/853 [03:17<00:00,  4.33it/s]
Epoch 5, Step 4265, loss 0.000079 : 100%|██████████| 853/853 [02:59<00:00,  4.74it/s]


Saving model to: outputs/cpkt_local_phish/epoch_5.pth


## Phish Account Testing

In [6]:
import numpy as np
from sklearn.metrics import classification_report
import  os
import torch

In [7]:

# Evaluate the most recently written fine-tuned checkpoint on the evaluation
# loader and print a classification report for a sweep of decision thresholds.
vocab_file_name = args.data_dir + args.vocab_filename + "." + args.bizdate

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by the earlier cells of this notebook.
with open(vocab_file_name, "rb") as vocab_file:
    vocab = pkl.load(vocab_file)
with open(args.data_dir + "eoa2seq_" + args.bizdate + ".pkl", "rb") as f:
    eoa2seq = pkl.load(f)
# dataloader
dataloader = FineTuneLoader(args, vocab, eoa2seq)

# model
model = FineTuneModel(args)

# trainer
test_loader = dataloader.get_eval_loader()
trainer = PhishAccountTrainer(args, vocab, model, test_loader)

# load ckpt
# Only regular "*.pth" files are considered, so stray entries in the directory
# (logs, .ipynb_checkpoints, sub-folders) can never be picked as "the latest
# checkpoint". The fine-tuning run saves files named epoch_<n>.pth.
ckpt_dir = args.ckpt_dir + "_phish"
ckpt_candidates = [
    os.path.join(ckpt_dir, entry)
    for entry in os.listdir(ckpt_dir)
    if entry.endswith(".pth")
]
ckpt_candidates = [path for path in ckpt_candidates if os.path.isfile(path)]
if not ckpt_candidates:
    raise FileNotFoundError("CKPT file for testing needed")

# Newest file by modification time; the directory name stays in `ckpt_dir`
# and the selected file gets its own name.
ckpt_path = sorted(ckpt_candidates, key=lambda t: os.stat(t).st_mtime)[-1]
print(f"load ckpt at: {ckpt_path}")

# map_location="cpu" lets a checkpoint saved from a GPU run be read on a host
# without CUDA; load_state_dict then copies the values into the model's own
# parameters on whatever device they live on.
trainer.model.load_state_dict(torch.load(ckpt_path, map_location="cpu"))

final_output, original_data = trainer.predict_proba(test_loader)
y_test_proba = np.concatenate(final_output)
y_test = np.concatenate(original_data)

for threshold in [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]:
    print("threshold =", threshold)
    # 1 where the predicted probability reaches the threshold, else 0
    # (same dtype and shape as the probability array).
    y_pred = (y_test_proba >= threshold).astype(y_test_proba.dtype)
    print(np.sum(y_pred))
    print(classification_report(y_test, y_pred, digits=4))

model.pretrain_model.embedding.token_embed.weight torch.Size([3000000, 64]) torch.float32
model.pretrain_model.embedding.value_embed.weight torch.Size([15, 64]) torch.float32
model.pretrain_model.embedding.count_embed.weight torch.Size([15, 64]) torch.float32
model.pretrain_model.embedding.position_embed.weight torch.Size([100, 64]) torch.float32
model.pretrain_model.embedding.io_embed.weight torch.Size([3, 64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.0.weight torch.Size([64, 64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.0.bias torch.Size([64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.1.weight torch.Size([64, 64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.1.bias torch.Size([64]) torch.float32
model.pretrain_model.transformer_blocks.0.attention.linear_layers.2.weight torch.Size([64, 64]) torch.float32
model.pretrain_model.transformer

100%|██████████| 183/183 [00:31<00:00,  5.73it/s]


threshold = 0.1
1515.0
              precision    recall  f1-score   support

           0     0.9994    0.9988    0.9991    185637
           1     0.8475    0.9257    0.8849      1387

    accuracy                         0.9982    187024
   macro avg     0.9235    0.9622    0.9420    187024
weighted avg     0.9983    0.9982    0.9983    187024

threshold = 0.15
1473.0
              precision    recall  f1-score   support

           0     0.9994    0.9990    0.9992    185637
           1     0.8690    0.9229    0.8951      1387

    accuracy                         0.9984    187024
   macro avg     0.9342    0.9609    0.9471    187024
weighted avg     0.9985    0.9984    0.9984    187024

threshold = 0.2
1444.0
              precision    recall  f1-score   support

           0     0.9994    0.9991    0.9992    185637
           1     0.8809    0.9171    0.8986      1387

    accuracy                         0.9985    187024
   macro avg     0.9401    0.9581    0.9489    187024
weig

### Let us try one account

In [20]:
# Take the first batch from the evaluation loader for manual inspection.
batch = next(iter(test_loader))
n_in_batch = len(batch[0])
print(f"Length of batch {n_in_batch}")

Length of batch 1024


In [61]:
# Pick one account from the batch. The index is fixed so the cell is
# reproducible; uncomment the next line to sample a random one instead.
# indx = np.random.randint(len(batch[0]))
indx = 1021
# Move the sample to whatever device the model lives on instead of
# hard-coding 'cuda', so the cell also runs on a CPU-only host and can never
# disagree with the model's device.
device = next(trainer.model.parameters()).device
data = [x[indx].to(device) for x in batch]
print(data)

[tensor([358076], device='cuda:0'), tensor([ 358076, 2326570, 2326571,      33, 2326572,      10,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,       0,       0,       0,       0,
    

In [62]:
import torch.nn.functional as F

# Score the single selected account with the fine-tuned model.
trainer.model.eval()
# Fields 1-5 of the sample are the model inputs; [None] adds a leading axis
# so each one is passed as a batch of one.
input_ids = data[1][None]
counts = data[2][None]
values = data[3][None]
io_flags = data[4][None]
positions = data[5][None]
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    logits = trainer.model(input_ids, counts, values, io_flags, positions)
    y_prob = F.sigmoid(logits).cpu().numpy()

In [63]:
# Show the predicted probability next to the label stored as the last field
# of the sample.
true_label = data[-1].cpu().numpy()
print(f"probability of the prediction {y_prob}")
print(f"true label{true_label}")


probability of the prediction [[0.9999988]]
true label[1]
