In [34]:
import json
import pickle
from bpe import Encoder
import re

In [35]:
line_paths = "/media/data1/labels/SOREL_110000/SOREL_labels.ldjson"

In [36]:
def tokenize_label(label):
    """Split an AV label into a list of tokens."""

    tokens = re.split(r"[^0-9a-zA-Z]", label)
    tokens = [tok.lower().strip() for tok in tokens]
    tokens = [tok for tok in tokens if len(tok)]
    return tokens

In [37]:
test_corpus = []

with open(line_paths, 'r') as file:
    for line in file.readlines():
        report = json.loads(line)
        for av in report["scans"].keys():
            scan_info = report["scans"][av]
            if scan_info.get("result") is None:
                continue
            else:
                label = scan_info["result"]
                tokens = tokenize_label(label)
                test_corpus += tokens

string_test_corpus = ""
for i, token in enumerate(test_corpus):
    string_test_corpus += token + " "
    if i % 50 == 0:
        string_test_corpus += "\n"
    if i % 10000 == 0:
        print(f"processed {i} tokens")
# string_test_corpus = " ".join(test_corpus)

with open("test_corpus.txt", "w") as file:
    file.write(string_test_corpus)

print("first 50 characters of test_corpus:", string_test_corpus[:50])



processed 0 tokens
processed 10000 tokens
processed 20000 tokens
processed 30000 tokens
processed 40000 tokens
processed 50000 tokens
processed 60000 tokens
processed 70000 tokens
processed 80000 tokens
processed 90000 tokens
processed 100000 tokens
processed 110000 tokens
processed 120000 tokens
processed 130000 tokens
processed 140000 tokens
processed 150000 tokens
processed 160000 tokens
processed 170000 tokens
processed 180000 tokens
processed 190000 tokens
processed 200000 tokens
processed 210000 tokens
processed 220000 tokens
processed 230000 tokens
processed 240000 tokens
processed 250000 tokens
processed 260000 tokens
processed 270000 tokens
processed 280000 tokens
processed 290000 tokens
processed 300000 tokens
processed 310000 tokens
processed 320000 tokens
processed 330000 tokens
processed 340000 tokens
processed 350000 tokens
processed 360000 tokens
processed 370000 tokens
processed 380000 tokens
processed 390000 tokens
processed 400000 tokens
processed 410000 tokens
proces

In [38]:
with open("test_corpus.txt", "r") as file:
    string_test_corpus = file.read()

In [39]:
encoder = Encoder(pct_bpe=1.05, silent=True)

# encoder = Encoder(1000, pct_bpe=1.1, silent=False)

# encoder.fit(string_test_corpus[:6000].split('\n'))
encoder.fit(string_test_corpus.split('\n'))

In [40]:
print("bpe vocab:", encoder.bpe_vocab)
print("bpe vocab size:", len(encoder.bpe_vocab))
print("word vocab:", encoder.word_vocab)
print("word vocab size:", len(encoder.word_vocab))
print("vocab size:", encoder.vocab_size)
print("test_corpus size:", len(string_test_corpus))

print(encoder.tokenize(string_test_corpus[:40]))


bpe vocab: {'__sow': 2, '__eow': 3, 'a': 4, 'n': 5, 'r': 6, 'i': 7, 'e': 8, 'o': 9, 't': 10, 'c': 11, 'w': 12, 's': 13, '2': 14, '3': 15, 'd': 16, 'l': 17, 'g': 18, 'in': 19, 'm': 20, 'u': 21, '32': 22, 'en': 23, 'j': 24, 'tr': 25, 'an': 26, 'ro': 27, 'wi': 28, 'ge': 29, 'p': 30, 'n3': 31, '0': 32, 'oj': 33, 'b': 34, 'ja': 35, 'v': 36, '1': 37, 'f': 38, 'ic': 39, 'er': 40, 'k': 41, 'h': 42, 'al': 43, 'ri': 44, 'ar': 45, 'us': 46, 're': 47, 'ma': 48, '4': 49, 'ne': 50, '8': 51, '5': 52, 'or': 53, 'y': 54, 'li': 55, '6': 56, 'vi': 57, '9': 58, '7': 59, 'ad': 60, 'wa': 61, 'ir': 62, 'nt': 63, 'de': 64, 'do': 65, 'io': 66, 'ck': 67, 'lo': 68, 'co': 69, 'w3': 70, 'ou': 71, 'ci': 72, 'ru': 73, '00': 74, 'ra': 75, 'on': 76, 'ti': 77, 'ag': 78, 'ur': 79, 'st': 80, 'z': 81, 'ns': 82, 'lw': 83, 'at': 84, 'he': 85, 'ac': 86, 'x': 87, 'is': 88, 'fi': 89, 'id': 90, 'ce': 91, 'oo': 92, 'eu': 93, 'ia': 94, 'wo': 95, 'nf': 96, 'nc': 97, 'un': 98, 'to': 99, 'el': 100, 'va': 101, 'q': 102, 'pe': 103, 's

In [41]:
print(type(string_test_corpus))

<class 'str'>


In [42]:
with open('encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

In [43]:
with open('encoder.pkl', 'rb') as file:
    encoder = pickle.load(file)

print(encoder.tokenize(string_test_corpus[:40]))

['__sow', 'w3', '2', '__eow', '__sow', 'fa', 'mv', 't', '__eow', '__sow', 'sy', 'tr', 'o', '__eow', '__sow', 'wo', 'rm', '__eow', '__sow', 'ge', 'ne', 'ri', 'c', '__eow', '__sow', 'ma', 'lw', 'ar', 'e', '__eow', '__sow', 'sn', '__eow']


In [44]:
print("bpe vocab:", encoder.bpe_vocab)
print("bpe vocab size:", len(encoder.bpe_vocab))
print("word vocab:", encoder.word_vocab)
print("word vocab size:", len(encoder.word_vocab))
print("vocab size:", encoder.vocab_size)
print("test_corpus size:", len(string_test_corpus))

bpe vocab: {'__sow': 2, '__eow': 3, 'a': 4, 'n': 5, 'r': 6, 'i': 7, 'e': 8, 'o': 9, 't': 10, 'c': 11, 'w': 12, 's': 13, '2': 14, '3': 15, 'd': 16, 'l': 17, 'g': 18, 'in': 19, 'm': 20, 'u': 21, '32': 22, 'en': 23, 'j': 24, 'tr': 25, 'an': 26, 'ro': 27, 'wi': 28, 'ge': 29, 'p': 30, 'n3': 31, '0': 32, 'oj': 33, 'b': 34, 'ja': 35, 'v': 36, '1': 37, 'f': 38, 'ic': 39, 'er': 40, 'k': 41, 'h': 42, 'al': 43, 'ri': 44, 'ar': 45, 'us': 46, 're': 47, 'ma': 48, '4': 49, 'ne': 50, '8': 51, '5': 52, 'or': 53, 'y': 54, 'li': 55, '6': 56, 'vi': 57, '9': 58, '7': 59, 'ad': 60, 'wa': 61, 'ir': 62, 'nt': 63, 'de': 64, 'do': 65, 'io': 66, 'ck': 67, 'lo': 68, 'co': 69, 'w3': 70, 'ou': 71, 'ci': 72, 'ru': 73, '00': 74, 'ra': 75, 'on': 76, 'ti': 77, 'ag': 78, 'ur': 79, 'st': 80, 'z': 81, 'ns': 82, 'lw': 83, 'at': 84, 'he': 85, 'ac': 86, 'x': 87, 'is': 88, 'fi': 89, 'id': 90, 'ce': 91, 'oo': 92, 'eu': 93, 'ia': 94, 'wo': 95, 'nf': 96, 'nc': 97, 'un': 98, 'to': 99, 'el': 100, 'va': 101, 'q': 102, 'pe': 103, 's