In [2]:
import os
from pathlib import Path
import pandas as pd
import regex as re
from tqdm import tqdm

def load_wikitext2_from_parquet(parquet_path="../../train-00000-of-00001.parquet", text_column="text"):
    parquet_file = Path(parquet_path)
    
    if not parquet_file.exists():
        raise FileNotFoundError(f"Parquet file not found: {parquet_path}")
    
        
    df = pd.read_parquet(parquet_file)
    print(f"Loaded parquet file with {len(df)} rows using pandas")
    
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found. Available columns: {df.columns.tolist()}")
    

    texts = df[text_column].dropna().astype(str)
    texts = texts[texts.str.strip() != '']  
    text = '\n'.join(texts)
    
    return text
    
text = load_wikitext2_from_parquet()
print(f"\nLoaded {len(text)} characters")
print(f"First 500 chars:\n{text[:500]}")

Loaded parquet file with 36718 rows using pandas

Loaded 10916756 characters
First 500 chars:
 = Valkyria Chronicles III = 

 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs p


In [2]:
len(text)

10916756

In [2]:
SUBSET_SIZE = 0.1
subtext = text[:int(len(text) * SUBSET_SIZE)]

In [3]:
rule = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""")
pretokens = re.findall(rule, subtext)

pretokens[:11]

[' =',
 ' Valkyria',
 ' Chronicles',
 ' III',
 ' =',
 ' \n\n',
 ' Senjō',
 ' no',
 ' Valkyria',
 ' ',
 '3']

In [4]:
tst = "hello't this is a 1234554 test'll of a tokenizer made by ME'VE"
re.findall(rule, tst)

['hello',
 "'t",
 ' this',
 ' is',
 ' a',
 ' ',
 '123',
 '455',
 '4',
 ' test',
 "'ll",
 ' of',
 ' a',
 ' tokenizer',
 ' made',
 ' by',
 ' ME',
 "'VE"]

In [5]:
byteStream = [[ord(char) for char in token] for token in pretokens]
byteStream[:11]

[[32, 61],
 [32, 86, 97, 108, 107, 121, 114, 105, 97],
 [32, 67, 104, 114, 111, 110, 105, 99, 108, 101, 115],
 [32, 73, 73, 73],
 [32, 61],
 [32, 10, 10],
 [32, 83, 101, 110, 106, 333],
 [32, 110, 111],
 [32, 86, 97, 108, 107, 121, 114, 105, 97],
 [32],
 [51]]

In [6]:
def getStats(bts, stats):
    for pair in zip(bts, bts[1:]):
        stats[pair] = stats.get(pair, 0) + 1
    return stats

def merge(byteStream, pair, idx):
    i = 0
    newStream = []
    while i < len(byteStream):
        if i < (len(byteStream)-1) and byteStream[i] == pair[0] and byteStream[i+1] == pair[1]:
            newStream.append(idx)
            i += 2
        else:
            newStream.append(byteStream[i])
            i += 1
    return newStream



In [7]:
ids = [list(ch.encode('utf-8')) for ch in pretokens]
ids[:11]

[[32, 61],
 [32, 86, 97, 108, 107, 121, 114, 105, 97],
 [32, 67, 104, 114, 111, 110, 105, 99, 108, 101, 115],
 [32, 73, 73, 73],
 [32, 61],
 [32, 10, 10],
 [32, 83, 101, 110, 106, 197, 141],
 [32, 110, 111],
 [32, 86, 97, 108, 107, 121, 114, 105, 97],
 [32],
 [51]]

In [9]:
def bpe(pretokens, vocab_size=3000):
    num_merges = vocab_size - 256
    vocab = {idx: bytes([idx]) for idx in range(256)}
    merges = {}
    for i in tqdm(range(num_merges)):
        stats = {}
        for chunk in pretokens:
            getStats(chunk, stats)
        maxPair = max(stats, key=stats.get)
        idx = 256 + i
        merges[maxPair] = idx
        pretokens = [merge(chunk, maxPair, idx) for chunk in pretokens]
        vocab[idx] = vocab[maxPair[0]] + vocab[maxPair[1]]
            
    return vocab, merges

vocab, merges = bpe(ids)


100%|██████████| 2744/2744 [13:30<00:00,  3.39it/s]


In [10]:
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [11]:
test_str = "The goal of this test that I've mad is to see if the tokenizer works"
bytes = [bytes([ord(ch)]) for ch in test_str]

In [12]:
bytes

[b'T',
 b'h',
 b'e',
 b' ',
 b'g',
 b'o',
 b'a',
 b'l',
 b' ',
 b'o',
 b'f',
 b' ',
 b't',
 b'h',
 b'i',
 b's',
 b' ',
 b't',
 b'e',
 b's',
 b't',
 b' ',
 b't',
 b'h',
 b'a',
 b't',
 b' ',
 b'I',
 b"'",
 b'v',
 b'e',
 b' ',
 b'm',
 b'a',
 b'd',
 b' ',
 b'i',
 b's',
 b' ',
 b't',
 b'o',
 b' ',
 b's',
 b'e',
 b'e',
 b' ',
 b'i',
 b'f',
 b' ',
 b't',
 b'h',
 b'e',
 b' ',
 b't',
 b'o',
 b'k',
 b'e',
 b'n',
 b'i',
 b'z',
 b'e',
 b'r',
 b' ',
 b'w',
 b'o',
 b'r',
 b'k',
 b's']

In [13]:
def encode(text, vocab, merges):
    rule = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""")
    pretokens = re.findall(rule, text)
    merge_order = sorted(merges.items(), key=lambda x: x[1])
    
    token_ids = []
    
    for pretoken in pretokens:
        tokens = list(pretoken.encode('utf-8'))
        
        for (b1, b2), merge_idx in merge_order:
            new_tokens = []
            i =0
            while i < len(tokens):
                if i < len(tokens)-1 and tokens[i] == b1 and tokens[i+1] == b2:
                    new_tokens.append(merge_idx)
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens
            
        token_ids.extend(tokens)
    return token_ids

test = encode(test_str, vocab, merges)

In [14]:
result = [vocab[token] for token in test]
result

[b'T',
 b'he',
 b' goal',
 b' of',
 b' this',
 b' test',
 b' that',
 b' I',
 b"'",
 b've',
 b' m',
 b'ad',
 b' is',
 b' to',
 b' see',
 b' if',
 b' the',
 b' to',
 b'ken',
 b'iz',
 b'er',
 b' works']

In [18]:
result[0].decode('utf-8')

'T'

In [None]:
vals = [res.decode('utf-8') for res in result]
val_str = ''.join(vals)


In [20]:
val_str

"The goal of this test that I've mad is to see if the tokenizer works"