# ULM Preprocessing: BPE

This notebook preprocesses a text file for use in the ULM. Note that this notebook is for the full emoji BPE implementation.

In [None]:
from collections import Counter
from itertools import chain
import pickle
import pathlib
import os
import pathlib
import math

import pandas as pd
import numpy as np

import csv
import pathlib
import os
import progressbar

from text_preprocessing import tokenizer_word
from language_model_processing import read_raw_data_preprocess_and_save, create_vocab_df
from bpe import create_token_vocabulary, get_stats, merge_vocab, Encoder

The following inputs are required from the user:
- The name of the dataset as stored in datasets/
- The input type, options include tokens, txt or csv. Note this affects more than just the file type. TODO
- DATASET_FILE_MAP, which dictates whether the data is pre-split into train/val/test. If it is, we keep the datasets as they are; if not, we split the master corpus into the subsets.

In [None]:
# --- Dataset selection ---
DATASET_NAME = "wikitext-2"
INPUT_TYPE = 'tokens'  # one of: tokens, txt, csv
TO_SPLIT_CLITICS = False  # leave False when clitics are already tokenized
DATASET_FILE_MAP = {
    'all': None,
    'train': 'wiki.train.tokens',
    'validate': 'wiki.valid.tokens',
    'test': 'wiki.test.tokens',
}

# 'all' when a single master file is mapped, otherwise 'split'.
file_split = 'all' if DATASET_FILE_MAP.get('all') else 'split'

# --- Vocabulary / sampling parameters ---
UNK_TOKEN = "_unk_"  # None if the dataset has no unknown-word token
SEQUENCE_LENGTH = 20  # passed to create_samples as the window length
NUM_MERGES = 50  # VOCABULARY_SIZE = NUM_MERGES + N_BYTES (~1500)

# --- On-disk chunking ---
mini_batch_size = 64
N_SPLITS = 10  # number of files each saved dataset is split into; purely a resource-efficiency choice

# Preprocessing
Here we split the raw data into a set of small CSVs with the data tokenized into the form that we need. If there is a pre-defined train/test/val split we put the files in the corresponding folders here; if not, at the preprocessing stage everything is dumped into a single "all" folder.

The tokenization occurs in the cell below, via `read_raw_data_preprocess_and_save`.

In [None]:
# Scratch globals kept from the original cell; not read anywhere in the visible notebook.
cleaned_tokens = []
zx = 0

# Every path is built with pathlib so the notebook stays platform agnostic.
notebook_dir = pathlib.Path.cwd()
repo_dir = notebook_dir.parent
dataset_dir = repo_dir / "datasets" / "ULM" / DATASET_NAME
models_dir = repo_dir / "models" / "ULM" / DATASET_NAME

# Create the output tree one level at a time, parents before children.
for _required_dir in (repo_dir / "models",
                      repo_dir / "models" / "ULM",
                      models_dir,
                      models_dir / "preprocessed_ulm_data",
                      models_dir / "language_maps"):
    _required_dir.mkdir(exist_ok=True)

# Read the raw dataset files and write the preprocessed CSVs under models_dir.
read_raw_data_preprocess_and_save(
    dataset_file_map=DATASET_FILE_MAP,
    models_dir=models_dir,
    dataset_dir=dataset_dir,
    input_type=INPUT_TYPE,
    split_clitics=TO_SPLIT_CLITICS,
    remove_numbers=False,
)

# Create Vocab

In [None]:
notebook_dir = pathlib.Path.cwd()
repo_dir = notebook_dir.parent

# Training data sits directly in preprocessed_ulm_data when a single master
# file was mapped, otherwise in its "train" sub-folder.
preprocessed_training_data_dir = repo_dir / "models" / "ULM" / DATASET_NAME / "preprocessed_ulm_data"
if file_split != 'all':
    preprocessed_training_data_dir = preprocessed_training_data_dir / "train"

# os.listdir returns entries in arbitrary order; sort so that the file picked
# below is the same on every run and platform.
training_files = sorted(os.listdir(preprocessed_training_data_dir))
if not training_files:
    # Fail here instead of leaving `words` undefined (or stale from an earlier
    # notebook run) for the cells that follow.
    raise FileNotFoundError(
        "No preprocessed training files found in {}".format(preprocessed_training_data_dir))

# Only the first file, and only its first CSV row, is loaded.
file = training_files[0]
with open(os.path.join(preprocessed_training_data_dir, file), 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    # Pull just the first row rather than materialising the whole file.
    words = next(reader, None)
if words is None:
    raise ValueError("Preprocessed training file {} is empty".format(file))

In [None]:
# Use the tokens read from the training file as the corpus for the BPE cell
# below. This binds the same list object as `words`; it is not a copy.
corpus = words

In [None]:
# U+E000..U+F8FF is a private use area, so two of its code points are used
# as the unknown-token and space markers.
unk = chr(0xE000)
spc = chr(0xE001)

# Folder that receives the pickled vocabulary maps.
language_maps_dir = repo_dir / "models" / "ULM" / DATASET_NAME / "language_maps"
language_maps_dir.mkdir(exist_ok=True)

def save_obj(obj, directory, name):
    """Pickle ``obj`` to ``<directory>/<name>.pkl`` with the highest pickle protocol.

    ``directory`` must support the ``/`` operator (for example a ``pathlib.Path``).
    """
    target_path = directory / "{}.pkl".format(name)
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Base id -> symbol table from the project's bpe module, extended with the two
# marker characters. NOTE(review): using len() as the next free id assumes the
# table's ids are 0..n-1 -- confirm against create_token_vocabulary.
id_to_vocab = create_token_vocabulary()
unk_id = len(id_to_vocab)
spc_id = len(id_to_vocab) + 1

id_to_vocab[unk_id] = unk
id_to_vocab[spc_id] = spc

# The table including both markers is what gets stored as "byte_decoder".
save_obj(id_to_vocab, language_maps_dir, "byte_decoder")

vocab_to_id = {v: i for i, v in id_to_vocab.items()}
id_to_vocab = {i: v for v, i in vocab_to_id.items()}  # Reverse as the emoji and other characters have some overlap
# The unknown marker is looked up through the .get() default below, not through the map.
_ = vocab_to_id.pop(unk)

print("BPE vocab size:", len(vocab_to_id))

corpus = tokenizer_word(corpus,
                        keep_phrases=False,
                        tokenize_punc=True,
                        split_clitics=True,
                        keep_preceeding_space=True)

# Map every character of every word to its id: spaces become spc_id, characters
# missing from the vocabulary become unk_id. The space test uses != because
# `is not` compares object identity, which only matches equal strings through
# interning and triggers a SyntaxWarning on Python 3.8+.
corpus_ids = [[vocab_to_id.get(l, unk_id) if l != " " else spc_id for l in word] for word in corpus]
# Re-express each word as its symbols separated by single spaces.
corpus = [" ".join([id_to_vocab[l] for l in word]) for word in corpus_ids]

count_dict = Counter(corpus)
# Build the frame column-wise so counts stay numeric and words stay plain
# Python strings, instead of passing both through one numpy string array.
vocab_df = pd.DataFrame({'Word': list(count_dict.keys()),
                         'Freq': list(count_dict.values())},
                        columns=['Word', 'Freq'])
vocab_df['Freq'] = vocab_df['Freq'].astype(np.float64)
print("Total word vocab size", len(vocab_df))
# word -> frequency, inserted in descending frequency order.
df_dict = vocab_df.sort_values(by=['Freq'], ascending=False).to_dict("records")
df_dict = {item["Word"]: item["Freq"] for item in df_dict}

bpe_merges = []
vocab_to_id_current_max_id = max(vocab_to_id.values())
with progressbar.ProgressBar(max_value=NUM_MERGES) as bar:
    for i in range(NUM_MERGES):
        # get_stats / merge_vocab come from the project's bpe module; presumably
        # they score adjacent symbol pairs and apply the chosen merge -- TODO confirm.
        pairs = get_stats(df_dict)
        if not pairs:
            # Nothing left to merge (corpus exhausted before NUM_MERGES); stop
            # rather than letting max() fail on an empty collection.
            break
        best = max(pairs, key=pairs.get)
        # Allocate the next id only once a merge is actually recorded.
        vocab_to_id_current_max_id += 1
        bpe_merges.append(best)
        vocab_to_id["".join(best)] = vocab_to_id_current_max_id
        df_dict = merge_vocab(best, df_dict)
        bar.update(i)
id_to_vocab = {i: v for v, i in vocab_to_id.items()}
# vocab_to_id no longer holds the unknown marker, so restore it on the decode side.
id_to_vocab[unk_id] = unk

save_obj(bpe_merges, language_maps_dir, "bpe_merges")
save_obj(id_to_vocab, language_maps_dir, "id_to_vocab")
save_obj(vocab_to_id, language_maps_dir, "vocab_to_id")

In [None]:
# Output locations for the indexed data; processed_ulm_data sits next to language_maps.
language_maps_dir = repo_dir / "models" / "ULM" / DATASET_NAME / "language_maps"
processed_data_dir = language_maps_dir.parent / "processed_ulm_data"
training_data_dir = processed_data_dir / "train"
test_data_dir = processed_data_dir / "test"
validation_data_dir = processed_data_dir / "validate"

# Parent first, then one folder per split.
for _output_dir in (processed_data_dir, training_data_dir, test_data_dir, validation_data_dir):
    _output_dir.mkdir(exist_ok=True)

def load_obj(name, directory):
    """Return the object unpickled from ``<directory>/<name>.pkl``."""
    pickle_path = os.path.join(directory, name + '.pkl')
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
# Reload the pickled maps from disk, in the same order they are unpacked.
byte_decoder, id_to_vocab, vocab_to_id, bpe_merges = (
    load_obj(pickle_name, language_maps_dir)
    for pickle_name in ("byte_decoder", "id_to_vocab", "vocab_to_id", "bpe_merges")
)

VOCABULARY_SIZE = len(id_to_vocab)
print("VOCABULARY_SIZE:", VOCABULARY_SIZE)

# Each split folder gets an 'X' and a 'Y' sub-folder.
for _axis in ('X', 'Y'):
    for _split_dir in (training_data_dir, test_data_dir, validation_data_dir):
        (_split_dir / _axis).mkdir(exist_ok=True)

# Indexify and Split Data

The sections below create the samples (sequences + the next word), shuffle them randomly and, if file_split='all', split them into train, test and validate groupings. These are then saved into files (split across N_SPLITS files) in the relevant directory.

In [None]:
def create_samples(data, sequence_length, stride_length=1):
    """Pair every window of ``data`` with the item that immediately follows it.

    Returns a list of ``(window, next_item)`` tuples, where ``window`` is
    ``data[start:start + sequence_length]`` and ``next_item`` is
    ``data[start + sequence_length]``, for ``start`` stepping by
    ``stride_length`` from 0 up to (not including) ``len(data) - sequence_length``.
    """
    samples = []
    stop = len(data) - sequence_length
    for start in range(0, stop, stride_length):
        target_index = start + sequence_length
        samples.append((data[start:target_index], data[target_index]))
    return samples

def save_split_object(obj, directory, name, splits, mini_batch_size):
    """Save ``obj`` as ``splits`` consecutive, non-overlapping pickled chunks.

    Every chunk length is a multiple of ``mini_batch_size``. Each chunk has a
    budget of ``len(obj) // splits`` items plus whatever the previous chunks
    left unused; items that do not fit into the final chunk are not saved.

    Chunk ``i`` is written with ``save_obj`` as ``<name>_<i>`` inside
    ``directory``; the list of chunk sizes is written as
    ``<name>_batch_sizes`` in ``directory.parent``.

    Raises:
        ValueError: if ``obj`` has fewer items than ``splits``.
    """
    if len(obj) < splits:
        raise ValueError("Too few items for number of splits")

    ideal_total_per_split = len(obj) // splits
    actual_split_sizes = []
    remainder = 0
    lower = 0  # running start offset: chunk sizes vary, so i * split_total is not a valid bound
    for i in range(splits):
        budget = ideal_total_per_split + remainder
        split_total = (budget // mini_batch_size) * mini_batch_size
        # Carry the unused part of the budget forward. The sizes handed out so
        # far never exceed (i + 1) * ideal_total_per_split, so `upper` stays
        # within len(obj).
        remainder = budget - split_total
        upper = lower + split_total
        save_obj(obj[lower:upper], directory, "{}_{}".format(name, i))
        actual_split_sizes.append(split_total)
        lower = upper

    save_obj(actual_split_sizes, directory.parent, name + '_batch_sizes')
    
# Build the encoder from the reloaded maps. The unknown and space markers are
# the characters U+E000 / U+E001, with ids taken as the last two positions of
# byte_decoder.
encoder = Encoder(encoder=vocab_to_id,
                  bpe_merges=bpe_merges,
                  byte_decoder=byte_decoder,
                  unk_token=chr(0xE000),
                  unk_id=len(byte_decoder) - 2,
                  spc_token=chr(0xE001),
                  spc_id=len(byte_decoder) - 1)

In [None]:
# Smoke test: encode a sentence mixing ASCII, emoji and CJK text, print the ids,
# then decode them one at a time and all together.
sample_ids = encoder.encode("Hi, i am a data scientist and I love emoji 😎🤢 我")
print(sample_ids)
for token_id in sample_ids:
    print(encoder.decode([token_id]))
print(encoder.decode(sample_ids))

In [None]:
# Fractions used only when a single master corpus has to be split here;
# the validation set receives whatever is left over.
train_size = 0.7
test_size = 0.2

# Decide which datasets to process:
# - 'all' mode: the master corpus is processed exactly once.
# - 'split' mode: only entries that actually name a file. Entries mapped to
#   None (such as 'all') have no input folder to read and no output folder.
if file_split == 'all':
    dataset_keys = ['all']
else:
    dataset_keys = [key for key, file_name in DATASET_FILE_MAP.items() if file_name]

for dataset_key in dataset_keys:

    if file_split == 'all':
        preprocessed_data_dir = models_dir / "preprocessed_ulm_data"
    else:
        preprocessed_data_dir = models_dir / "preprocessed_ulm_data" / dataset_key

    # Concatenate the first CSV row of every preprocessed file. The listing is
    # sorted because os.listdir order is arbitrary and the concatenation order
    # determines which tokens end up adjacent.
    words = []
    for file in sorted(os.listdir(preprocessed_data_dir)):
        with open(os.path.join(preprocessed_data_dir, file), 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            words += list(reader)[0]
    print('Total number of words:', len(words))

    data = encoder.encode(" ".join(words))

    # Build (sequence, next token) samples and shuffle them in place.
    zipped_samples = create_samples(data, SEQUENCE_LENGTH)
    np.random.shuffle(zipped_samples)

    if file_split == 'all':
        # Cut the shuffled samples into train / test / validate.
        total_samples = len(zipped_samples)
        train_cutoff = math.floor(total_samples*train_size)
        test_cutoff = math.floor(total_samples*test_size)
        X_train, Y_train = zip(*zipped_samples[0:train_cutoff])
        X_test, Y_test = zip(*zipped_samples[train_cutoff:train_cutoff + test_cutoff])
        X_validate, Y_validate = zip(*zipped_samples[train_cutoff + test_cutoff:])

        # Print one random training sample, as ids and as decoded symbols.
        testing_index = np.random.randint(0, len(X_train))
        print(X_train[testing_index],
              Y_train[testing_index],
              [id_to_vocab[tok] for tok in X_train[testing_index]],
              id_to_vocab[Y_train[testing_index]])

        # Save data, split up into chunks
        save_split_object(X_train, training_data_dir / 'X', 'X', N_SPLITS, mini_batch_size)
        save_split_object(Y_train, training_data_dir / 'Y', 'Y', N_SPLITS, mini_batch_size)

        save_split_object(X_test, test_data_dir / 'X', 'X', N_SPLITS, mini_batch_size)
        save_split_object(Y_test, test_data_dir / 'Y', 'Y', N_SPLITS, mini_batch_size)

        save_split_object(X_validate, validation_data_dir / 'X', 'X', N_SPLITS, mini_batch_size)
        save_split_object(Y_validate, validation_data_dir / 'Y', 'Y', N_SPLITS, mini_batch_size)

    else:
        # Data is already split: everything read for this key stays in this key's folder.
        X, Y = zip(*zipped_samples)

        # Print one random sample, as ids and as decoded symbols.
        testing_index = np.random.randint(0, len(X))
        print(X[testing_index],
              Y[testing_index],
              [id_to_vocab[tok] for tok in X[testing_index]],
              id_to_vocab[Y[testing_index]])

        # Save data, split up into chunks
        save_split_object(X, processed_data_dir / dataset_key / 'X', 'X', N_SPLITS, mini_batch_size)
        save_split_object(Y, processed_data_dir / dataset_key / 'Y', 'Y', N_SPLITS, mini_batch_size)