In [1]:
import os
import sys
import numpy as np
import json
import nltk
import pandas as pd
import csv
import random
import logging
import tensorflow as tf
from collections import Counter
import pathlib
import pickle
import progressbar

#from tensorflow.python.keras.utils import Progbar

from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

from text_preprocessing import tokenizer_word
from language_model_processing import read_raw_data_preprocess_and_save, create_vocab_df
from bpe import create_token_vocabulary, get_stats, merge_vocab, Encoder

In [2]:
DATASET_NAME = 'master'
INPUT_TYPE = 'txt' #Options: tokens, txt, csv
TO_SPLIT_CLITICS = True #Set to false if clitics already tokenized
DATASET_FILE_MAP = {'all': 'Social_pt.txt'}

if DATASET_FILE_MAP.get('all'):
    file_split = 'all'
else:
    file_split = 'split'

UNK_TOKEN = None #none if isnt one
NUM_MERGES = 30000 #VOCABULARY_SIZE = NUM_MERGES + N_BYTES (~1500)

mini_batch_size = 64

In [3]:
notebook_dir = pathlib.Path.cwd()
repo_dir = notebook_dir
(repo_dir / "models").mkdir(exist_ok = True)
(repo_dir / "models" / "base").mkdir(exist_ok = True)
dataset_dir = repo_dir / "datasets" / "base" / DATASET_NAME
models_dir = repo_dir / "models" / "base"
(models_dir / DATASET_NAME).mkdir(exist_ok = True)
(models_dir / DATASET_NAME / "preprocessed_base_data").mkdir(exist_ok = True)
(models_dir / DATASET_NAME / "processed_base_data").mkdir(exist_ok = True)
(models_dir / DATASET_NAME / "processed_base_data" / "train").mkdir(exist_ok = True)
(models_dir / DATASET_NAME / "processed_base_data" / "validate").mkdir(exist_ok = True)
(models_dir / DATASET_NAME / "pretraining_base_data").mkdir(exist_ok = True)
processed_data_dir = models_dir / DATASET_NAME / "processed_base_data"
pretraining_data_dir = models_dir / DATASET_NAME / "pretraining_base_data"
(models_dir / DATASET_NAME / "language_maps").mkdir(exist_ok = True)
language_maps_dir = repo_dir / "models" / "base" / DATASET_NAME / "language_maps"
(language_maps_dir).mkdir(exist_ok = True)
    
models_dir = models_dir / DATASET_NAME

def save_obj(obj, directory, name):
    with open(directory / "{}.pkl".format(name), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name, directory):
    with open(os.path.join(directory, name + '.pkl'), 'rb') as f:
        return pickle.load(f)

# 1. Clean text and build tokenizer

In [None]:
read_raw_data_preprocess_and_save(dataset_file_map=DATASET_FILE_MAP, 
                                  models_dir=models_dir, 
                                  dataset_dir=dataset_dir,
                                  input_type=INPUT_TYPE,
                                  split_clitics=TO_SPLIT_CLITICS,
                                  remove_numbers=False,
                                  base_folder='preprocessed_base_data')

In [4]:
if file_split == 'all':
    preprocessed_training_data_dir = repo_dir / "models" / "base" / DATASET_NAME / "preprocessed_base_data"
else:
    preprocessed_training_data_dir = repo_dir / "models" / "base" / DATASET_NAME / "preprocessed_base_data" / "train"

corpus = []
MAX = None
if not MAX:
    MAX = len(os.listdir(preprocessed_training_data_dir))

with progressbar.ProgressBar(max_value=MAX) as bar:
    for i, file in enumerate(os.listdir(preprocessed_training_data_dir)):
        with open(os.path.join(preprocessed_training_data_dir, file), 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            words = list(reader)[0]
            corpus += words
        if i == MAX:
            break
        bar.update(i)
    
len(corpus)

100% (417 of 417) |######################| Elapsed Time: 0:00:08 Time:  0:00:08


41362485

In [5]:
# U+E000..U+F8FF is defined as a private use area so we use for space and unk
unk = '[UNK]'
spc = chr(int("E001", 16))
cls = '[CLS]'
sep = '[SEP]'
mask = '[MASK]'
pad = '[PAD]'

id_to_vocab = create_token_vocabulary()
unk_id = len(id_to_vocab)
spc_id = len(id_to_vocab) + 1
cls_id = len(id_to_vocab) + 2
sep_id = len(id_to_vocab) + 3
mask_id = len(id_to_vocab) + 4
pad_id = len(id_to_vocab) + 5

id_to_vocab[unk_id] = unk
id_to_vocab[spc_id] = spc
id_to_vocab[cls_id] = cls
id_to_vocab[sep_id] = sep
id_to_vocab[mask_id] = mask
id_to_vocab[pad_id] = pad

save_obj(id_to_vocab, language_maps_dir, "byte_decoder")

200
200
Raw size of emoji 3019
Raw size of emoji constituents 7255
Final size of emoji constituents 1324


In [6]:
vocab_to_id = {v: i for i, v in id_to_vocab.items()}
id_to_vocab = {i: v for v, i in vocab_to_id.items()}  # Reverse as the emoji and other characters have some overlap 
_ = vocab_to_id.pop(unk)

print("BPE vocab size:", len(vocab_to_id))

print('letter to id')
corpus = [[vocab_to_id.get(l, unk_id) if l is not " " else spc_id for l in word] for word in tokenizer_word(corpus,
                                                                                                            keep_phrases=False,
                                                                                                            tokenize_punc=True,
                                                                                                            split_clitics=True,
                                                                                                            keep_preceeding_space=True)]
print('id to letter')
corpus = [" ".join([id_to_vocab[l] for l in word]) for word in corpus]

count_dict = dict(Counter(corpus).most_common())

print("Total word vocab size", len(count_dict))
bpe_merges = []
vocab_to_id_current_max_id = sorted(list(vocab_to_id.values()))[-1]
with progressbar.ProgressBar(max_value=NUM_MERGES) as bar:
    for i in range(NUM_MERGES):
        vocab_to_id_current_max_id += 1
        pairs = get_stats(count_dict)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        bpe_merges.append(best)
        vocab_to_id["".join(best)] = vocab_to_id_current_max_id
        count_dict = merge_vocab(best, count_dict)
        bar.update(i)
id_to_vocab = {i: v for v, i in vocab_to_id.items()}
id_to_vocab[unk_id] = unk

save_obj(bpe_merges, language_maps_dir, "bpe_merges")
save_obj(id_to_vocab, language_maps_dir, "id_to_vocab")
save_obj(vocab_to_id, language_maps_dir, "vocab_to_id")
pd.DataFrame(list(vocab_to_id.keys())).to_csv(language_maps_dir / 'vocab_file.csv', encoding='utf-8', header=False, index=False, quoting=csv.QUOTE_NONE, escapechar='\\')

BPE vocab size: 1503
letter to id
id to letter
Total word vocab size 787349


100% (30000 of 30000) |#################| Elapsed Time: 22:53:53 Time: 22:53:53


In [7]:
testcase = " ".join(words[1000:1020])
bert_tokenizer = tokenization.FullTokenizer(language_maps_dir)
print(testcase)
print(bert_tokenizer.tokenize(testcase))

florete e no sabre . a espada só foi introduzida na edição seguinte , em 1900 . apenas nos jogos
['flor', 'ete', 'e', 'no', 'sabre', '.', 'a', 'espada', 'só', 'foi', 'introduzida', 'na', 'edição', 'seguinte', ',', 'em', '1900', '.', 'apenas', 'nos', 'jogos']


In [8]:
testcase = "Olá isso é mais uma BAGUNCA 😂😂😂"
print(testcase)
print(bert_tokenizer.tokenize(testcase))

Olá isso é mais uma BAGUNCA 😂😂😂
['olá', 'isso', 'é', 'mais', 'uma', 'bagun', 'ca', '😂', '😂', '😂']


# Prep data file

In [10]:
if file_split == 'all':
    preprocessed_training_data_dir = repo_dir / "models" / "base" / DATASET_NAME / "preprocessed_base_data"
else:
    preprocessed_training_data_dir = repo_dir / "models" / "base" / DATASET_NAME / "preprocessed_base_data" / "train"

corpus = []
MAX = None
if not MAX:
    MAX = len(os.listdir(preprocessed_training_data_dir))

with progressbar.ProgressBar(max_value=MAX) as bar:
    for i, file in enumerate(os.listdir(preprocessed_training_data_dir)):
        with open(os.path.join(preprocessed_training_data_dir, file), 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            words = list(reader)[0]
            corpus += words
        if i == MAX:
            break
        bar.update(i)
    


100% (417 of 417) |######################| Elapsed Time: 0:00:06 Time:  0:00:06


In [12]:
corpus[:3000]

['status',
 ',',
 'gender',
 ',',
 'and',
 'consumer',
 'nationalism',
 'in',
 'south',
 'korea',
 ',',
 'columbia',
 'university',
 'press',
 ',',
 '_tk_up_',
 'isbn',
 '0',
 '-',
 '231',
 '-',
 '11616',
 '-',
 '0',
 '\n',
 'yusuf',
 ',',
 'shahid',
 ';',
 'evenett',
 ',',
 'simon',
 'j',
 '.',
 ',',
 'wu',
 ',',
 'weiping',
 '.',
 '2001',
 'facets',
 'of',
 'globalization',
 'international',
 'and',
 'local',
 'dimensions',
 'of',
 'development',
 'world',
 'bank',
 'publications',
 ',',
 'pp',
 '.',
 '226',
 '227',
 '_tk_up_',
 'isbn',
 '0',
 '-',
 '8213',
 '-',
 '4742',
 '-',
 'x',
 '\n',
 'no',
 ',',
 'chŏng',
 '-',
 'hyŏn',
 '1993',
 'public',
 'administration',
 'and',
 'the',
 'korean',
 'transformation',
 'concepts',
 ',',
 'policies',
 ',',
 'and',
 'value',
 'conflicts',
 ',',
 'kumarian',
 'press',
 ',',
 '_tk_up_',
 'isbn',
 '1',
 '-',
 '56549',
 '-',
 '022',
 '-',
 '3',
 '\n',
 'dong',
 'pode',
 'se',
 'referir',
 'a',
 '\n',
 'dong',
 'moeda',
 '\n',
 'dong',
 'etnia',
 

In [15]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/usherwood/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import sent_tokenize
corp_str = TreebankWordDetokenizer().detokenize(corpus).replace(' .', '.')
for i in range(10):
    corp_str = corp_str.replace('. '+str(i), '.'+str(i))

corp_list = [x for x in sent_tokenize(corp_str) if x != '.']

In [17]:
corp_list[:31]

['status, gender, and consumer nationalism in south korea, columbia university press, _tk_up_ isbn 0 - 231 - 11616 - 0 \n yusuf, shahid; evenett, simon j., wu, weiping.2001 facets of globalization international and local dimensions of development world bank publications, pp.226 227 _tk_up_ isbn 0 - 8213 - 4742 - x \n no, chŏng - hyŏn 1993 public administration and the korean transformation concepts, policies, and value conflicts, kumarian press, _tk_up_ isbn 1 - 56549 - 022 - 3 \n dong pode se referir a \n dong moeda \n dong etnia \n dong divisão administrativa \n língua dong \n lago dongting, também chamado dong \n rrok kola mirdita klezna , 28 de setembro de 1939 é um arcebispo albanês.',
 'tendo nascido em 28 de setembro de 1939, foi ordenado sacerdote em nova iorque em 2 de julho de 1965, pelo mons.',
 'aleksandar tokić, arcebispo de bar, em montenegro, e foi um padre em uma paróquia albanesa no bronx, em nova iorque.',
 "em 1º de julho de 1986 o cardeal john joseph o 'connor, arce

In [18]:
print(corp_list[30])
print(bert_tokenizer.tokenize(corp_list[30]))

em 2012 foi nomeada presidente da comissão de direito eleitoral do instituto dos advogados do brasil 
 em 2010, o presidente da república luiz inácio lula da silva nomeou ana tereza basílio, ao cargo de juíza substituta do tribunal regional eleitoral do estado do rio de janeiro.
['em', '2012', 'foi', 'nomeada', 'presidente', 'da', 'comissão', 'de', 'direito', 'eleitoral', 'do', 'instituto', 'dos', 'advogados', 'do', 'brasil', 'em', '2010', ',', 'o', 'presidente', 'da', 'república', 'luiz', 'inácio', 'lula', 'da', 'silva', 'nomeou', 'ana', 'tere', 'za', 'basílio', ',', 'ao', 'cargo', 'de', 'juí', 'za', 'substitu', 'ta', 'do', 'tribunal', 'regional', 'eleitoral', 'do', 'estado', 'do', 'rio', 'de', 'janeiro', '.']


In [21]:
TRAIN_VAL_SPLIT = .1
train_size = int(len(corp_list)*TRAIN_VAL_SPLIT)

df_train = pd.DataFrame(corp_list[:train_size])
df_val = pd.DataFrame(corp_list[train_size:])
df_val.to_csv(processed_data_dir / "validate" / "val.csv", index=False, header=None, quoting=csv.QUOTE_MINIMAL, encoding='utf-8')

step = int(train_size/100)
for i in range(100):
    df_train[step*i:step*(i+1)].to_csv(processed_data_dir / "train" / "train{}.csv".format(i), index=False, header=None, quoting=csv.QUOTE_MINIMAL, encoding='utf-8')