In [2]:
import os
os.chdir('/content/drive/MyDrive/keep/git/TinyBERT')

In [5]:
# !pip install wikiextractor --target=$nb_path


In [2]:
import sys, os

nb_path = '/content/TinyBERT'
# os.symlink(lib_path, nb_path)
sys.path.insert(0,nb_path)


In [4]:
import json
import collections
import logging
import os
import shelve
from argparse import ArgumentParser
from pathlib import Path
from tqdm import tqdm, trange
from tempfile import TemporaryDirectory
from multiprocessing import Pool

import numpy as np
from random import random, randrange, randint, shuffle, choice

from transformer.tokenization import BertTokenizer
from nltk.tokenize import sent_tokenize

In [9]:
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

In [10]:

class DocumentDatabase:
    def __init__(self, reduce_memory=False):
        if reduce_memory:
            self.temp_dir = TemporaryDirectory()
            self.working_dir = Path(self.temp_dir.name)
            self.document_shelf_filepath = self.working_dir / 'shelf.db'
            self.document_shelf = shelve.open('shelf.db',
                                              flag='c', protocol=-1)
            self.documents = None
        else:
            self.documents = []
            self.document_shelf = None
            self.document_shelf_filepath = None
            self.temp_dir = None
        self.doc_lengths = []
        self.doc_cumsum = None
        self.cumsum_max = None
        self.reduce_memory = reduce_memory

    def add_document(self, document):
        if not document:
            return
        if self.reduce_memory:
            current_idx = len(self.doc_lengths)
            self.document_shelf[str(current_idx)] = document
        else:
            self.documents.append(document)
        self.doc_lengths.append(len(document))

    def _precalculate_doc_weights(self):
        self.doc_cumsum = np.cumsum(self.doc_lengths)
        self.cumsum_max = self.doc_cumsum[-1]

    def sample_doc(self, current_idx, sentence_weighted=True):
        # Uses the current iteration counter to ensure we don't sample the same doc twice
        if sentence_weighted:
            # With sentence weighting, we sample docs proportionally to their sentence length
            if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths):
                self._precalculate_doc_weights()
            rand_start = self.doc_cumsum[current_idx]
            rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
            sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
            sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
        else:
            # If we don't use sentence weighting, then every doc has an equal chance to be chosen
            sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
        assert sampled_doc_index != current_idx
        if self.reduce_memory:
            return self.document_shelf[str(sampled_doc_index)]
        else:
            return self.documents[sampled_doc_index]

    def __len__(self):
        return len(self.doc_lengths)

    def __getitem__(self, item):
        if self.reduce_memory:
            return self.document_shelf[str(item)]
        else:
            return self.documents[item]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, traceback):
        if self.document_shelf is not None:
            self.document_shelf.close()
        if self.temp_dir is not None:
            self.temp_dir.cleanup()


def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
    """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()

In [11]:
def create_instances_from_document(
        doc_database, doc_idx, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list, bi_text=True):
    """This code is mostly a duplicate of the equivalent function from Google BERT's repo.
    However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
    Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
    (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task."""
    document = doc_database[doc_idx]
    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens
    if random() < short_seq_prob:
        target_seq_length = randint(2, max_num_tokens)

    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = randrange(1, len(current_chunk))

                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []

                # Random next
                if bi_text and (len(current_chunk) == 1 or random() < 0.5) :
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # Sample a random document, with longer docs being sampled more frequently
                    random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)

                    random_start = randrange(0, len(random_document))
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual next
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])

                if not tokens_a or len(tokens_a) == 0:
                    tokens_a = ["."]

                if not tokens_b or len(tokens_b) == 0:
                    tokens_b = ["."]

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

                tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
                # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP]
                # They are 1 for the B tokens and the final [SEP]
                segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]

                tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
                    tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)

                instance = {
                    "tokens": tokens,
                    "segment_ids": segment_ids,
                    "is_random_next": is_random_next,
                    "masked_lm_positions": masked_lm_positions,
                    "masked_lm_labels": masked_lm_labels}

                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

    return instances

In [12]:
def create_training_file(docs, vocab_list, args, epoch_num, bi_text=True):
    epoch_filename = args.output_dir / "epoch_{}.json".format(epoch_num)
    num_instances = 0
    with epoch_filename.open('w') as epoch_file:
        for doc_idx in trange(len(docs), desc="Document"):
            doc_instances = create_instances_from_document(
                docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
                masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
                whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list, bi_text=bi_text)
            doc_instances = [json.dumps(instance) for instance in doc_instances]
            for instance in doc_instances:
                epoch_file.write(instance + '\n')
                num_instances += 1
    metrics_filename = args.output_dir / "epoch_{}_metrics.json".format(epoch_num)
    with metrics_filename.open('w') as metrics_file:
        metrics = {
            "num_training_examples": num_instances,
            "max_seq_len": args.max_seq_len
        }
        metrics_file.write(json.dumps(metrics))

    return epoch_filename, metrics_filename

In [8]:
bert_model = "models/bert-base-uncased"
do_lower_case = True
reduce_memory = True
train_corpus = Path("/content/drive/MyDrive/keep/datasets/text/AA/wiki_00")
output_dir = Path("test")

In [39]:
from datasets import load_dataset
wiki_dataset = load_dataset("wikipedia", "20220301.en", cache_dir='/content/drive/MyDrive/keep/datasets/huggingface/wiki_en' )

Reusing dataset wikipedia (/content/drive/MyDrive/keep/datasets/huggingface/wiki_en/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [105]:
# import nltk.data
import nltk
nltk.download('punkt')
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# extra_abbreviations = ['etc']
# sent_tokenizer._params.abbrev_types.update(extra_abbreviations)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [132]:
# from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
# punkt_param = PunktParameters()
# abbreviation = ['etc']
# punkt_param.abbrev_types = set(abbreviation)
# sent_tokenizer = PunktSentenceTokenizer(punkt_param)
# sent_tokenizer.tokenize('I am good at football, (baseball, etc.), but tudent can have.')

['I am good at football, (baseball, etc.), but tudent can have.']

In [20]:
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
vocab_list = list(tokenizer.vocab.keys())
doc_num = 0
with DocumentDatabase(reduce_memory=reduce_memory) as docs:
    with train_corpus.open() as f:
        doc = []
        for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
            line = line.strip()
            if line == "":
                docs.add_document(doc)
                doc = []
                doc_num += 1
                if doc_num % 100 == 0:
                    logger.info('loaded {} docs!'.format(doc_num))
            else:
                tokens = tokenizer.tokenize(line)
                doc.append(tokens)
        if doc:
            docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
    if len(docs) <= 1:
        exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                "sections or paragraphs.")

    output_dir.mkdir(exist_ok=True)

In [23]:
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
vocab_list = list(tokenizer.vocab.keys())
doc_num = 0
with DocumentDatabase(reduce_memory=reduce_memory) as docs:
    for sample in tqdm(wiki_dataset['train']):
        doc_raw = sample['text']
        doc = []
        doc_raw = doc_raw.splitlines()
        for paragraph in filter(None,doc_raw):
            lines = sent_tokenize(paragraph)
            for line in lines:
                # tokens = tokenizer.tokenize(line)
                doc.append(line)
        # docs.add_document(doc)
        # doc_num += 1
        # if doc_num % 100 == 0:
        #     logger.info('loaded {} docs!'.format(doc_num))
    

  0%|          | 0/6458670 [00:13<?, ?it/s]


TypeError: expected string or bytes-like object

In [5]:
with open('test.txt', 'w') as f:
    for sample in tqdm(wiki_dataset['train']):
        doc_raw = sample['text']
        doc_raw = doc_raw.splitlines()
        # iter_doc = filter(None,doc_raw)
        # next(iter_doc)
        for paragraph in filter(None,doc_raw):
            lines = sent_tokenize(paragraph)
            # break
            f.write('\n'.join(lines))
            # for line in lines:
            #     f.write(line)
        # break


NameError: name 'wiki_dataset' is not defined

In [110]:
sample['text'].decode('utf-8')

AttributeError: 'str' object has no attribute 'decode'

In [103]:
sent_tokenizer.tokenize(sample['text'])

['Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy.',
 'Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful.',
 'As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.',
 'Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires.',
 'With the rise of organised hierarchical bodies, scepticism toward authority also rose.',
 'Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment.',
 "During the latter half of the 19th and the first decades of the 20th century, the anarchis

In [79]:
for i in filter(sent_tokenizer.tokenize,list(filter(None,doc_raw))):
    print(i)

TypeError: tokenize() missing 1 required positional argument: 'text'

In [96]:
with open('test.txt', 'r', encoding='utf-8') as f:
    a=f.readlines()

In [97]:
a

['Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy.\n',
 'Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful.\n',
 'As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires.\n',
 'With the rise of organised hierarchical bodies, scepticism toward authority also rose.\n',
 'Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment.\n',
 "During the latter half of the 19th and the first decades of the 20th century, the ana

In [84]:
import collections

In [86]:
collections.deque(iter_doc)

deque([])

In [47]:
split = wiki_dataset['train'][6]['text'].splitlines()

In [48]:
for paragraph in filter(None,split):
    lines_para = tokenizer.tokenize(paragraph)
    for line in lines_para:
        print(line)
    break

    


    

Abraham Lincoln (; February 12, 1809 – April 15, 1865) was an American lawyer and statesman who served as the 16th president of the United States from 1861 until his assassination in 1865.
Lincoln led the nation through the American Civil War and succeeded in preserving the Union, abolishing slavery, bolstering the federal government, and modernizing the U.S. economy.


In [None]:
[x for x in split if x]

In [9]:
from bs4 import BeautifulSoup
with open(train_corpus, 'r') as f:
    contents = f.read()

    soup = BeautifulSoup(contents, 'lxml')

In [None]:
for child in soup.recursiveChildGenerator():
    print(child.text)

In [None]:
soup.find_all('doc')[11].text

In [34]:
with train_corpus.open() as f:
    a=f.readlines()

In [37]:
a[10]

'Anarchism employs a diversity of tactics in order to meet its ideal ends which can be broadly separated into revolutionary and evolutionary tactics; there is significant overlap between the two, which are merely descriptive. Revolutionary tactics aim to bring down authority and state, having taken a violent turn in the past, while evolutionary tactics aim to prefigure what an anarchist society would be like. Anarchist thought, criticism, and praxis have played a part in diverse areas of human society. Criticism of anarchism include claims that it is internally inconsistent, violent, or utopian.\n'

### BookCorpus

In [1]:
from datasets import load_dataset
d = load_dataset('bookcorpusopen', split="train", cache_dir='/content/drive/MyDrive/keep/datasets/huggingface/bookcorpusopen')

ModuleNotFoundError: No module named 'datasets'