In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/duy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/duy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## load data

In [2]:
def load_data(fname):
    return pd.read_feather(fname)

def cleanhtml(raw_html):
    CLEANR = re.compile('<.*?>')
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext

def break_into_sentences(paragraph):
    sentences = list()
    temp_sentence = list()
    flag = False
    for ch in paragraph.strip():
        if ch in [u'؟', u'!', u'.', u':', u'؛']:
            flag = True
        elif flag:
            sentences.append(''.join(temp_sentence).strip())
            temp_sentence = []
            flag = False

        temp_sentence.append(ch)

    else:
        sentences.append(''.join(temp_sentence).strip())
        return sentences

def remove_ref(sentence):
    result = re.sub("(\[\d\])", "", sentence)
    return result

def clean_arabic(l_arabic):
    l_cleaned_arabic = []
    for p in l_arabic:
        ss = break_into_sentences(remove_ref(cleanhtml(p)))
        for s in ss:
            l_cleaned_arabic.append(s)
    return l_cleaned_arabic

def get_cleaned_sentence_list(l):
    cleaned = clean_arabic(l)
    res = []
    for i in l:
        words = word_tokenize(i)
        sent = ' '.join(words)
        res.append(sent)
    return res

In [3]:
from __future__ import annotations

from collections import Counter, defaultdict
from typing import Iterable, Tuple, Union


class Vocab(object):
    r"""
    Defines a vocabulary object that will be used to numericalize a field.

    Args:
        counter (~collections.Counter):
            :class:`~collections.Counter` object holding the frequencies of each value found in the data.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 1.
        specials (Tuple[str]):
            The list of special tokens (e.g., pad, unk, bos and eos) that will be prepended to the vocabulary. Default: ``[]``.
        unk_index (int):
            The index of unk token. Default: 0.

    Attributes:
        itos:
            A list of token strings indexed by their numerical identifiers.
        stoi:
            A :class:`~collections.defaultdict` object mapping token strings to numerical identifiers.
    """

    def __init__(self, counter: Counter, min_freq: int = 1, specials: Tuple = tuple(), unk_index: int = 0) -> Vocab:
        self.itos = list(specials)
        self.stoi = defaultdict(lambda: unk_index)
        self.stoi.update({token: i for i, token in enumerate(self.itos)})
        self.update([token for token, freq in counter.items() if freq >= min_freq])
        self.unk_index = unk_index
        self.n_init = len(self)

    def __len__(self):
        return len(self.itos)

    def __getitem__(self, key: Union[int, str, Iterable]) -> Union[str, int, Iterable]:
        if isinstance(key, str):
            return self.stoi[key]
        elif not isinstance(key, Iterable):
            return self.itos[key]
        elif len(key) > 0 and isinstance(key[0], str):
            return [self.stoi[i] for i in key]
        else:
            return [self.itos[i] for i in key]

    def __contains__(self, token):
        return token in self.stoi

    def __getstate__(self):
        # avoid picking defaultdict
        attrs = dict(self.__dict__)
        # cast to regular dict
        attrs['stoi'] = dict(self.stoi)
        return attrs

    def __setstate__(self, state):
        stoi = defaultdict(lambda: self.unk_index)
        stoi.update(state['stoi'])
        state['stoi'] = stoi
        self.__dict__.update(state)

    def items(self):
        return self.stoi.items()

    def update(self, vocab: Union[Iterable[str], Vocab, Counter]) -> Vocab:
        if isinstance(vocab, Vocab):
            vocab = vocab.itos
        # NOTE: PAY CAREFUL ATTENTION TO DICT ORDER UNDER DISTRIBUTED TRAINING!
        vocab = sorted(set(vocab).difference(self.stoi))
        self.itos.extend(vocab)
        self.stoi.update({token: i for i, token in enumerate(vocab, len(self.stoi))})
        return self
        

In [4]:
from transformers import AutoTokenizer

class Tokenizer():
    def __init__(self, name):
        self.name = name
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(name, local_files_only=True)
        except Exception:
            self.tokenizer = AutoTokenizer.from_pretrained(name, local_files_only=False)
            
    def __call__(self, text: str) -> List[str]:
        from tokenizers.pre_tokenizers import ByteLevel
        if isinstance(self.tokenizer.backend_tokenizer.pre_tokenizer, ByteLevel):
            text = ' ' + text
        return tuple(i.strip() for i in self.tokenizer.tokenize(text))

    def encode(self, text):
        return self.tokenizer.encode(text, return_tensors='pt')

    @property
    def vocab(self):
        return defaultdict(lambda: self.tokenizer.vocab[self.unk],
                           {**self.tokenizer.get_vocab(), **self.tokenizer.get_added_vocab()})
        
    @property
    def pad(self):
        return self.tokenizer.pad_token

    @property
    def unk(self):
        return self.tokenizer.unk_token

    @property
    def bos(self):
        return self.tokenizer.bos_token or self.tokenizer.cls_token

    @property
    def eos(self):
        return self.tokenizer.eos_token or self.tokenizer.sep_token

    def decode(self, text: List) -> str:
        return self.tokenizer.decode(text, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    def extend(self, data: Iterable[str], length: int = 32000) -> TransformerTokenizer:
        t = self.tokenizer.train_new_from_iterator(data, length)
        self.tokenizer.add_tokens(list(set(t.get_vocab()) - set(self.vocab)))
        return self

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, l_text, tokenizer):
        self.tokenizer = tokenizer
        self.pad = tokenizer.unk
        self.bos = tokenizer.bos
        self.eos = tokenizer.eos
        self.vocab = tokenizer.vocab
        self.texts = l_text

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        return text

    def collate_fn(self, data):
        encoded_text = self.tokenizer.tokenizer.batch_encode_plus(data, add_special_tokens=True, padding='longest', return_tensors='pt',
                                                                 return_attention_mask=True, return_special_tokens_mask=True)
        return encoded_text
    

In [6]:
df = load_data('./arabic_train_set.feather')
df = get_cleaned_sentence_list(list(df['document_plaintext']))
dts = CustomDataset(df, Tokenizer('aubmindlab/bert-base-arabertv2'))

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


In [7]:
from torch.utils.data import DataLoader

dataloader = DataLoader(dts, batch_size=32, shuffle=True, collate_fn=dts.collate_fn)

In [39]:
for i, v in enumerate(dataloader):
    print(v)

Token indices sequence length is longer than the specified maximum sequence length for this model (1282 > 512). Running this sequence through the model will result in indexing errors


{'input_ids': tensor([[   33,   459, 27099,  ...,    31,    31,    31],
        [   33, 46656,  1067,  ...,    31,    31,    31],
        [   33,   369,  1058,  ...,    31,    31,    31],
        ...,
        [   33, 40526,  1078,  ...,    31,    31,    31],
        [   33,  4121,   251,  ...,    31,    31,    31],
        [   33,    53,    47,  ...,    31,    31,    31]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'special_tokens_mask': tensor([[1, 0, 0,  ..., 1, 1, 1],
        [1, 0, 0,  ..., 1, 1, 1],
        [1, 0, 0,  ..., 1, 1, 1],
        ...,
        [1,

In [40]:
len(dts.vocab)

64000

In [8]:
class Model(nn.Module):
    def __init__(self, name, vocab_size, tokenizer):
        super().__init__()
        from transformers import AutoModel
        try:
            self.model = AutoModel.from_pretrained(name, output_hidden_states=True, local_files_only=True)
        except Exception:
            self.model = AutoModel.from_pretrained(name, output_hidden_states=True, local_files_only=False)
        self.model.requires_grad_(False)
        self.max_len = self.model.config.max_position_embeddings
        self.n_out = vocab_size
        self.linear = nn.Linear(self.model.config.hidden_size, self.n_out, True)
    def forward(self, tokens, att_mask):
        x = self.model(tokens, attention_mask=att_mask.float())[-1]
        x = self.linear(x[-1])
        return x

In [9]:
model = Model('aubmindlab/bert-base-arabertv2', len(dts.vocab), dts.tokenizer.tokenizer)

In [10]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
device = torch.device('cuda')
crit = nn.CrossEntropyLoss()

model.train()
model = model.to(device)
for epoch in range(num_epochs):
    for batch in dataloader:
        x, att_mask, tok_mask = batch['input_ids'], batch['attention_mask'], batch['special_tokens_mask']
        # print(x.shape)
        x = x.to(device)
        att_mask = att_mask.to(device)
        tok_mask = tok_mask.to(device)
        max_len = min(model.max_len, x.shape[1])
        x = x[:, :max_len]
        att_mask = att_mask[:, :max_len]
        tok_mask = tok_mask[:, :max_len]
        
        out = model(x, att_mask)
        m = tok_mask.ne(1)[:, :-1]
        out = out[:, :-1, :][m]
        y = x[:, 1:][m]
        loss = crit(out, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


        

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


In [13]:
model.eval()
l = 0.0
with torch.no_grad():
    for batch in dataloader:
        x, att_mask, tok_mask = batch['input_ids'], batch['attention_mask'], batch['special_tokens_mask']
        # print(x.shape)
        x = x.to(device)
        att_mask = att_mask.to(device)
        tok_mask = tok_mask.to(device)
        max_len = min(model.max_len, x.shape[1])
        x = x[:, :max_len]
        att_mask = att_mask[:, :max_len]
        tok_mask = tok_mask[:, :max_len]
        
        out = model(x, att_mask)
        m = tok_mask.ne(1)[:, :-1]
        out = out[:, :-1, :][m]
        y = x[:, 1:][m]
        loss = crit(out, y)
        l += loss.item()
    l /= len(dataloader)
    print(np.exp(loss.item()))

31.410713792955896


In [88]:
from transformers import AutoModel, AutoTokenizer 

d = torch.device('cuda')
# Define the model repo
model_name = "aubmindlab/bert-base-arabertv2" 


# Download pytorch model
model = AutoModel.from_pretrained(model_name).to(d)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Transform input tokens 
inputs = tokenizer("Hello world!", return_tensors="pt").to(d)
print(inputs)
# Model apply
outputs = model(**inputs)

{'input_ids': tensor([[   33, 47794, 52959, 47491,    36,    34]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [96]:
torch.cuda.empty_cache()

In [15]:
torch.save(model.state_dict(), './bert_arabic_LM.pt')

In [16]:
len(dataloader)

925

In [17]:
from datasets import load_dataset
dataset = load_dataset("copenlu/answerable_tydiqa")
valid_set = dataset["validation"].filter(lambda example, idx: example['language'] == 'arabic', with_indices=True)

Filter: 100%|███████████████████████████████████████| 13325/13325 [00:00<00:00, 91127.45 examples/s]


In [18]:
df_val = get_cleaned_sentence_list(list(valid_set['document_plaintext']))
dts_val = CustomDataset(df_val, Tokenizer('aubmindlab/bert-base-arabertv2'))

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


In [25]:
val_dataloader = DataLoader(dts_val, batch_size=32, shuffle=False, collate_fn=dts_val.collate_fn)

In [27]:
model.eval()
l = 0.0
with torch.no_grad():
    for batch in val_dataloader:
        x, att_mask, tok_mask = batch['input_ids'], batch['attention_mask'], batch['special_tokens_mask']
        # print(x.shape)
        x = x.to(device)
        att_mask = att_mask.to(device)
        tok_mask = tok_mask.to(device)
        max_len = min(model.max_len, x.shape[1])
        x = x[:, :max_len]
        att_mask = att_mask[:, :max_len]
        tok_mask = tok_mask[:, :max_len]
        
        out = model(x, att_mask)
        m = tok_mask.ne(1)[:, :-1]
        out = out[:, :-1, :][m]
        y = x[:, 1:][m]
        loss = crit(out, y)
        l += loss.item()
    l /= len(val_dataloader)
    print(np.exp(l))

37.365664280993734


In [28]:
len(val_dataloader)

60