# Using AraBERT for Word and Sentence Embeddings

In [1]:
!git clone https://github.com/aub-mind/arabert.git

Cloning into 'arabert'...


In [1]:
#import our needed modules and libraries
import farasa
import pyarabic
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
model_name="aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=model_name) # define the preprocessing model
model = AutoModel.from_pretrained(model_name) # define the model 
tokenizer = AutoTokenizer.from_pretrained(model_name) # define the tokenizer that change the text to tensors

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# preprocess sample text using the preprocessing model we defined
text =  " ""كان الملك رجلا و الملكة مرأة"
text_preprocessed = arabert_prep.preprocess(text)
print(text)
print("---------------------")
print(text_preprocessed)

 كان الملك رجلا و الملكة مرأة
---------------------
كان ال+ ملك رجل +ا و ال+ ملك +ة مرأ +ة


In [4]:
#Converting the text to tensors suitable for model input
inputs = tokenizer.encode_plus(text_preprocessed, return_tensors='pt')
inputs.keys()
#inputs is a dictionary containing inputs_ids, attention_masks and token_type_ids as pytorch tensors

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [5]:
print(inputs['input_ids'][0]) # we have only 1 sentence consists of 11 segments and start token [CLS] and end token [SEP]
print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))

tensor([  33,  369,   20,  880, 1486,    0,  166,   20,  880,   12, 1637,   12,
          34])
['[CLS]', 'كان', 'ال+', 'ملك', 'رجل', '+ا', 'و', 'ال+', 'ملك', '+ة', 'مرأ', '+ة', '[SEP]']


In [6]:
outputs = model(**inputs) # pass the preprocessed text tensors to the model
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [7]:
embeddings = outputs['last_hidden_state'] # contains the embeddings for each individual word
embeddings.shape # batch_size (number of sentences) x seq_len (sentence length) x emb_dim

torch.Size([1, 13, 768])

In [8]:
embeddings

tensor([[[ 0.2024, -0.3766, -0.3106,  ..., -0.1226, -0.2897,  0.0998],
         [ 0.4053,  0.1943,  0.0517,  ...,  0.3365,  0.1256, -0.5971],
         [-0.1299, -0.0100,  0.1046,  ..., -0.0133, -0.7689, -0.1409],
         ...,
         [-0.1666,  0.0128,  0.2723,  ...,  0.3316, -0.6353, -0.8559],
         [-0.1299, -0.0102,  0.1046,  ..., -0.0136, -0.7685, -0.1406],
         [ 0.2700, -0.0583, -0.4694,  ...,  0.1925, -0.0790, -0.0127]]],
       grad_fn=<NativeLayerNormBackward0>)

In [9]:
embeddings_text_only = outputs['last_hidden_state'][0][1:-1] # without [CLS] and [SEP]
embeddings_text_only.shape # (seq_len - 2) x emb_dim

torch.Size([11, 768])

In [10]:
pooled_vector = outputs['pooler_output'] # has the embeddings of the whole sentences
pooled_vector.shape # batch_size x emb_dim

torch.Size([1, 768])