In [1]:
%%capture
pip install transformers

In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten,Input,GlobalMaxPool1D,Dropout
from tensorflow import keras
from keras.utils import to_categorical
import numpy as np
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
from keras.losses import SparseCategoricalCrossentropy
import joblib

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
df_final = pd.read_pickle('/content/drive/MyDrive/dialects_data_preprocessed.pkl')

In [7]:
df_final.head()

Unnamed: 0,id,text,dialect
0,1009754958479151232,قليلين ادب ومنافقين اختهم او قريبتهم تتعاكس تق...,LY
1,1009794751548313600,الليبيين متقلبين بالنسبه ليا انا ميليشياوي زما...,LY
2,1019989115490787200,تانيه شاب ليبي بيرتاح لبنت مختلفه ويلاحظ انها ...,LY
3,1035479791758135168,رانيا عقليتك متخلفه اولا الانسان يلي يحتاج اهل...,LY
4,1035481122921164800,شكلك متعقده علشان الراجل تحبيه ازوج بنت يتيمه ...,LY


In [8]:
max_length = df_final['text'].apply(lambda x: len(x.split())).max()


In [9]:
max_length

61

# Mbert Model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df_final["dialect"])

In [11]:
df_final['dialect'] = y

In [None]:
from transformers import AutoTokenizer, TFBertModel


model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
initial_model = TFBertModel.from_pretrained(model_name)

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [12]:
data_train, data_test = train_test_split(df_final, test_size = 0.2, random_state = 42,stratify = df_final.dialect)

In [13]:
data_vali, data_test = train_test_split(data_test, test_size = 0.5, random_state = 42,stratify = data_test.dialect)

In [None]:
x_train = tokenizer(
        list(data_train["text"].values),
        max_length=max_length,
        stride=16,
        add_special_tokens=True,
        padding="max_length",
        truncation = True,
        return_tensors = 'tf',
        return_token_type_ids = False,
        return_attention_mask = True,
    )

x_vali = tokenizer(
        list(data_vali["text"].values),
        max_length=max_length,
        stride=16,
        add_special_tokens=True,
        padding="max_length",
        truncation = True,
        return_tensors = 'tf',
        return_token_type_ids = False,
        return_attention_mask = True,
    )

x_test = tokenizer(
        list(data_test["text"].values),
        max_length=max_length,
        stride=16,
        add_special_tokens=True,
        padding="max_length",
        truncation = True,
        return_tensors = 'tf',
        return_token_type_ids = False,
        return_attention_mask = True,
    )


In [None]:
input_ids = Input(shape=(max_length,), dtype=tf.int32,name='input_ids')
attention_mask = Input(shape=(max_length,), dtype=tf.int32,name='attention_mask')
embedding = initial_model(
    input_ids, attention_mask=attention_mask)[0]
max_layer = GlobalMaxPool1D()(embedding)
layer_1 = Dense(256, activation="relu",kernel_initializer = 'he_normal')(max_layer)
dropped_1 = Dropout(0.1)(layer_1)
layer_2 = Dense(128, activation="relu",kernel_initializer = 'he_normal')(dropped_1)
layer_3 = Dense(64, activation="relu",kernel_initializer = 'he_normal')(layer_2)
dropped_2 = Dropout(0.1)(layer_3)
layer_4 = Dense(32, activation="relu",kernel_initializer = 'he_normal')(dropped_2)
final_layer = Dense(5, activation="softmax",kernel_initializer = 'he_normal')(layer_4)


model = keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=[final_layer],
    )


In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 61)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 61)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  177853440   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 61,                                            

In [None]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(
    learning_rate=1e-5,
    weight_decay_rate=0.01,
    epsilon=1e-8

)

In [None]:
model.compile(optimizer=optimizer,
              loss=SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

In [None]:
history = model.fit(
    x = {'input_ids':x_train['input_ids'],'attention_mask':x_train['attention_mask']},
    y = data_train.dialect,
    validation_data =({'input_ids':x_vali['input_ids'],'attention_mask':x_vali['attention_mask']},data_vali.dialect),
    epochs =3,
    batch_size = 64
)

Epoch 1/3




Epoch 2/3
Epoch 3/3


In [None]:
predicted = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})



In [None]:
y_pred = np.argmax(predicted,axis = 1)

In [None]:
print(classification_report(data_test.dialect,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      5764
           1       0.84      0.82      0.83      2762
           2       0.83      0.75      0.79      3650
           3       0.68      0.69      0.69      1154
           4       0.67      0.63      0.65      1443

    accuracy                           0.80     14773
   macro avg       0.77      0.76      0.76     14773
weighted avg       0.80      0.80      0.80     14773



# Arabert

In [14]:
from transformers import AutoTokenizer, TFAutoModel
model_name = "aubmindlab/bert-base-arabertv2"
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)
arabert_model = TFAutoModel.from_pretrained(model_name)


Downloading (…)okenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/742M [00:00<?, ?B/s]

Some layers from the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at aubmindlab/bert-base-arabertv2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
x_train = arabert_tokenizer(
        list(data_train["text"].values),
        max_length=max_length,
        stride=16,
        add_special_tokens=True,
        padding="max_length",
        truncation = True,
        return_tensors = 'tf',
        return_token_type_ids = False,
        return_attention_mask = True,
    )

x_vali = arabert_tokenizer(
        list(data_vali["text"].values),
        max_length=max_length,
        stride=16,
        add_special_tokens=True,
        padding="max_length",
        truncation = True,
        return_tensors = 'tf',
        return_token_type_ids = False,
        return_attention_mask = True,
    )

x_test = arabert_tokenizer(
        list(data_test["text"].values),
        max_length=max_length,
        stride=16,
        add_special_tokens=True,
        padding="max_length",
        truncation = True,
        return_tensors = 'tf',
        return_token_type_ids = False,
        return_attention_mask = True,
    )


In [16]:
input_ids = Input(shape=(max_length,), dtype=tf.int32,name='input_ids')
attention_mask = Input(shape=(max_length,), dtype=tf.int32,name='attention_mask')
embedding = arabert_model(
    input_ids, attention_mask=attention_mask)[0]
max_layer = GlobalMaxPool1D()(embedding)
layer_1 = Dense(256, activation="relu",kernel_initializer = 'he_normal')(max_layer)
dropped_1 = Dropout(0.1)(layer_1)
layer_2 = Dense(128, activation="relu",kernel_initializer = 'he_normal')(dropped_1)
layer_3 = Dense(64, activation="relu",kernel_initializer = 'he_normal')(layer_2)
dropped_2 = Dropout(0.1)(layer_3)
layer_4 = Dense(32, activation="relu",kernel_initializer = 'he_normal')(dropped_2)
final_layer = Dense(5, activation="softmax",kernel_initializer = 'he_normal')(layer_4)


model = keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=[final_layer],
    )

In [17]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(
    learning_rate=1e-5,
    weight_decay_rate=0.01,
    epsilon=1e-8

)

In [18]:
model.compile(optimizer=optimizer,
              loss=SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

In [19]:
history_2 = model.fit(
    x = {'input_ids':x_train['input_ids'],'attention_mask':x_train['attention_mask']},
    y = data_train.dialect,
    validation_data =({'input_ids':x_vali['input_ids'],'attention_mask':x_vali['attention_mask']},data_vali.dialect),
    epochs =2,
    batch_size = 64
)

Epoch 1/2




Epoch 2/2


In [20]:
predicted = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})



In [21]:
y_pred = np.argmax(predicted,axis = 1)

In [22]:
print(classification_report(data_test.dialect,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      5764
           1       0.87      0.79      0.83      2762
           2       0.73      0.78      0.75      3650
           3       0.82      0.62      0.70      1154
           4       0.78      0.56      0.65      1443

    accuracy                           0.80     14773
   macro avg       0.80      0.73      0.76     14773
weighted avg       0.80      0.80      0.79     14773

