#installing dependencies

> Indented block



In [30]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


This notebook works fine with transformers 4.12, it is not tested on newer versions

In [31]:
!pip install transformers==4.12.2
!pip install farasapy==0.0.14
!pip install pyarabic==0.6.14
!git clone https://github.com/aub-mind/arabert
!pip install emoji==1.6.1
!pip install sentencepiece==0.1.96

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'arabert' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Let's download some Arabic text classification datasets

#Creating training datasets

In [32]:
import pandas as pd
import numpy as np
from typing import List
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

This custom dataset class will help us hold our datasets in a structred manner.
It's not necessary to use it with your own data

In [33]:
class CustomDataset:
    def __init__(
        self,
        name: str,
        train: List[pd.DataFrame],
        test: List[pd.DataFrame],
        label_list: List[str],
    ):
        """Class to hold and structure datasets.

        Args:

        name (str): holds the name of the dataset so we can select it later
        train (List[pd.DataFrame]): holds training pandas dataframe with 2 columns ["text","label"]
        test (List[pd.DataFrame]): holds testing pandas dataframe with 2 columns ["text","label"]
        label_list (List[str]): holds the list  of labels
        """
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [34]:
DATA_COLUMN = "text"
LABEL_COLUMN = "label"
CATEGORY_COLUMN = "category"
STANCE_COLUMN = 'stance'
ID_COLUMN = 'id'

In [35]:
import pandas as pd
import numpy as np
import nltk


df_train = pd.read_csv("/content/train.csv")
df_dev = pd.read_csv("/content/dev.csv")
df_test = pd.read_csv("/content/test.csv")

df_train.columns = [DATA_COLUMN, CATEGORY_COLUMN , STANCE_COLUMN  ]
df_dev.columns   = [DATA_COLUMN, CATEGORY_COLUMN , STANCE_COLUMN  ]
df_test.columns  = [ID_COLUMN  , DATA_COLUMN ]

df_train[STANCE_COLUMN] = df_train[STANCE_COLUMN] + 1
df_dev[STANCE_COLUMN]   = df_dev[STANCE_COLUMN]  + 1


stance_map = {
     2: 'POSITIVE',
     1: 'NEUTRAL',
     0: 'NEGATIVE'
}


df_train[STANCE_COLUMN] = df_train[STANCE_COLUMN].apply(lambda x: stance_map[x])
df_dev[STANCE_COLUMN] = df_dev[STANCE_COLUMN].apply(lambda x: stance_map[x])

stance_list   = ['NEGATIVE', 'NEUTRAL' ,'POSITIVE' ]
category_list = ['advice', 'celebrity' ,'info_news', 'others' , 'personal' , 'plan' , 'requests', 'restrictions' , 'rumors' , 'unrelated' ]


df_train_stance = pd.DataFrame({
        "text":df_train[DATA_COLUMN],
        "stance": df_train[STANCE_COLUMN]
})

df_dev_stance = pd.DataFrame({
        "text":df_dev[DATA_COLUMN],
        "stance": df_dev[STANCE_COLUMN]
})

df_train_category = pd.DataFrame({
        "text":df_train[DATA_COLUMN],
        "category": df_train[CATEGORY_COLUMN]
})

df_dev_category = pd.DataFrame({
        "text":df_dev[DATA_COLUMN],
        "category": df_dev[CATEGORY_COLUMN]
})

dataset1 = CustomDataset("stance", df_train_stance, df_dev_stance, stance_list)
dataset2 = CustomDataset("category", df_train_category, df_dev_category, category_list)


print(df_train[STANCE_COLUMN].value_counts())
print(df_train[CATEGORY_COLUMN].value_counts())

POSITIVE    5538
NEUTRAL     1012
NEGATIVE     438
Name: stance, dtype: int64
info_news       3616
personal        1025
celebrity        975
plan             606
unrelated        323
others           167
requests         112
rumors            79
advice            67
restrictions      18
Name: category, dtype: int64


##ASTD- Unbalanced

#Trainer

Start the training procedure

In [36]:
import numpy as np
import torch
import random
import matplotlib.pyplot as plt
import copy

from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from torch.utils.data import DataLoader, Dataset
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, BertTokenizer, Trainer,
                          TrainingArguments)
from transformers.data.processors.utils import InputFeatures

List all the datasets we have

In [37]:
dataset_name = 'stance'
model_name = 'aubmindlab/bert-large-arabertv02-twitter' # we are going to use the twitter AraBERT since it has emojis and dialects

Create and apply preprocessing using the AraBERT processor

In [38]:
selected_dataset = copy.deepcopy(dataset1)
arabic_prep = ArabertPreprocessor(model_name)

selected_dataset.train[DATA_COLUMN] = selected_dataset.train[DATA_COLUMN].apply(lambda x: arabic_prep.preprocess(x))
selected_dataset.test[DATA_COLUMN] = selected_dataset.test[DATA_COLUMN].apply(lambda x: arabic_prep.preprocess(x))  

Now we need to check the tokenized sentence length to decide on the maximum sentence length value

In [39]:
tok = AutoTokenizer.from_pretrained(model_name)
max_len = 101

https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpt6dz6ukw


Downloading:   0%|          | 0.00/456 [00:00<?, ?B/s]

storing https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/5358ed8d7c6c053f551644f7ca3e35e10b4dd73ec064fd79bfc58c809222f89b.23e65ad7ac44f654668403bc1e8a64c434089184f9aa116b7870e318d9ceed7d
creating metadata file for /root/.cache/huggingface/transformers/5358ed8d7c6c053f551644f7ca3e35e10b4dd73ec064fd79bfc58c809222f89b.23e65ad7ac44f654668403bc1e8a64c434089184f9aa116b7870e318d9ceed7d
https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpvryq5abb


Downloading:   0%|          | 0.00/795k [00:00<?, ?B/s]

storing https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/5a8d9cfae5292220da4c510cc99a755ba1dd29bf0464fc146b75f8856c2c997f.c75624709974ce1c06c0a39ae4541636d4de56d62c43067ae6e161868adba40d
creating metadata file for /root/.cache/huggingface/transformers/5a8d9cfae5292220da4c510cc99a755ba1dd29bf0464fc146b75f8856c2c997f.c75624709974ce1c06c0a39ae4541636d4de56d62c43067ae6e161868adba40d
https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpvi0xogj8


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/2949560bf32460a4457c53f9aaaebac4634e3b5b528cfac1d518fa3239229b88.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for /root/.cache/huggingface/transformers/2949560bf32460a4457c53f9aaaebac4634e3b5b528cfac1d518fa3239229b88.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/5a8d9cfae5292220da4c510cc99a755ba1dd29bf0464fc146b75f8856c2c997f.c75624709974ce1c06c0a39ae4541636d4de56d62c43067ae6e161868adba40d
loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/added_tokens.json fr

Now let's create a classification dataset to load the data

In [40]:
class ClassificationDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(ClassificationDataset).__init__()
      """
      Args:
      text (List[str]): List of the training text
      target (List[str]): List of the training labels
      tokenizer_name (str): The tokenizer name (same as model_name).
      max_len (int): Maximum sentence length
      label_map (Dict[str,int]): A dictionary that maps the class labels to integer
      """
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())
        
      inputs = self.tokenizer(
          text,
          max_length=self.max_len,
          padding='max_length',
          truncation=True
      )      
      return InputFeatures(**inputs,label=self.label_map[self.target[item]])

In [41]:
label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)

train_dataset = ClassificationDataset(
    selected_dataset.train[DATA_COLUMN].to_list(),
    selected_dataset.train[STANCE_COLUMN].to_list(),
    model_name,
    max_len,
    label_map
  )
test_dataset = ClassificationDataset(
    selected_dataset.test[DATA_COLUMN].to_list(),
    selected_dataset.test[STANCE_COLUMN].to_list(),
    model_name,
    max_len,
    label_map
  )

{'NEGATIVE': 0, 'NEUTRAL': 1, 'POSITIVE': 2}


loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/5a8d9cfae5292220da4c510cc99a755ba1dd29bf0464fc146b75f8856c2c997f.c75624709974ce1c06c0a39ae4541636d4de56d62c43067ae6e161868adba40d
loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/2949560bf32460a4457c53f9aaaebac4634e3b5b528cfac1d518fa3239229b88.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/5358ed8

Create a function that return a pretrained model ready to do classification

In [42]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

Define whatever metric you want here

In [43]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {       
      'macro_f1' : macro_f1,
      'accuracy': acc
  }

In [44]:
def set_seed(seed=42):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic=True
  torch.backends.cudnn.benchmark = False

#Regular Training

Define our training parameters.
Check the TrainingArguments documentation for more options https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments

In [45]:
training_args = TrainingArguments( 
    output_dir= "./train",    
    adam_epsilon = 1e-8,
    learning_rate = 2e-5,
    fp16 = False, # enable this when using V100 or T4 GPU
    per_device_train_batch_size = 16, # up to 64 on 16GB with max len of 128
    per_device_eval_batch_size = 128,
    gradient_accumulation_steps = 2, # use this to scale batch size without needing more memory
    num_train_epochs= 4,
    warmup_ratio = 0,
    do_eval = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True, # this allows to automatically get the best model at the end based on whatever metric we want
    metric_for_best_model = 'macro_f1',
    greater_is_better = True,
    seed = 25
  )

set_seed(training_args.seed)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Create the trainer

In [46]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp_2gy8dv3


Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

storing https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/d9c5c84e874ab453668f4d4103039082cd50a0fbf6677cd935661dce907dac2a.406ec4f486aa88dc5da91165d2bebd998efa9fc277bb209a99c5fb73ae258a40
creating metadata file for /root/.cache/huggingface/transformers/d9c5c84e874ab453668f4d4103039082cd50a0fbf6677cd935661dce907dac2a.406ec4f486aa88dc5da91165d2bebd998efa9fc277bb209a99c5fb73ae258a40
loading configuration file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d9c5c84e874ab453668f4d4103039082cd50a0fbf6677cd935661dce907dac2a.406ec4f486aa88dc5da91165d2bebd998efa9fc277bb209a99c5fb73ae258a40
Model config BertConfig {
  "_name_or_path": "models/bert-large-arabertv02",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
 

Downloading:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

storing https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/612accb927b9184250c5f5efd1263c38581be8fc672937cc6f7540a97c63f333.ea90a3ece1102e24cf3d170960a2e998cd5dd780f288bf606c016f38dd9e66fe
creating metadata file for /root/.cache/huggingface/transformers/612accb927b9184250c5f5efd1263c38581be8fc672937cc6f7540a97c63f333.ea90a3ece1102e24cf3d170960a2e998cd5dd780f288bf606c016f38dd9e66fe
loading weights file https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/612accb927b9184250c5f5efd1263c38581be8fc672937cc6f7540a97c63f333.ea90a3ece1102e24cf3d170960a2e998cd5dd780f288bf606c016f38dd9e66fe
Some weights of the model checkpoint at aubmindlab/bert-large-arabertv02-twitter were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predicti

In [47]:
#start the training
trainer.train()

***** Running training *****
  Num examples = 6988
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 872


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy
0,No log,0.367976,0.648731,0.851
1,No log,0.377096,0.657249,0.844
2,0.357100,0.425094,0.657872,0.847
3,0.357100,0.45992,0.652441,0.847


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-218
Configuration saved in ./train/checkpoint-218/config.json
Model weights saved in ./train/checkpoint-218/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-436
Configuration saved in ./train/checkpoint-436/config.json
Model weights saved in ./train/checkpoint-436/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-654
Configuration saved in ./train/checkpoint-654/config.json
Model weights saved in ./train/checkpoint-654/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-872
Configuration saved in ./train/checkpoint-872/config.json
Model weights saved in ./train/checkpoint-872/pytorch_model.bin


Training completed. Do

TrainOutput(global_step=872, training_loss=0.28937673131260305, metrics={'train_runtime': 1763.0153, 'train_samples_per_second': 15.855, 'train_steps_per_second': 0.495, 'total_flos': 5136451713192360.0, 'train_loss': 0.28937673131260305, 'epoch': 4.0})

Save the model, the tokenizer and the config

In [48]:
inv_label_map = inv_label_map = { v:k for k, v in label_map.items()}
print(inv_label_map)
trainer.model.config.label2id = label_map
trainer.model.config.id2label = inv_label_map
trainer.save_model("output_dir")
train_dataset.tokenizer.save_pretrained("output_dir")

Saving model checkpoint to output_dir
Configuration saved in output_dir/config.json


{0: 'NEGATIVE', 1: 'NEUTRAL', 2: 'POSITIVE'}


Model weights saved in output_dir/pytorch_model.bin
tokenizer config file saved in output_dir/tokenizer_config.json
Special tokens file saved in output_dir/special_tokens_map.json


('output_dir/tokenizer_config.json',
 'output_dir/special_tokens_map.json',
 'output_dir/vocab.txt',
 'output_dir/added_tokens.json',
 'output_dir/tokenizer.json')

In [49]:
#copy the model to drive
!cp output_dir /content/drive/MyDrive

cp: -r not specified; omitting directory 'output_dir'


## predict using the saved model

In [50]:
from transformers import pipeline

In [51]:
# initialize pipline
pipe = pipeline("sentiment-analysis", model="output_dir", device=0, return_all_scores=True)

loading configuration file output_dir/config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-large-arabertv02-twitter",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "NEGATIVE",
    "1": "NEUTRAL",
    "2": "POSITIVE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "NEGATIVE": 0,
    "NEUTRAL": 1,
    "POSITIVE": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64000
}

loading configur

In [52]:
def find_pred(res):
  return max(res[0], key=lambda item: item['score'])['label']

predcition_arr = df_dev['text'].apply(pipe)
predcition = predcition_arr.apply(find_pred)
print(predcition)

from sklearn.metrics import classification_report
print(classification_report(df_dev[STANCE_COLUMN],predcition))



0      POSITIVE
1      POSITIVE
2      POSITIVE
3      POSITIVE
4      POSITIVE
         ...   
995    POSITIVE
996    POSITIVE
997    POSITIVE
998     NEUTRAL
999    POSITIVE
Name: text, Length: 1000, dtype: object
              precision    recall  f1-score   support

    NEGATIVE       0.60      0.54      0.57        70
     NEUTRAL       0.55      0.48      0.52       126
    POSITIVE       0.91      0.93      0.92       804

    accuracy                           0.85      1000
   macro avg       0.69      0.65      0.67      1000
weighted avg       0.84      0.85      0.84      1000



In [53]:
def read_testset(path):
    csv = pd.read_csv(path)
    ids= csv["id"]
    text= csv["text"]
    return ids,text
test_ids,test_text= read_testset("/content/test.csv")


In [54]:
predcition_arr = test_text.apply(pipe)
predcition = predcition_arr.apply(find_pred)




In [55]:
def write_test_file(ids,stances,path="/content/5.csv"):
    df= pd.DataFrame({
        "id":ids,
        "stance": stances          
    })
    df.to_csv(path,index= False)

write_test_file(test_ids,predcition)

In [56]:
# mount it
from google.colab import drive
drive.mount('/content/drive')
# copy it there
!cp -r /content/output_dir /content/drive/MyDrive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
