#installing dependencies

> Indented block



In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


This notebook works fine with transformers 4.12, it is not tested on newer versions

In [2]:
!pip install transformers==4.12.2
!pip install farasapy==0.0.14
!pip install pyarabic==0.6.14
!git clone https://github.com/aub-mind/arabert
!pip install emoji==1.6.1
!pip install sentencepiece==0.1.96

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.12.2
  Downloading transformers-4.12.2-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 34.3 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 80.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 71.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 66.9 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=40

Let's download some Arabic text classification datasets

#Creating training datasets

In [3]:
import pandas as pd
import numpy as np
from typing import List
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

This custom dataset class will help us hold our datasets in a structred manner.
It's not necessary to use it with your own data

In [4]:
class CustomDataset:
    def __init__(
        self,
        name: str,
        train: List[pd.DataFrame],
        test: List[pd.DataFrame],
        label_list: List[str],
    ):
        """Class to hold and structure datasets.

        Args:

        name (str): holds the name of the dataset so we can select it later
        train (List[pd.DataFrame]): holds training pandas dataframe with 2 columns ["text","label"]
        test (List[pd.DataFrame]): holds testing pandas dataframe with 2 columns ["text","label"]
        label_list (List[str]): holds the list  of labels
        """
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [5]:
DATA_COLUMN = "text"
LABEL_COLUMN = "label"
CATEGORY_COLUMN = "category"
STANCE_COLUMN = 'stance'
ID_COLUMN = 'id'

In [6]:
import pandas as pd
import numpy as np
import nltk


df_train = pd.read_csv("/content/train.csv")
df_dev = pd.read_csv("/content/dev.csv")
df_test = pd.read_csv("/content/test.csv")

df_train.columns = [DATA_COLUMN, CATEGORY_COLUMN , STANCE_COLUMN  ]
df_dev.columns   = [DATA_COLUMN, CATEGORY_COLUMN , STANCE_COLUMN  ]
df_test.columns  = [ID_COLUMN  , DATA_COLUMN ]

df_train[STANCE_COLUMN] = df_train[STANCE_COLUMN] + 1
df_dev[STANCE_COLUMN]   = df_dev[STANCE_COLUMN]  + 1


stance_map = {
     2: 'POSITIVE',
     1: 'NEUTRAL',
     0: 'NEGATIVE'
}


df_train[STANCE_COLUMN] = df_train[STANCE_COLUMN].apply(lambda x: stance_map[x])
df_dev[STANCE_COLUMN] = df_dev[STANCE_COLUMN].apply(lambda x: stance_map[x])

stance_list   = ['NEGATIVE', 'NEUTRAL' ,'POSITIVE' ]
category_list = ['advice', 'celebrity' ,'info_news', 'others' , 'personal' , 'plan' , 'requests', 'restrictions' , 'rumors' , 'unrelated' ]


df_train_stance = pd.DataFrame({
        "text":df_train[DATA_COLUMN],
        "stance": df_train[STANCE_COLUMN]
})

df_dev_stance = pd.DataFrame({
        "text":df_dev[DATA_COLUMN],
        "stance": df_dev[STANCE_COLUMN]
})

df_train_category = pd.DataFrame({
        "text":df_train[DATA_COLUMN],
        "category": df_train[CATEGORY_COLUMN]
})

df_dev_category = pd.DataFrame({
        "text":df_dev[DATA_COLUMN],
        "category": df_dev[CATEGORY_COLUMN]
})

dataset1 = CustomDataset("stance", df_train_stance, df_dev_stance, stance_list)
dataset2 = CustomDataset("category", df_train_category, df_dev_category, category_list)


print(df_train[STANCE_COLUMN].value_counts())
print(df_train[CATEGORY_COLUMN].value_counts())

POSITIVE    5538
NEUTRAL     1012
NEGATIVE     438
Name: stance, dtype: int64
info_news       3616
personal        1025
celebrity        975
plan             606
unrelated        323
others           167
requests         112
rumors            79
advice            67
restrictions      18
Name: category, dtype: int64


##ASTD- Unbalanced

#Trainer

Start the training procedure

In [7]:
import numpy as np
import torch
import random
import matplotlib.pyplot as plt
import copy

from arabert.preprocess import ArabertPreprocessor
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from torch.utils.data import DataLoader, Dataset
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, BertTokenizer, Trainer,
                          TrainingArguments)
from transformers.data.processors.utils import InputFeatures

List all the datasets we have

In [8]:
dataset_name = 'category'
model_name = 'aubmindlab/bert-base-arabertv02-twitter' # we are going to use the twitter AraBERT since it has emojis and dialects

Create and apply preprocessing using the AraBERT processor

In [9]:
selected_dataset = copy.deepcopy(dataset2)
arabic_prep = ArabertPreprocessor(model_name)

selected_dataset.train[DATA_COLUMN] = selected_dataset.train[DATA_COLUMN].apply(lambda x: arabic_prep.preprocess(x))
selected_dataset.test[DATA_COLUMN] = selected_dataset.test[DATA_COLUMN].apply(lambda x: arabic_prep.preprocess(x))  

Now we need to check the tokenized sentence length to decide on the maximum sentence length value

In [10]:
tok = AutoTokenizer.from_pretrained(model_name)
max_len = 101

Downloading:   0%|          | 0.00/476 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Now let's create a classification dataset to load the data

In [11]:
class ClassificationDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(ClassificationDataset).__init__()
      """
      Args:
      text (List[str]): List of the training text
      target (List[str]): List of the training labels
      tokenizer_name (str): The tokenizer name (same as model_name).
      max_len (int): Maximum sentence length
      label_map (Dict[str,int]): A dictionary that maps the class labels to integer
      """
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())
        
      inputs = self.tokenizer(
          text,
          max_length=self.max_len,
          padding='max_length',
          truncation=True
      )      
      return InputFeatures(**inputs,label=self.label_map[self.target[item]])

In [12]:
label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)

train_dataset = ClassificationDataset(
    selected_dataset.train[DATA_COLUMN].to_list(),
    selected_dataset.train[CATEGORY_COLUMN].to_list(),
    model_name,
    max_len,
    label_map
  )
test_dataset = ClassificationDataset(
    selected_dataset.test[DATA_COLUMN].to_list(),
    selected_dataset.test[CATEGORY_COLUMN].to_list(),
    model_name,
    max_len,
    label_map
  )

{'advice': 0, 'celebrity': 1, 'info_news': 2, 'others': 3, 'personal': 4, 'plan': 5, 'requests': 6, 'restrictions': 7, 'rumors': 8, 'unrelated': 9}


Create a function that return a pretrained model ready to do classification

In [13]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

Define whatever metric you want here

In [14]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {       
      'macro_f1' : macro_f1,
      'accuracy': acc
  }

In [15]:
def set_seed(seed=42):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic=True
  torch.backends.cudnn.benchmark = False

#Regular Training

Define our training parameters.
Check the TrainingArguments documentation for more options https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments

In [16]:
training_args = TrainingArguments( 
    output_dir= "./train",    
    adam_epsilon = 1e-8,
    learning_rate = 2e-4,
    fp16 = False, # enable this when using V100 or T4 GPU
    per_device_train_batch_size = 16, # up to 64 on 16GB with max len of 128
    per_device_eval_batch_size = 128,
    gradient_accumulation_steps = 2, # use this to scale batch size without needing more memory
    num_train_epochs= 4,
    warmup_ratio = 0,
    do_eval = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True, # this allows to automatically get the best model at the end based on whatever metric we want
    metric_for_best_model = 'macro_f1',
    greater_is_better = True,
    seed = 25
  )

set_seed(training_args.seed)

Create the trainer

In [17]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/667 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/516M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmi

In [18]:
#start the training
trainer.train()

***** Running training *****
  Num examples = 6988
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 872


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy
0,No log,0.931261,0.336473,0.692
1,No log,0.9099,0.330246,0.698
2,0.865200,1.067639,0.41052,0.683
3,0.865200,1.224327,0.425773,0.667


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-218
Configuration saved in ./train/checkpoint-218/config.json
Model weights saved in ./train/checkpoint-218/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-436
Configuration saved in ./train/checkpoint-436/config.json
Model weights saved in ./train/checkpoint-436/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-654
Configuration saved in ./train/checkpoint-654/config.json
Model weights saved in ./train/checkpoint-654/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 128
Saving model checkpoint to ./train/checkpoint-872
Configuration saved in ./train/checkpoint-872/config.json
Model weights saved in ./train/checkpoint-872/pytorch_model.bin


Training completed. Do

TrainOutput(global_step=872, training_loss=0.6716084611525229, metrics={'train_runtime': 558.9864, 'train_samples_per_second': 50.005, 'train_steps_per_second': 1.56, 'total_flos': 1450267467282480.0, 'train_loss': 0.6716084611525229, 'epoch': 4.0})

Save the model, the tokenizer and the config

In [19]:
inv_label_map = inv_label_map = { v:k for k, v in label_map.items()}
print(inv_label_map)
trainer.model.config.label2id = label_map
trainer.model.config.id2label = inv_label_map
trainer.save_model("output_dir")
train_dataset.tokenizer.save_pretrained("output_dir")

Saving model checkpoint to output_dir
Configuration saved in output_dir/config.json


{0: 'advice', 1: 'celebrity', 2: 'info_news', 3: 'others', 4: 'personal', 5: 'plan', 6: 'requests', 7: 'restrictions', 8: 'rumors', 9: 'unrelated'}


Model weights saved in output_dir/pytorch_model.bin
tokenizer config file saved in output_dir/tokenizer_config.json
Special tokens file saved in output_dir/special_tokens_map.json


('output_dir/tokenizer_config.json',
 'output_dir/special_tokens_map.json',
 'output_dir/vocab.txt',
 'output_dir/added_tokens.json',
 'output_dir/tokenizer.json')

In [20]:
#copy the model to drive
!cp output_dir /content/drive/MyDrive

cp: -r not specified; omitting directory 'output_dir'


## predict using the saved model

In [21]:
from transformers import pipeline

In [22]:
# initialize pipline
pipe = pipeline("sentiment-analysis", model="output_dir", device=0, return_all_scores=True)

loading configuration file output_dir/config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-base-arabertv02-twitter",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "advice",
    "1": "celebrity",
    "2": "info_news",
    "3": "others",
    "4": "personal",
    "5": "plan",
    "6": "requests",
    "7": "restrictions",
    "8": "rumors",
    "9": "unrelated"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "advice": 0,
    "celebrity": 1,
    "info_news": 2,
    "others": 3,
    "personal": 4,
    "plan": 5,
    "requests": 6,
    "restrictions": 7,
    "rumors": 8,
    "unrelated": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_

In [26]:
def find_pred(res):
  return max(res[0], key=lambda item: item['score'])['label']

predcition_arr = df_dev['text'].apply(pipe)
predcition = predcition_arr.apply(find_pred)
print(predcition)

from sklearn.metrics import classification_report
print(classification_report(df_dev[CATEGORY_COLUMN],predcition))



0      info_news
1      info_news
2      celebrity
3      info_news
4      info_news
         ...    
995     personal
996    info_news
997         plan
998    info_news
999    info_news
Name: text, Length: 1000, dtype: object
              precision    recall  f1-score   support

      advice       0.40      0.20      0.27        10
   celebrity       0.86      0.88      0.87       145
   info_news       0.74      0.80      0.77       545
      others       0.07      0.06      0.06        17
    personal       0.54      0.56      0.55       128
        plan       0.35      0.27      0.31        82
    requests       0.23      0.15      0.18        20
restrictions       1.00      0.50      0.67         2
      rumors       0.38      0.20      0.26        15
   unrelated       0.54      0.42      0.47        36

    accuracy                           0.68      1000
   macro avg       0.51      0.40      0.44      1000
weighted avg       0.66      0.68      0.67      1000



In [27]:
def read_testset(path):
    csv = pd.read_csv(path)
    ids= csv["id"]
    text= csv["text"]
    return ids,text
test_ids,test_text= read_testset("/content/test.csv")


In [28]:
predcition_arr = test_text.apply(pipe)
predcition = predcition_arr.apply(find_pred)




In [29]:
def write_test_file(ids,category,path="/content/5.csv"):
    df= pd.DataFrame({
        "id":ids,
        "category": category          
    })
    df.to_csv(path,index= False)

write_test_file(test_ids,predcition)

In [30]:
# mount it
from google.colab import drive
drive.mount('/content/drive')
# copy it there
!cp -r /content/output_dir /content/drive/MyDrive

Mounted at /content/drive
