In [1]:
import transformers
from arabert import ArabertPreprocessor 
from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel
print("Sucessfully imported Arabert and Grover GPT-2 model") 


Sucessfully imported Arabert and Grover GPT-2 model


# Fine-tuning Pipeline 
## Data Loading 
  
## Preprocessing 
## Training 


In [2]:
import pandas as pd 
import numpy as np 
# train test split dataset from sklearn 
from sklearn.model_selection import train_test_split  

model_name = "aubmindlab/aragpt2-base"  
dataset_path = "../datasets/Moroccan_Darija_Offensive_Language_Detection_Dataset.csv"

# Load dataset 
df = pd.read_csv(dataset_path) 
print(f"Dataset Loaded {len(df)} tweets")
print(df.head()) 






Dataset Loaded 20402 tweets
                                                text  label
0  والله ما كاين لخرج علي تلاميذ من غيركم لجينه ن...      0
1  لا توجد مستعمراات اسبانيه ولكن توجد المساله ال...      0
2  akbare nasabe nahaba lmalayire layatawara3o fi...      1
3  h 9dima dik l3ba yama hergthom bach tchdo assu...      0
4  نحن درنا التلقيح ورغم دلك اصبنا ملي درنا التلق...      0


In [3]:
# init arabert preprocessor 
arabert_prep = ArabertPreprocessor(model_name=model_name)
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    df['text'].tolist(), 
    df['label'].tolist(),
    test_size=0.25, 
    random_state=30,
)

In [13]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
import torch 
from huggingface_hub import snapshot_download 


# Tokenization 
model_path = snapshot_download(repo_id=model_name) 


tokenizer = GPT2Tokenizer.from_pretrained(model_path) 
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a pad token, so we use eos token

def tokenize_function(tweets, labels) : 
    encodings = tokenizer(tweets, truncation=True, padding=True, max_length=128) 
    data = [] 
    for i in range(len(tweets)) : 
        row = {key: torch.tensor(value[i]) for key, value in encodings.items()}
        row['labels'] = torch.tensor(labels[i])
        data.append(row) 
    return data

train_dataset = tokenize_function(train_tweets, train_labels)
val_dataset = tokenize_function(val_tweets, val_labels)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}") 




Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Didn't find file /Users/saidibenariba/.cache/huggingface/hub/models--aubmindlab--aragpt2-base/snapshots/257d7289dac3605a6cd903ff71d67a52fc5d09aa/added_tokens.json. We won't load it.
Didn't find file /Users/saidibenariba/.cache/huggingface/hub/models--aubmindlab--aragpt2-base/snapshots/257d7289dac3605a6cd903ff71d67a52fc5d09aa/special_tokens_map.json. We won't load it.
Didn't find file /Users/saidibenariba/.cache/huggingface/hub/models--aubmindlab--aragpt2-base/snapshots/257d7289dac3605a6cd903ff71d67a52fc5d09aa/tokenizer_config.json. We won't load it.
loading file /Users/saidibenariba/.cache/huggingface/hub/models--aubmindlab--aragpt2-base/snapshots/257d7289dac3605a6cd903ff71d67a52fc5d09aa/vocab.json
loading file /Users/saidibenariba/.cache/huggingface/hub/models--aubmindlab--aragpt2-base/snapshots/257d7289dac3605a6cd903ff71d67a52fc5d09aa/merges.txt
loading file None
loading file None
loading file None
loading configuration file /Users/saidibenariba/.cache/huggingface/hub/models--aubmind

Train dataset size: 15301
Validation dataset size: 5101


In [14]:
## model loading 
model = GPT2ForSequenceClassification.from_pretrained(model_path, num_labels=2)
model.config.pad_token_id = model.config.eos_token_id  # Set pad token ID to eos token ID

print("Model and tokenizer are set up and ready for training.")

loading configuration file /Users/saidibenariba/.cache/huggingface/hub/models--aubmindlab--aragpt2-base/snapshots/257d7289dac3605a6cd903ff71d67a52fc5d09aa/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50,
      "no_repeat_ngra

Model and tokenizer are set up and ready for training.


In [None]:
from sklearn.metrics import accuracy_score, f1_score 
def compute_metrics(pred) : 
    labels = pred.label_ids 
    preds = pred.predictions.argmax(-1) 
    acc = accuracy_score(labels, preds) 
    f1 = f1_score(labels, preds, average='weighted') 
    return {
        'accuracy' : acc, 
        'f1' : f1
    }
training_arguments = TrainingArguments(
    output_dir="./results_aragpt2", 
    num_train_epochs=3, 
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8, 
    evaluation_strategy="epoch", 
    save_strategy="epoch", 
    logging_dir="./logs", 
    logging_steps=10, 
    load_best_model_at_end=True, 
    metric_for_best_model="f1",
) 
trainer = Trainer(
    model=model, 
    args=training_arguments, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset, 
    compute_metrics=compute_metrics,
)
print("Start Fine tuning...") 
trainer.train() 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 15301
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5739


Start Fine tuning...


  0%|          | 0/5739 [00:00<?, ?it/s]

{'loss': 0.7729, 'learning_rate': 4.9912876807806244e-05, 'epoch': 0.01}
{'loss': 0.5901, 'learning_rate': 4.982575361561248e-05, 'epoch': 0.01}
{'loss': 0.6777, 'learning_rate': 4.973863042341871e-05, 'epoch': 0.02}
{'loss': 0.6083, 'learning_rate': 4.9651507231224954e-05, 'epoch': 0.02}
{'loss': 0.7049, 'learning_rate': 4.956438403903119e-05, 'epoch': 0.03}
{'loss': 0.6828, 'learning_rate': 4.947726084683743e-05, 'epoch': 0.03}
{'loss': 0.6062, 'learning_rate': 4.939013765464367e-05, 'epoch': 0.04}
{'loss': 0.6701, 'learning_rate': 4.9303014462449906e-05, 'epoch': 0.04}
{'loss': 0.6275, 'learning_rate': 4.921589127025614e-05, 'epoch': 0.05}
{'loss': 0.7139, 'learning_rate': 4.912876807806238e-05, 'epoch': 0.05}
{'loss': 0.5956, 'learning_rate': 4.9041644885868624e-05, 'epoch': 0.06}
{'loss': 0.6442, 'learning_rate': 4.895452169367486e-05, 'epoch': 0.06}
