<a href="https://colab.research.google.com/github/Skander28/Models/blob/main/XLMRoberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.  

    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi
    
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
Fri May  5 07:43:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    27W /  70W |   6945MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------

In [30]:
# install needed libraries
!pip install pyarabic
!pip install emoji
!pip install pystemmer
!pip install optuna==2.3.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
# import needed libraries
import numpy as np
import pandas as pd
import pyarabic.araby as ar

import re , emoji, Stemmer, functools, operator, string
import torch , optuna, gc, random, os

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample

import logging

# Logger named after this module, plus a root logging configuration at WARNING level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

In [33]:

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Table holding the tweet ids together with their dialect labels.
tweets_ids = pd.read_csv('/content/drive/MyDrive/dialect_dataset.csv')

# Table holding the tweet texts; its two columns are renamed to (id, tweets).
df_clean = pd.read_csv('/content/drive/MyDrive/messages.csv', lineterminator='\n')
column_names = ['id', 'tweets']  # list of column names
df_clean.columns = column_names

# Attach each text to its label via the shared id (rows present in both tables only).
tweets_dataset = tweets_ids.merge(df_clean, on='id', how='inner')

# Keep only the four dialects of interest.
wanted_dialects = tweets_dataset['dialect'].isin(['TN','DZ','MA','LY'])
filtered_df = tweets_dataset[wanted_dialects]
filtered_df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,id,dialect,tweets
15497,1009754958479151232,LY,@toha_Altomy @gy_yah قليلين ادب ومنافقين. لو ا...
15498,1009794751548313600,LY,@AlmFaisal 😂😂 الليبيين متقلبين!!!\nبس بالنسبة ...
15499,1019989115490787200,LY,@smsm071990 @ALMOGRBE كل 20 تانيه شاب ليبي بير...
15500,1035479791758135168,LY,@AboryPro @lyranoo85 رانيا عقليتك متخلفة. اولا...
15501,1035481122921164800,LY,@lyranoo85 شكلك متعقدة علشان الراجل لي تحبيه ا...


In [34]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by randomly duplicating rows of the under-represented dialects.
# NOTE(review): this resampling runs on the full frame, before the
# train/validation/test split performed in a later cell, so copies of the same
# tweet can land on both sides of the split and inflate the reported scores.
# Consider splitting first and oversampling only the training part.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(
    filtered_df.drop(columns='dialect'),
    filtered_df['dialect'],
)

# Glue the resampled features and labels back into a single frame.
filtered_df = pd.concat([X_resampled, y_resampled], axis=1)

In [35]:
# Simple record describing a dataset: its name, its two splits and its labels.
# NOTE: this definition rebinds the name `Dataset`, which the import cell had
# bound to torch.utils.data.Dataset.
class Dataset:
    """Container for a named dataset.

    Attributes (all stored exactly as passed in):
        name: identifier of the dataset.
        train: training split.
        test: test/evaluation split.
        label_list: the class labels.
    """

    def __init__(self, name, train, test, label_list):
        self.name, self.train = name, train
        self.test, self.label_list = test, label_list

In [36]:
class BERTModelDataset(Dataset):
    """Tokenizes one text per item and pairs it with its integer class id.

    Args:
        text: indexable collection of raw texts.
        target: indexable collection of labels, aligned with ``text``.
        model_name: checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        max_len: fixed length every encoded item is padded / truncated to.
        label_map: mapping from a label found in ``target`` to its class id.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # The original `super(BERTModelDataset).__init__()` never invoked the
        # base-class initializer, so that misleading call has been dropped.
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Number of examples."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the ``InputFeatures`` for the example at position ``item``."""
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.text[item]).split())

        encoded_review = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation='longest_first',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Tensors are left on the CPU: device placement belongs to the training
        # loop (the Hugging Face Trainer moves each collated batch to its own
        # device), and CPU tensors keep the dataset usable with DataLoader
        # workers and pinned memory.
        return InputFeatures(
            input_ids=encoded_review['input_ids'].flatten(),
            attention_mask=encoded_review['attention_mask'].flatten(),
            label=self.label_map[self.target[item]],
        )

In [37]:
def data_cleaning(text):
    """Normalise one tweet before tokenization.

    Steps, in order: remove URLs, collapse whitespace, remove digits, strip
    Arabic diacritics (tashkeel) and tatweel, replace ``#``, ``@`` and ``_``
    with spaces, delete ASCII punctuation, and unify alef / hamza letter
    variants.

    Args:
        text: the raw tweet. Any non-string value (e.g. NaN for a missing
            tweet) is returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    # Explicit guard instead of a bare `except:` — non-strings are still passed
    # through untouched, but genuine errors are no longer silently swallowed.
    if not isinstance(text, str):
        return text

    # URLs: lines that start with a link first, then links anywhere in the text.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"https\S+", "", text)

    # Whitespace and digits.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"(\s\d+)", "", text)
    # NOTE(review): the first alternative (`$\d+\W+`) appears unable to match,
    # since digits cannot follow the end-of-string anchor; `^` may have been intended.
    text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", text)
    text = re.sub(r"\d+", " ", text)

    # Arabic diacritics and elongation marks.
    text = ar.strip_tashkeel(text)
    text = ar.strip_tatweel(text)

    # Hashtag / mention / underscore markers become spaces; ASCII punctuation is dropped.
    text = text.replace("#", " ")
    text = text.replace("@", " ")
    text = text.replace("_", " ")
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Unify letter variants.
    text = text.replace("آ", "ا")
    text = text.replace("إ", "ا")
    text = text.replace("أ", "ا")
    text = text.replace("ؤ", "و")
    text = text.replace("ئ", "ي")

    return text

In [38]:
# Run every tweet through data_cleaning and store the result back in the same column.
filtered_df['tweets'] = filtered_df['tweets'].apply(data_cleaning)

In [39]:
# Sequence length = longest tweet measured in whitespace-separated words, plus a margin.
Extra_Len = 6 # an extra padding in length , found to be useful for increasing F-score
words_per_tweet = filtered_df["tweets"].str.split().str.len()
Max_Len = words_per_tweet.max() + Extra_Len

print(Max_Len)

# Split settings: the same fraction is held out at both stages.
Test_Size = 0.15
Rand_Seed = 42 

# Stage 1: hold out the test set from the full data.
train_set, test_set = train_test_split(filtered_df, test_size=Test_Size, random_state=Rand_Seed + 1)

# Stage 2: hold out the validation ("evaluation") set from what remains.
train_set, evaluation_set = train_test_split(train_set, test_size=Test_Size, random_state=Rand_Seed)

# Report the class balance of each split, with a rule between consecutive reports.
for split_position, (split_title, split_frame) in enumerate((
    ("Train set: ", train_set),
    ("Evaluation set: ", evaluation_set),
    ("test set: ", test_set),
)):
    if split_position:
        print("---------------------------")
    print(split_title)
    print(split_frame["dialect"].value_counts())

94
Train set: 
LY    26499
DZ    26399
TN    26313
MA    26270
Name: dialect, dtype: int64
---------------------------
Evaluation set: 
TN    4741
MA    4706
DZ    4695
LY    4473
Name: dialect, dtype: int64
---------------------------
test set: 
LY    5527
MA    5523
TN    5445
DZ    5405
Name: dialect, dtype: int64


In [40]:
# Checkpoint used for both the tokenizer and the classifier backbone.
Model_Used = "Davlan/xlm-roberta-base-finetuned-arabic"
Task_Name = "classification"

def model_init():
    """Build a fresh sequence-classification model from the ``Model_Used`` checkpoint.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``,
        with one output per distinct dialect in the training split.
    """
    # Same value as len(label_map), but computed from train_set so that this
    # function does not depend on a variable created in a later cell and also
    # works on a fresh Restart & Run All.
    num_labels = len(train_set["dialect"].unique())
    return AutoModelForSequenceClassification.from_pretrained(Model_Used, return_dict=True, num_labels=num_labels)

def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: should be of type EvalPrediction; ``p.predictions`` holds per-class
            scores and ``p.label_ids`` the reference class ids.

    Returns:
        dict with macro F1, macro F1 restricted to class ids 1 and 2,
        macro precision, macro recall and accuracy.

    Side effect: prints the scikit-learn classification report.
    """
    gold = p.label_ids
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(gold)
    print(classification_report(gold, preds))

    def macro(score_fn, **extra):
        # Macro-averaged variant of a scikit-learn scoring function.
        return score_fn(gold, preds, average='macro', **extra)

    # NOTE(review): only class ids 1 and 2 enter this score; the "pos_neg" name
    # presumably comes from a sentiment task — confirm it is meaningful here.
    pos_neg_f1 = macro(f1_score, labels=[1,2])

    return {
        'macro_f1' : macro(f1_score),
        'macro_f1_pos_neg' : pos_neg_f1,
        'macro_precision': macro(precision_score),
        'macro_recall': macro(recall_score),
        'accuracy': accuracy_score(gold, preds),
    }

def set_seed(seed):
    """Seed the PyTorch (CPU and CUDA), NumPy and Python RNGs with ``seed``.

    Also sets the cuDNN flags ``deterministic=True`` / ``benchmark=False`` and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.
    """
    # PyTorch generators first, CPU then all CUDA devices.
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Host-side generators.
    for seed_host in (np.random.seed, random.seed):
        seed_host(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

In [41]:
# Build the model once so the cell output shows the checkpoint download and the module layout.
# The instance returned here is not kept; the Trainer cell calls model_init() again.
model_init()

Downloading (…)lve/main/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at Davlan/xlm-roberta-base-finetuned-arabic were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-finetuned-arabic and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [42]:
# Class names, in order of first appearance in the training split.
label_list = train_set["dialect"].unique().tolist()
print(label_list)
print(train_set["dialect"].value_counts())

data_set = Dataset("OLY", train_set, evaluation_set, label_list)

# Class name -> integer id, following the order of label_list.
label_map = dict(zip(label_list, range(len(label_list))))

# Print the label map
print(label_map)

# One tokenizing dataset per split, all sharing the checkpoint, length and label map.
train_dataset, evaluation_dataset, test_dataset = (
    BERTModelDataset(
        split_frame["tweets"].to_list(),
        split_frame["dialect"].to_list(),
        Model_Used,
        int(Max_Len),
        label_map,
    )
    for split_frame in (train_set, evaluation_set, test_set)
)

['MA', 'DZ', 'TN', 'LY']
LY    26499
DZ    26399
TN    26313
MA    26270
Name: dialect, dtype: int64
{'MA': 0, 'DZ': 1, 'TN': 2, 'LY': 3}


Downloading (…)okenizer_config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [46]:
# Define the training arguments.
# Every option is passed to the constructor so that TrainingArguments can
# validate and normalise it, instead of being patched onto the instance afterwards.
# The former `evaluate_during_training = True` assignment is omitted: it is an
# obsolete flag, and per-epoch evaluation is requested via evaluation_strategy.
training_args = TrainingArguments(
    output_dir="./train",
    lr_scheduler_type='cosine',
    adam_epsilon=1e-8,
    learning_rate=2e-05,
    fp16=True,  # mixed precision; requires a CUDA device
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # effective train batch = 64 * 2 per device
    num_train_epochs=4,
    warmup_steps=0,
    evaluation_strategy=EvaluationStrategy.EPOCH,
    seed=42,
    disable_tqdm=False,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [47]:
# Seed all RNGs before the model is created so its new classification head is reproducible.
set_seed(Rand_Seed)

# Drop unreachable Python objects and release cached GPU memory before loading the model.
gc.collect()
torch.cuda.empty_cache()

# Turn off pinned-memory loading in the DataLoaders.
training_args.dataloader_pin_memory = False

trainer = Trainer(
    args=training_args,
    model=model_init(),
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=evaluation_dataset,
)

print(training_args.seed)

Some weights of the model checkpoint at Davlan/xlm-roberta-base-finetuned-arabic were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-finetuned-arabic and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 

42


In [48]:
import os
# Ask the Hugging Face logging integrations to skip Weights & Biases.
# NOTE(review): this is set after TrainingArguments and the Trainer were built in
# earlier cells; confirm it still takes effect on a fresh Restart & Run All.
os.environ["WANDB_DISABLED"] = "true"
# Fine-tune the model; compute_metrics prints a classification report at each evaluation.
trainer.train()



Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy
0,0.9973,0.539845,0.799367,0.790462,0.802016,0.798679,0.799194
2,0.461,0.357169,0.874457,0.874833,0.876946,0.874225,0.875262
2,0.3627,0.305231,0.897483,0.903279,0.898911,0.897444,0.898362
3,0.2658,0.290319,0.905437,0.908586,0.906513,0.905312,0.906151




              precision    recall  f1-score   support

           0       0.85      0.82      0.83      4706
           1       0.73      0.82      0.78      4695
           2       0.81      0.80      0.81      4741
           3       0.81      0.76      0.78      4473

    accuracy                           0.80     18615
   macro avg       0.80      0.80      0.80     18615
weighted avg       0.80      0.80      0.80     18615





              precision    recall  f1-score   support

           0       0.90      0.91      0.90      4706
           1       0.84      0.88      0.86      4695
           2       0.87      0.91      0.89      4741
           3       0.90      0.80      0.85      4473

    accuracy                           0.88     18615
   macro avg       0.88      0.87      0.87     18615
weighted avg       0.88      0.88      0.87     18615





              precision    recall  f1-score   support

           0       0.89      0.94      0.91      4706
           1       0.88      0.89      0.89      4695
           2       0.91      0.93      0.92      4741
           3       0.91      0.83      0.87      4473

    accuracy                           0.90     18615
   macro avg       0.90      0.90      0.90     18615
weighted avg       0.90      0.90      0.90     18615





              precision    recall  f1-score   support

           0       0.92      0.93      0.93      4706
           1       0.88      0.91      0.89      4695
           2       0.91      0.94      0.93      4741
           3       0.91      0.84      0.88      4473

    accuracy                           0.91     18615
   macro avg       0.91      0.91      0.91     18615
weighted avg       0.91      0.91      0.91     18615



TrainOutput(global_step=3296, training_loss=0.4803632139002235, metrics={'train_runtime': 2414.7626, 'train_samples_per_second': 174.727, 'train_steps_per_second': 1.365, 'total_flos': 2.0371925305011696e+16, 'train_loss': 0.4803632139002235, 'epoch': 4.0})

In [49]:
# Run the fine-tuned model on the held-out test split.
# Because the trainer was built with compute_metrics, this also prints a classification report.
test_results = trainer.predict(test_dataset)

# Show the loss, the test_-prefixed metrics and the runtime statistics.
print(test_results.metrics)



              precision    recall  f1-score   support

           0       0.91      0.93      0.92      5523
           1       0.88      0.91      0.89      5405
           2       0.90      0.94      0.92      5445
           3       0.92      0.84      0.88      5527

    accuracy                           0.91     21900
   macro avg       0.91      0.91      0.90     21900
weighted avg       0.91      0.91      0.90     21900

{'test_loss': 0.2902698218822479, 'test_macro_f1': 0.9049767961912552, 'test_macro_f1_pos_neg': 0.9078652257387383, 'test_macro_precision': 0.9056707497278693, 'test_macro_recall': 0.9054416786293813, 'test_accuracy': 0.905296803652968, 'test_runtime': 38.6573, 'test_samples_per_second': 566.517, 'test_steps_per_second': 17.72}


In [50]:
# Save only the model weights (state_dict) to "trainer.pth".
# The path is relative, so the file lands in the current working directory.
torch.save(trainer.model.state_dict(), "trainer.pth")

In [54]:
import torch
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification

# Class id -> dialect name, derived from the training-time label_map (name -> id).
# Deriving it (instead of hard-coding it and overwriting label_map) keeps the
# mapping in sync with training and leaves label_map usable by earlier cells.
id_to_label = {index: dialect for dialect, index in label_map.items()}

model = XLMRobertaForSequenceClassification.from_pretrained(Model_Used, num_labels=len(id_to_label)).to(device)

# Load the fine-tuned weights from the same relative path the save cell wrote to.
# map_location lets a checkpoint saved on the GPU be restored on a CPU-only runtime.
model_state_dict = torch.load("trainer.pth", map_location=device)
model.load_state_dict(model_state_dict)
model.eval()  # inference mode: dropout disabled

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(Model_Used)

# Sentence to classify (alternative examples are kept commented out).
#input_sentence = "نبي نروح للحوش "
input_sentence  = "شوكران علا هاد "
#input_sentence  = "فرحان و نتشرف الي انا من اقلية الشعب الي عمري ما صدقت كذبة الثورة و الديمقراطية و ما عداها عليا حتى سياسي ملي حكمو من 2011 و ما تحكموش في تفكيري باجات فيسبوك و اعلاميين مرتزقة و سبر اراء غالط"

# Apply the same cleaning the training tweets went through, then tokenize.
cleaned_sentence = data_cleaning(input_sentence)
tokenized_input = tokenizer(cleaned_sentence, padding=True, truncation=True, max_length=int(Max_Len), return_tensors='pt')
input_ids = tokenized_input['input_ids'].to(device)
attention_mask = tokenized_input['attention_mask'].to(device)

# Forward pass without gradient tracking (nothing is being trained here).
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
predicted_labels = torch.argmax(outputs.logits, dim=1)
# Softmax probability of the predicted class.
confidence_score = torch.softmax(outputs.logits, dim=1)[0][predicted_labels]

# Print the predicted class and its confidence
predicted_class = id_to_label[predicted_labels.item()]
print("Predicted class:", predicted_class)
print("Confidence score:", confidence_score.item())


Some weights of the model checkpoint at Davlan/xlm-roberta-base-finetuned-arabic were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-finetuned-arabic and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 

Predicted class: MA
Confidence score: 0.9967768788337708
