In [14]:
import torch
print(torch.cuda.is_available()) 
print(torch.cuda.current_device())  


True
0


In [15]:
import torch

if torch.cuda.is_available():    

    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3080 Ti Laptop GPU
Sat Jan 18 17:57:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   47C    P8             13W /  131W |     862MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+--------

In [1]:
import numpy as np
import pandas as pd
import pyarabic.araby as ar

import re , emoji, Stemmer, functools, operator, string
import torch , optuna, gc, random, os

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
st =  Stemmer.Stemmer('arabic')
def data_cleaning (text):
  text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r"https\S+", "", text)
  text = re.sub(r'\s+', ' ', text)
  text = re.sub("(\s\d+)","",text) 
  text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", text)
  text = re.sub("\d+", " ", text)
  text = ar.strip_tashkeel(text)
  text = ar.strip_tatweel(text)
  text = text.replace("#", " ");
  text = text.replace("@", " ");
  text = text.replace("_", " ");
  translator = str.maketrans('', '', string.punctuation)
  text = text.translate(translator)
  em = text
  em_split_emoji = emoji.get_emoji_regexp().split(em)
  em_split_whitespace = [substr.split() for substr in em_split_emoji]
  em_split = functools.reduce(operator.concat, em_split_whitespace)
  text = " ".join(em_split)
  text = re.sub(r'(.)\1+', r'\1', text)
  text_stem = " ".join([st.stemWord(i) for i in text.split()])
  text = text +" "+ text_stem
  text = text.replace("آ", "ا")
  text = text.replace("إ", "ا")
  text = text.replace("أ", "ا")
  text = text.replace("ؤ", "و")
  text = text.replace("ئ", "ي")
   
  return text

In [4]:

Use_Train_Extended_Data = False 

Tweets_Ids_Col_Train ="Tweet_ID"
Tweets_Text_Col_Train = "Tweet"
Tweets_Sentiment_Col_Train = "Label"
Train_Data_File = "train_data.csv"

train_data = pd.DataFrame()


train_data = pd.read_csv(Train_Data_File, sep=",")

print(train_data[Tweets_Sentiment_Col_Train].value_counts())
print(train_data.value_counts())

Label
OBJ        5233
NEG        1317
NEUTRAL     654
POS         627
Name: count, dtype: int64
Tweet_ID  Tweet                                                                                                       Label  
1         #الرياض #أمطار_الرياض #السعودية #مطر #الرياض_تغرق #أمطار_الرياض #قحاب                                       OBJ        1
5232      _:  #التأسيسية " على وشك الانفجار بعد عودة التهديد بالانسحاب "                                              OBJ        1
5230      بطاقة دعوة #زواج حلوه لـ #الواتس_اب من تصميمنا رتويت ي حبايبي :$                                            POS        1
5229      د عبدالمنعم سعيد: إسرائيل تطالب العالم بالاعتراف بفاشيتها وعنصريتها قانون قسم الولاء ليهودية إسرائيل فاشية  NEG        1
5228      ماعنديش مشكلة نقبل او نرفض القرض على اسس اقتصادية،لكن انك تلوي الدين عشان تحلل و تحرم هو ده اللي ينرفز      NEG        1
                                                                                                                           

In [5]:
train_data[Tweets_Text_Col_Train] = train_data[Tweets_Text_Col_Train].apply(lambda x:   data_cleaning(x))

# Removing un-needed feilds
if Tweets_Ids_Col_Train in train_data.columns:
  del train_data[Tweets_Ids_Col_Train]
train_data.columns = [Tweets_Text_Col_Train, Tweets_Sentiment_Col_Train]

train_data[Tweets_Text_Col_Train].head(50)

0     الرياض امطار الرياض السعودية مطر الرياض تغرق ا...
1     من بين الاسماء محمود القرش، مصر الجديدة واخرين...
2     مقتل طفلة لبنانية في تبادل اطلاق نار بين علوين...
3        الروح و الجسد مصطفي محمود روح و جسد مصطف محمود
4     دعبد المنعم ابو الفتوح فى عيون العالم دعبد منع...
5     عندما يستمر القضاء في التحقيق في بلاغات عبثية ...
6     جدول اعادة اخر كلام الاثنين يونيو حلقة جمعة ال...
7     مرسي بيغير من باسم يوسف عشان باسم الناس بتضحك ...
8     اذا استمرت القيادة المصرية الحالية في نهجها فس...
9     مش حتقولي ازاي اشجع الكورة لجماهير مش حتقول از...
10    سامي الجابر يتحمل كل شيء لانه اعطي الصلاحية كا...
11    محمد بديع اطلب من ابي محمد حسني مبارك ان يرفع ...
12       بوستر فيلم الشتا االي فات وستر يلم شتا اال فات
13    تراجع تصنيف مصر لمركز الاخير عالميا فى مستوى ا...
14                                   حزب الكنبة حزب كنب
15    استمعت لعمال لا حق في انشاء نقابات مستقلة، ملي...
16    وحياة قلبي وافراحه️ الشباب الاهلي حيا قلب افرا...
17    مشاهد من مسرح حكم الفرد الجزء الاول مقال ا

In [6]:
Extra_Len = 6 
Max_Len = train_data[Tweets_Text_Col_Train].str.split().str.len().max() + Extra_Len
print(Max_Len)

#Spliting the Training data
Test_Size = 0
if Use_Train_Extended_Data :
  Test_Size = 0.001  
                    
else :
  Test_Size = 0.0005 
                    
Rand_Seed = 42 

train_set, evaluation_set = train_test_split( train_data, test_size= Test_Size, random_state= Rand_Seed)

print("Train set: ")
print(train_set[Tweets_Sentiment_Col_Train].value_counts())
print("---------------------------")
print ("Evaluation set: ")
print (evaluation_set[Tweets_Sentiment_Col_Train].value_counts())



74
Train set: 
Label
OBJ        5230
NEG        1317
NEUTRAL     653
POS         627
Name: count, dtype: int64
---------------------------
Evaluation set: 
Label
OBJ        3
NEUTRAL    1
Name: count, dtype: int64


In [7]:

Tweets_Ids_Col_Test = "Tweet_ID"
Tweets_Text_Col_Test = "Tweet"
Test_Data_File = "test_data.csv"

test_data = pd.read_csv(Test_Data_File, sep=",")
test_data.columns = [Tweets_Ids_Col_Test,Tweets_Text_Col_Test]

test_data[Tweets_Text_Col_Test] = test_data[Tweets_Text_Col_Test].apply(lambda x:   data_cleaning(x))
test_data[Tweets_Text_Col_Test].head(50)

0     سعوديون يطالبون الجيش المصري باختراع علاج لكور...
1     النضافة ،النظام،الراحة و الحرية الي بيحس بيها ...
2     عرض الاسبوع الف متابع وش اكثر شي يسهرك صيام ال...
3     خمس الاف الامارات الهلال النصر السعودية الكويت...
4     حمزاوي لابد من اقالة وزير الداخلية ومدير امن ب...
5             غلطت م الاول رنا سماحه غلط م اول رنا سماح
6     و انما دغل يعج بالنبت على اختلاف اشكاله والاخت...
7     ياباسم عصام سلطان بيسال عليك وبيقول انت فين من...
8     لتحميل كتاب تغريد في السعادة و التفاول و الامل...
9     غير صحيح وموقفي الرافض سجل في اكثر من مقال بجر...
10    يوسف الحسيني لحزب النورخليكم في الدعوة وابعدوا...
11    الدكتور كمال الهلباوي يدعم ابوالفتوح دكتور كما...
12    الراتب مايكفي الحاجة الاتحاد الهلال النصر الاه...
13    فلبيني اسلم قبل يوما في مكتب الدعوة ، وذهب الى...
14    تاجيل قضية اعادة اجراءات محاكمة متهما فى مذبحة...
15    معلوماتى انك لا تشرب دايات رغم معلوماتي انك لا...
16    الدكتورة ايمان جمعة الاستاذة بكلية الاعلام جام...
17    جمعية زمزم تكفل بولادة توايم كن سباقا لعلا

In [214]:
Model_Used = "UBC-NLP/MARBERT"

#Model_Used = "xlm-roberta-large"

#Model_Used = "asafaya/bert-base-arabic"  # Example
#Model_Used = "aubmindlab/bert-base-arabertv2"  # Example

Task_Name = "classification"

class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list
        
class BERTModelDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTModelDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
  
    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())
    
      encoded_review = self.tokenizer.encode_plus(
      text,
      max_length= self.max_len,
      add_special_tokens= True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      truncation='longest_first',
      return_attention_mask=True,
      return_tensors='pt'
    )
      input_ids = encoded_review['input_ids'].to(device)
      attention_mask = encoded_review['attention_mask'].to(device)

      return InputFeatures(input_ids=input_ids.flatten(), attention_mask=attention_mask.flatten(), label=self.label_map[self.target[item]])

In [215]:
def model_init():
  return AutoModelForSequenceClassification.from_pretrained(Model_Used, return_dict=True, num_labels=len(label_map))

def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=[1,2])
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


In [216]:


label_list = list(train_set[Tweets_Sentiment_Col_Train].unique())

print(label_list)
print(train_set[Tweets_Sentiment_Col_Train].value_counts())

data_set = Dataset( "KAUST", train_set, evaluation_set, label_list )

label_map = { v:index for index, v in enumerate(label_list) }
print(label_map)

train_dataset = BERTModelDataset(train_set[Tweets_Text_Col_Train].to_list(),
                                 train_set[Tweets_Sentiment_Col_Train].to_list(),Model_Used,Max_Len,label_map)

evaluation_dataset = BERTModelDataset(evaluation_set[Tweets_Text_Col_Train].to_list(),
                                      evaluation_set[Tweets_Sentiment_Col_Train].to_list(),Model_Used,Max_Len,label_map)



['OBJ', 'NEG', 'POS', 'NEUTRAL']
Label
OBJ        5230
NEG        1317
NEUTRAL     653
POS         627
Name: count, dtype: int64
{'OBJ': 0, 'NEG': 1, 'POS': 2, 'NEUTRAL': 3}


In [217]:
#define training arguments
training_args = TrainingArguments("./train")
#training_args.lr_scheduler_type = 'cosine'
training_args.lr_scheduler_type = 'linear'
training_args.evaluate_during_training = True
training_args.adam_epsilon =1e-8 

training_args.learning_rate = 1.8e-5 # use this with org data  
training_args.fp16 = True
#training_args.per_device_train_batch_size = 16 #64 
#training_args.per_device_eval_batch_size = 16 # 64 
training_args.per_device_train_batch_size = 32 
training_args.per_device_eval_batch_size =  32
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 10
#training_args.warmup_steps = 0 
training_args.warmup_steps = int(0.1 * (len(train_dataset) // training_args.per_device_train_batch_size) * training_args.num_train_epochs)
training_args.max_grad_norm = 1.0

#training_args.evaluation_strategy = EvaluationStrategy.EPOCH
training_args.evaluation_strategy = "steps"
training_args.eval_steps = 100  
training_args.logging_steps = 200
training_args.save_strategy = "steps"
training_args.save_steps = 500
training_args.early_stopping_patience = 3  # Stop if no improvement after 3 evaluations
training_args.load_best_model_at_end = True
#training_args.save_steps = 100000 
training_args.seed = 42 
training_args.disable_tqdm = False
training_args.optim = "adafactor"
training_args.weight_decay = 0.01

In [218]:
training_args.dataloader_pin_memory = False
gc.collect()
torch.cuda.empty_cache()
set_seed(Rand_Seed) 

trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset= evaluation_dataset,
    compute_metrics=compute_metrics
)

print(training_args.seed)

  state_dict = torch.load(resolved_archive_file, map_location="cpu")
Some weights of the model checkpoint at UBC-NLP/MARBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSeque

42


  self.scaler = ShardedGradScaler() if self.sharded_dpp else torch.cuda.amp.GradScaler()


In [219]:
print(Max_Len)
print(training_args.learning_rate)
print(training_args.adam_epsilon)
print(training_args.warmup_steps)
trainer.train()

74
1e-05
1e-08
244


  with autocast():
 16%|█▋        | 200/1220 [00:42<03:37,  4.69it/s]
 16%|█▋        | 200/1220 [00:42<03:37,  4.69it/s]

{'loss': 0.9485, 'learning_rate': 8.19672131147541e-06, 'epoch': 1.64}


 33%|███▎      | 400/1220 [01:25<02:51,  4.79it/s]
 33%|███▎      | 401/1220 [01:25<02:56,  4.65it/s]

{'loss': 0.6262, 'learning_rate': 8.401639344262295e-06, 'epoch': 3.28}


 49%|████▉     | 600/1220 [02:09<02:11,  4.71it/s]
 49%|████▉     | 601/1220 [02:09<02:13,  4.64it/s]

{'loss': 0.4229, 'learning_rate': 6.352459016393443e-06, 'epoch': 4.91}


 66%|██████▌   | 800/1220 [02:52<01:32,  4.54it/s]
 66%|██████▌   | 800/1220 [02:52<01:32,  4.54it/s]

{'loss': 0.2656, 'learning_rate': 4.30327868852459e-06, 'epoch': 6.56}


 82%|████████▏ | 1000/1220 [03:44<00:48,  4.56it/s]
 82%|████████▏ | 1000/1220 [03:44<00:48,  4.56it/s]

{'loss': 0.203, 'learning_rate': 2.254098360655738e-06, 'epoch': 8.2}


 98%|█████████▊| 1200/1220 [04:50<00:06,  3.10it/s]
 98%|█████████▊| 1200/1220 [04:50<00:06,  3.10it/s]

{'loss': 0.1583, 'learning_rate': 2.0491803278688524e-07, 'epoch': 9.83}


100%|██████████| 1220/1220 [04:56<00:00,  3.23it/s]
100%|██████████| 1220/1220 [04:56<00:00,  4.11it/s]

{'train_runtime': 296.9246, 'train_samples_per_second': 4.109, 'epoch': 10.0}





TrainOutput(global_step=1220, training_loss=0.4324196907340503, metrics={'train_runtime': 296.9246, 'train_samples_per_second': 4.109, 'epoch': 10.0})

In [220]:
def predict(text, tokenizer):
 
  encoded_review = tokenizer.encode_plus(
    text,
    max_length=Max_Len,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True, #True,
    truncation='longest_first',
    return_attention_mask=True,
    return_tensors='pt'
  )

  input_ids = encoded_review['input_ids'].to(device) 
  attention_mask = encoded_review['attention_mask'].to(device)
    

  output = trainer.model(input_ids, attention_mask)
  _, prediction = torch.max(output[0], dim=1)
  return prediction[0]


tokenizer = AutoTokenizer.from_pretrained(Model_Used)

prediction_list = []
i = 0
for tweet in test_data[Tweets_Text_Col_Test]:
    id = test_data[Tweets_Ids_Col_Test][i]
  
    pre = predict(tweet, tokenizer)
    pre_txt = label_list[pre]
   
    if pre_txt == 'POS':
        prediction_list.append('POS')  # Positive sentiment
    elif pre_txt == 'NEG':
        prediction_list.append('NEG')  # Negative sentiment
    elif pre_txt == 'OBJ':
        prediction_list.append('OBJ')  # Objective or factual content
    elif pre_txt == 'NEUTRAL':
        prediction_list.append('NEUTRAL')  # Neutral sentiment or ambiguous context
    
    i += 1



In [221]:
import os

print("Current working directory:", os.getcwd())

directory_path = r'C:\Users\marwan\Desktop\nlp'

if not os.path.exists(directory_path):
    os.makedirs(directory_path)

result_file = os.path.join(directory_path, 'sub_test12.csv')

results = pd.DataFrame({'Tweet_ID': test_data[Tweets_Ids_Col_Test].astype(str), 'Label': prediction_list},
                       columns=['Tweet_ID', 'Label'])
print(results)

results.to_csv(result_file, sep=",", index=False)

print(f"File saved successfully to {result_file}")


Current working directory: c:\
     Tweet_ID    Label
0        1000  NEUTRAL
1        1001  NEUTRAL
2        1002      OBJ
3        1003      OBJ
4        1004      OBJ
...       ...      ...
1995     2995      OBJ
1996     2996      OBJ
1997     2997      NEG
1998     2998      OBJ
1999     2999      OBJ

[2000 rows x 2 columns]
File saved successfully to C:\Users\marwan\Desktop\nlp\sub_test12.csv
