<a href="https://colab.research.google.com/github/NoufAlshenaifi/Rasid-at-StanceEval2024/blob/main/AraBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score
import torch

In [2]:
# Labelled training data (V2 file; the load of "Mawqif_AllTargets_Train.csv" is disabled).
TRAIN_CSV = "V2_Mawqif_AllTargets_Train.csv"
data = pd.read_csv(TRAIN_CSV)

In [3]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preview the first five rows of the full labelled frame.
data.head(n=5)

Unnamed: 0,ID,text,target,stance
0,1,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترن...,Women empowerment,Against
1,3,روح حلل محد يم تطعيم كورونا شف الحرم البارح م...,Covid Vaccine,Nothing
2,4,هذا ما يُعرّف بـ'فوبيا المرأة المُتمكنة' آفة ف...,Women empowerment,Favor
3,6,#LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في الم...,Digital Transformation,Favor
4,7,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,Digital Transformation,Favor


In [5]:
# Load the tokenizer that belongs to the AraBERT checkpoint fine-tuned later in the notebook.
# NOTE(review): raw tweet text is tokenized as-is; confirm against the checkpoint's model card
# whether a dedicated preprocessing step is expected first.
ARABERT_CHECKPOINT = 'aubmindlab/bert-base-arabert'
tokenizer = AutoTokenizer.from_pretrained(ARABERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/717k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# Tokenize the data
def tokenize_function(examples, max_length=128):
    """Tokenize the ``text`` column of a frame with the module-level ``tokenizer``.

    Args:
        examples: DataFrame-like object whose ``'text'`` column supports ``.tolist()``.
        max_length: Length every sequence is padded or truncated to. Defaults to 128,
            the value that used to be hard-coded, so existing calls behave the same.

    Returns:
        Whatever ``tokenizer(...)`` returns for the list of texts.
    """
    texts = examples['text'].tolist()
    # padding="max_length" + truncation=True gives every example the same length.
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)


In [7]:
# Encode both splits with the same tokenizer settings (train first, then test).
train_encodings, test_encodings = (
    tokenize_function(split) for split in (train_data, test_data)
)

In [8]:
# Inspect the raw label column before it is converted to integer ids.
stance_column = train_data['stance']
print(stance_column.dtype)
print(stance_column.unique())

object
['Favor' 'Against' 'Nothing']


In [9]:
label_mapping = {'Favor': 0, 'Against': 1, 'Nothing': 2}


def encode_stance(stance):
    """Convert a Series of stance names into integer class ids.

    Values that already are valid class ids are kept as they are, so running
    this cell a second time no longer turns every label into NaN.

    Args:
        stance: pandas Series holding stance names and/or class ids.

    Returns:
        An int64 Series of class ids with the same index as ``stance``.

    Raises:
        ValueError: if a value is neither a key nor a value of ``label_mapping``.
    """
    class_ids = list(label_mapping.values())
    already_encoded = stance.isin(class_ids)
    # Names go through the mapping; entries that were ids to begin with are restored.
    encoded = stance.map(label_mapping).where(~already_encoded, stance)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail here with the offending values rather than passing NaN labels on.
        raise ValueError(f"Unmapped stance values: {stance[unmapped].unique().tolist()}")
    return encoded.astype('int64')


# .assign builds new frames instead of writing into the frames produced by the split.
train_data = train_data.assign(stance=encode_stance(train_data['stance']))
test_data = test_data.assign(stance=encode_stance(test_data['stance']))

In [10]:
# Wrap the integer class ids of each split in a tensor for the dataset objects.
train_labels = torch.tensor(train_data['stance'].to_numpy())
test_labels = torch.tensor(test_data['stance'].to_numpy())

In [11]:
# Dataset wrapper pairing tokenizer output with labels
class ArabicStanceDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one encoded example plus its label.

    Args:
        encodings: Mapping from field name to a per-example indexable collection.
        labels: Indexable collection of labels; its length is the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the fields of example ``idx`` as tensors, plus its ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is passed through exactly as stored.
        sample['labels'] = self.labels[idx]
        return sample

In [12]:
# Pair each split's encodings with its label tensor.
train_dataset = ArabicStanceDataset(encodings=train_encodings, labels=train_labels)
test_dataset = ArabicStanceDataset(encodings=test_encodings, labels=test_labels)

In [13]:
# AraBERT checkpoint with a sequence-classification head sized for the three stance labels.
model = AutoModelForSequenceClassification.from_pretrained(
    'aubmindlab/bert-base-arabert',
    num_labels=3,
)

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [14]:
# Training configuration. Only the options below are set explicitly; the eval batch size,
# warmup steps, weight decay and logging steps are not passed, so library defaults apply.
training_args = TrainingArguments(
    output_dir='./results',          # where Trainer writes its output
    logging_dir='./logs',            # where logs are stored
    num_train_epochs=5,              # passes over the training set
    per_device_train_batch_size=8,   # training batch size per device
)


In [15]:
# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and ``predictions``
            (scores whose last axis is reduced with ``argmax``).

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted, average='macro'),
    }

In [16]:
# Wire the model, training configuration, datasets and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,   # the held-out split doubles as the evaluation set
)

In [17]:
# Sanity check: list the distinct label values in the training split before training.
# If these are not integers, convert the `stance` column first.
unique_train_labels = train_data['stance'].unique()
print("Unique labels in training data:", unique_train_labels)


Unique labels in training data: [0 1 2]


In [18]:
# Train the model
# Runs fine-tuning with the Trainer configured earlier. The call is the last expression
# of the cell, so its return value is what gets displayed as the cell result.
trainer.train()

Step,Training Loss
500,0.6971
1000,0.4289
1500,0.177


TrainOutput(global_step=1755, training_loss=0.3823511216035935, metrics={'train_runtime': 404.9938, 'train_samples_per_second': 34.581, 'train_steps_per_second': 4.333, 'total_flos': 921225853820160.0, 'train_loss': 0.3823511216035935, 'epoch': 5.0})

In [19]:
# Score the fine-tuned model on both splits; comparing the two shows how far the
# training-set score sits above the held-out score.
train_results = trainer.evaluate(eval_dataset=train_dataset)
test_results = trainer.evaluate(eval_dataset=test_dataset)

train_macro_f1 = train_results['eval_f1']
test_macro_f1 = test_results['eval_f1']
print("Train Macro-F1 Score:", train_macro_f1)
print("Test Macro-F1 Score:", test_macro_f1)

Train Macro-F1 Score: 0.9899153690295478
Test Macro-F1 Score: 0.6031946982773208


In [None]:
# Macro-F1 scores noted from a different run (they do not match the evaluation output above):
#   Train Macro-F1 Score: 0.9455224188694414
#   Test Macro-F1 Score: 0.570815064823822

In [20]:
# Read the blind test file that predictions are generated for.
BLIND_TEST_CSV = "Mawqif_AllTargets_Blind Test.csv"
blind_data = pd.read_csv(BLIND_TEST_CSV)


In [21]:
# The blind test frame is expected to carry a 'text' column.
# Reuse the helper applied to the train/test splits so the settings are the same
# (padding to max_length, truncation, max_length=128).
blind_encodings = tokenize_function(blind_data)


In [22]:
class BlindDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output only; no labels are attached.

    Args:
        encodings: Mapping from field name to a per-example indexable collection.
            It must contain ``'input_ids'``, which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Return the number of entries under ``'input_ids'``."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return every field of example ``idx`` converted to a tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

In [23]:
# Wrap the blind-test encodings so the Trainer can iterate over them.
blind_dataset = BlindDataset(encodings=blind_encodings)

In [24]:
# Run the fine-tuned model over the unlabeled blind set.
blind_predictions = trainer.predict(test_dataset=blind_dataset)

In [25]:
# Turn the raw prediction scores into probabilities, then pick the most likely class per row.
logits = torch.tensor(blind_predictions.predictions)
probabilities = logits.softmax(dim=-1)
predicted_labels = torch.argmax(probabilities, dim=1)

In [26]:
# Invert the name -> id mapping and translate each predicted id back to its stance name.
label_mapping_inverse = dict(zip(label_mapping.values(), label_mapping.keys()))
predicted_stances = [label_mapping_inverse[class_id] for class_id in predicted_labels.tolist()]


In [27]:
# Attach the predictions to the blind-test frame and print them next to the text.
blind_data = blind_data.assign(predicted_stance=predicted_stances)
preview_columns = ['text', 'predicted_stance']
print(blind_data[preview_columns])

                                                  text predicted_stance
0    الوضع صار بالسعوديه يفشل لا عاد فيه شركات صاحي...          Against
1       الفساد الإداري انهك البلاد ومازلنا نعتمد عل...            Favor
2    -البي بي سي ١٠ دول اوروبية توقف تطعيم اوكسفورد...          Against
3    منصة مدرستي بتسرع عملية التحول للتعليم الالكتر...            Favor
4                         انا مع تمكين المرأة اصلا URL            Favor
..                                                 ...              ...
614  التحول الإلكتروني دا معاه مخاطر إلكترونية كبير...          Against
615       ريم النجم:  تطعيم كورونا راح يحولكم زومبي !           Against
616  البيانات هي العنصر الرئيسي و المهم جداً في رحل...            Favor
617   تطعيم انفلونزا مناعة مؤقته تطعيم كورونا مناعة...            Favor
618  وطن يضع قادته صحة جميع من على ارضه أولوية فوق ...            Favor

[619 rows x 2 columns]


In [28]:
# Write the blind-test rows with their predicted stance; the row index is left out.
PREDICTIONS_CSV = 'Mawqif_AllTargets_Blind_Test_Predictions.csv'
blind_data.to_csv(PREDICTIONS_CSV, index=False)