In [None]:
import pandas as pd

In [None]:
# Load the three raw dialect corpora (paths are relative to the Colab /content folder).
data1, data2, data3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/ArSarcasm.csv',
        '/content/DL_cleaned_train.csv',
        '/content/Arabic_dialect.csv',
    )
)

In [None]:
# Create one working DataFrame per loaded dataset.
df1, df2, df3 = (pd.DataFrame(raw) for raw in (data1, data2, data3))

# 1- Get insights into the data
## - get info about each dataset
## - count the null values
## - count the number of instances per label in each dataframe

In [None]:
# Summarize df1 (columns, non-null counts, dtypes). info is a method and must be called.
df1.info()

In [None]:
# Summarize df2 (columns, non-null counts, dtypes). info is a method and must be called.
df2.info()

In [None]:
# Summarize df3 (columns, non-null counts, dtypes). info is a method and must be called.
df3.info()

In [None]:
# Number of missing values in each column of df1.
df1.isna().sum()

In [None]:
# Number of missing values in each column of df2.
df2.isna().sum()

In [None]:
# Number of missing values in each column of df3.
df3.isna().sum()

In [None]:
# Number of rows per label in df1's 'dialect' column.
df1_label_counts = df1['dialect'].value_counts()
df1_label_counts

In [None]:
# Number of rows per label in df2's 'Dialect' column.
df2_label_counts = df2['Dialect'].value_counts()
df2_label_counts


In [None]:
# Number of rows per label in df3's 'result' column.
df3_label_counts = df3['result'].value_counts()
df3_label_counts

In [None]:
# Keep only the MSA, Gulf and Levant rows of df1; the Egyptian and Maghrebi
# classes are not needed from df1 because df2 already has enough of them.
# .copy() makes each filtered frame independent of its parent, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df1 = df1[df1['dialect'].isin(['msa', 'gulf', 'levant'])].copy()
# Keep only the Gulf ('G') rows of df3 to balance the number of instances across labels.
df3 = df3[df3['result'].isin(['G'])].copy()

# 2- Preprocessing the data
## - drop missing values
## - lowercase the label names in all dataframes
## - convert labels to numbers
## - in each dataframe, keep only the text and dialect id columns
## - combine the 3 dataframes
## - clean the text column
## - delete short texts





In [None]:
# Drop every df3 row that has a missing value in any column.
df3 = df3.dropna(how='any')

In [None]:
# Lowercase the dialect labels of all three DataFrames so they share one spelling.
for frame, label_col in ((df1, 'dialect'), (df2, 'Dialect'), (df3, 'result')):
    frame[label_col] = frame[label_col].str.lower()

In [None]:
# Map every label spelling to a numeric class id. Each tuple lists the
# spellings that share one id; the tuple's position is the id (0..5).
dialect_map = {
    alias: class_id
    for class_id, aliases in enumerate((
        ('msa',),
        ('eg',),
        ('gulf', 'g'),
        ('levant', 'lb'),
        ('magreb', 'ma', 'ly'),
        ('sd',),
    ))
    for alias in aliases
}



In [None]:
# Add a numeric 'dialect_id' column to each frame by looking its label column up in dialect_map.
for frame, label_col in ((df1, 'dialect'), (df2, 'Dialect'), (df3, 'result')):
    frame['dialect_id'] = frame[label_col].map(dialect_map)

In [None]:
# Drop only the rows whose label could not be mapped to a dialect id, then cast
# the ids to integers: Series.map() yields a float64 column as soon as one label
# is unmapped (NaN), and the classification loss needs integer class indices.
df1 = df1.dropna(subset=['dialect_id']).astype({'dialect_id': 'int64'})
df2 = df2.dropna(subset=['dialect_id']).astype({'dialect_id': 'int64'})
df3 = df3.dropna(subset=['dialect_id']).astype({'dialect_id': 'int64'})

In [None]:
# From each frame keep just its text column and the numeric dialect id.
df1_final = df1.loc[:, ['tweet', 'dialect_id']]
df2_final = df2.loc[:, ['Text', 'dialect_id']]
df3_final = df3.loc[:, ['Tweet', 'dialect_id']]

In [None]:
# Give every frame the same text column name ('Text') so they can be stacked.
df1_final = df1_final.rename({'tweet': 'Text'}, axis='columns')
df3_final = df3_final.rename({'Tweet': 'Text'}, axis='columns')

# Stack the three frames into one, renumbering the rows from 0.
frames_to_stack = [df1_final, df2_final, df3_final]
combined_df = pd.concat(frames_to_stack, ignore_index=True)

# Sanity check: first rows and the number of rows per dialect id.
print(combined_df.head())
print(combined_df['dialect_id'].value_counts())


In [None]:
# Work on an independent copy named final_df, so the text cleaning applied in
# later cells does not silently overwrite combined_df as well.
final_df = combined_df.copy()

In [None]:
# install necessary libraries
!pip install farasapy
!pip install arabert
!pip install transformers datasets peft accelerate

In [None]:
import re
# Cleaning function for the Text column.
def manual_clean(text):
    """Reduce a tweet to whitespace-normalized Arabic-script text.

    The input is converted with str() first, so non-string values are accepted.
    In order, the function removes @mentions, http links, hashtags, Latin
    letters, digits and every character outside U+0600-U+06FF that is not
    whitespace, then collapses whitespace runs and strips both ends.

    Returns:
        The cleaned string (possibly empty).
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'@[\w_]+', ' '),             # mentions
        (r'http\S+', ' '),             # links
        (r'#\S+', ''),                 # hashtags
        (r'[a-zA-Z]+', ' '),           # English letters
        (r'\d+', ' '),                 # digits
        (r'[^\u0600-\u06FF\s]', ' '),  # anything that is not Arabic or whitespace
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

In [None]:
# Run manual_clean over every entry of the Text column.
final_df['Text'] = final_df['Text'].map(manual_clean)

In [None]:
from arabert.preprocess import ArabertPreprocessor

# Create the AraBERT preprocessor for the same checkpoint that is fine-tuned
# below ("bert-base-arabertv02"). The "bert-base-arabertv2" preset is the
# Farasa pre-segmented variant; its segmented output does not match what the
# v02 tokenizer and model were trained on.
arabert_prep = ArabertPreprocessor(model_name="aubmindlab/bert-base-arabertv02")

In [None]:
# Run the AraBERT preprocessor over every entry of the Text column.
final_df['Text'] = final_df['Text'].map(arabert_prep.preprocess)

In [None]:
# Keep only the rows whose cleaned text is longer than 5 characters.
is_long_enough = final_df['Text'].str.len().gt(5)
final_df = final_df[is_long_enough]

In [None]:
from datasets import Dataset

# Convert the pandas DataFrame (final_df) into a Hugging Face Dataset.
# The columns are renamed to "text" and "label" (numeric label) to match what
# the Hugging Face Trainer expects.
# preserve_index=False: after the row filtering above the index is no longer a
# plain RangeIndex, and it would otherwise be stored as an extra
# "__index_level_0__" column in the dataset.
dataset = Dataset.from_pandas(
    final_df[['Text', 'dialect_id']].rename(columns={
        'Text': 'text',
        'dialect_id': 'label'
    }),
    preserve_index=False,
)


# 3- Training

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to fine-tune, as published on the Hugging Face Hub.
model_name = "aubmindlab/bert-base-arabertv02"

# Tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Pre-trained encoder with a sequence-classification head of 6 output labels.
base_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
    num_labels=6,
)


In [None]:
# Tokenization
def tokenize_function(example):
    """Tokenize a batch of examples, truncating each text to at most 128 tokens.

    No padding is added here. The Trainer's DataCollatorWithPadding pads every
    batch to the length of its longest sequence, which avoids storing and
    processing 128 tokens for every short tweet.

    Args:
        example: batch from Dataset.map(batched=True); its "text" entry holds the texts.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["text"], truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
# Split the dataset into train (90%) and eval (10%) sets.
# A fixed seed makes the split, and therefore the reported accuracy,
# reproducible across runs.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]


In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA settings for parameter-efficient fine-tuning of the sequence classifier:
# rank-8 adapters on the modules named "query" and "value".
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the LoRA adapters and report how many parameters are trainable.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


In [None]:
from sklearn.metrics import accuracy_score

# Metric function handed to the Trainer.
def compute_metrics(pred):
    """Return the accuracy of the predictions as {"accuracy": value}.

    Args:
        pred: object exposing `predictions` (per-class scores; argmax over the
            last axis gives the predicted class) and `label_ids` (true classes).
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_ids)}


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch


# Turn off Weights & Biases logging through its environment variable.
import os
os.environ.update({"WANDB_DISABLED": "true"})

from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Compute balanced class weights from the training labels.
labels = np.asarray(train_dataset["label"], dtype=np.int64)
present_classes = np.unique(labels)
present_weights = compute_class_weight(class_weight="balanced", classes=present_classes, y=labels)

# CrossEntropyLoss indexes the weight vector by class id and needs exactly one
# entry per output class. Build a full-length vector so the loss still works if
# a class is absent from this training split; absent classes keep a neutral
# weight of 1.0, which never contributes because no target has that class.
class_weights = np.ones(base_model.config.num_labels, dtype=np.float64)
class_weights[present_classes] = present_weights
weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Replacement for Trainer.compute_loss that applies the class weights.
def custom_compute_loss(self, model, inputs, return_outputs=False, **kwargs):
    """Compute cross-entropy loss weighted by the module-level `weights_tensor`.

    The "labels" entry is popped from `inputs` (so `inputs` is modified) and
    the remaining entries are passed to the model as keyword arguments.

    Args:
        self: the Trainer instance (unused).
        model: model to call; its output must expose `.logits`.
        inputs: batch dict containing "labels" plus the model inputs.
        return_outputs: when true, return `(loss, outputs)` instead of `loss`.
        **kwargs: accepted and ignored.
    """
    targets = inputs.pop("labels")
    outputs = model(**inputs)
    # Move the weights to wherever the logits live before building the loss.
    weighted_ce = torch.nn.CrossEntropyLoss(weight=weights_tensor.to(outputs.logits.device))
    loss = weighted_ce(outputs.logits, targets)
    if return_outputs:
        return (loss, outputs)
    return loss

# Use a Trainer subclass for the class-weighted loss instead of assigning to
# Trainer.compute_loss: patching the class would change the loss of every
# Trainer created afterwards in this session, not just this one.
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is computed by custom_compute_loss (class-weighted cross-entropy)."""

    compute_loss = custom_compute_loss

# define training arguments
training_args = TrainingArguments(
    output_dir="./arabert-dialect-lora",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=2,
    eval_strategy="epoch",       # evaluate on eval_dataset after every epoch
    save_strategy="no",          # no intermediate checkpoints are written
    load_best_model_at_end=False,
    logging_steps=100,
    report_to="none"             # no external experiment tracker
)

# padding collator: pads each batch with the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# create the Trainer with the weighted loss
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# train
trainer.train()


In [None]:
# Write the model and the tokenizer into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./arabert-dialect-lora")


In [None]:
# Zip the saved model directory recursively into arabert-dialect-lora.zip.
# The directory is passed by its relative name, so the paths stored in the
# archive start with "arabert-dialect-lora/".
!zip -r arabert-dialect-lora.zip arabert-dialect-lora


# 4- Load the model and evaluate it

In [None]:
# load the fine-tuned model and unzipp it
!unzip /content/arabert-dialect-lora.zip -d arabert-dialect-lora


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig

# Directory that holds the saved LoRA adapter and tokenizer files.
peft_model_path = "/content/arabert-dialect-lora"

# Shared option: read everything from local files only.
local_only = {"local_files_only": True}

# Adapter config; it provides the name/path of the base model to load.
config = PeftConfig.from_pretrained(peft_model_path, **local_only)

# Base classifier with 6 output labels, then the LoRA adapter on top of it.
base_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=6,
)
model = PeftModel.from_pretrained(base_model, peft_model_path, **local_only)

# Tokenizer stored in the same directory.
tokenizer = AutoTokenizer.from_pretrained(peft_model_path, **local_only)


In [None]:
# Turn off Weights & Biases logging through its environment variable.
import os
os.environ.update({"WANDB_DISABLED": "true"})

from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# create a trainer for evaluation

from transformers import TrainingArguments

# Evaluation-only settings: output folder, eval batch size, no external reporting.
training_args = TrainingArguments(output_dir="./results", per_device_eval_batch_size=8, report_to="none")

# Everything the evaluation Trainer needs, collected in one place.
evaluation_setup = dict(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**evaluation_setup)

# Run evaluation on eval_dataset and keep the resulting metrics dict.
metrics = trainer.evaluate()

In [None]:
# Show the accuracy measured on the evaluation set.
eval_accuracy = metrics["eval_accuracy"]
print(eval_accuracy)