# Fine tuning Arabic Sentiment model

In [1]:
!nvidia-smi

Sun Dec 12 21:06:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers



Read the data that will be used in fine-tuning the model </br>
Data Source : https://paperswithcode.com/dataset/astd

In [3]:
import pandas as pd
df = pd.read_csv('../input/tweets/Tweets.txt', sep='\t',header=None,names=["text",'label'])


In [4]:
df.head()

Unnamed: 0,text,label
0,بعد استقالة رئيس #المحكمة_الدستورية ننتظر استق...,OBJ
1,أهنئ الدكتور أحمد جمال الدين، القيادي بحزب مصر...,POS
2,البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام ال...,NEG
3,#الحرية_والعدالة | شاهد الآن: #ليلة_الاتحادية ...,OBJ
4,الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقول...,NEUTRAL


In [5]:
df.dropna(inplace=True)
df.isna().sum()

text     0
label    0
dtype: int64

In [6]:
df.label.value_counts()


OBJ        6470
NEG        1642
NEUTRAL     805
POS         777
Name: label, dtype: int64

Convert the labels to number to be used in fine-tuning

In [7]:
def mapping(label):
  if label == 'OBJ' or label == 'NEUTRAL' :
    return 2
  elif label == 'POS':
    return 0
  else :
    return 1

In [8]:
df['label'] = df['label'].map(lambda x: mapping(x))
df.head()

Unnamed: 0,text,label
0,بعد استقالة رئيس #المحكمة_الدستورية ننتظر استق...,2
1,أهنئ الدكتور أحمد جمال الدين، القيادي بحزب مصر...,0
2,البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام ال...,1
3,#الحرية_والعدالة | شاهد الآن: #ليلة_الاتحادية ...,2
4,الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقول...,2


In [9]:
X = list(df["text"])
y = list(df["label"])

In [10]:
pip install transformers[sentencepiece]


Note: you may need to restart the kernel to use updated packages.


**Load Model Tokenizer** </br>
Model source : https://huggingface.co/CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment

In [11]:
model_checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment" 
from transformers import AutoTokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/297k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Split the dataset into train , test , evalution 

In [12]:
from sklearn.model_selection import train_test_split

X_traindata, X_test, y_traindata, y_test = train_test_split(X, y, test_size=0.3,stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_traindata, y_traindata, test_size=0.2,stratify=y_traindata)

X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)


Create function to convert pandas dataframe to torch dataset

In [13]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [14]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
test_dataset = Dataset(X_test_tokenized, y_test)

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer ,EarlyStoppingCallback
args = TrainingArguments(
    output_dir="output/",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)


In [16]:
del df, X_train_tokenized , X_val_tokenized , X_train, X_val, y_train, y_val , X, y ,tokenizer

In [17]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

# Model Fine-tuning

In [18]:
model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment")
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,

) 
trainer.train() 
trainer.evaluate()

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 5428
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2037
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss
500,0.568,0.539523
1000,0.3878,0.818674
1500,0.2515,1.133474
2000,0.1517,1.043453


***** Running Evaluation *****
  Num examples = 1357
  Batch size = 8
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1357
  Batch size = 8
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1357
  Batch size = 8
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1357
  Batch size = 8
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget

{'eval_loss': 0.5395225882530212,
 'eval_runtime': 25.1419,
 'eval_samples_per_second': 53.974,
 'eval_steps_per_second': 6.762,
 'epoch': 3.0}

# Predict labels for test dataset

In [19]:
raw_pred, _, _ = trainer.predict(test_dataset)


***** Running Prediction *****
  Num examples = 2909
  Batch size = 8


In [20]:
import numpy as np
y_pred = np.argmax(raw_pred, axis=1)
y_pred

array([2, 2, 0, ..., 2, 2, 2])

In [21]:
def mappingtolabel(number):
  if number == 2 :
    return 'neutral'
  elif number == 1:
    return 'negative'
  else :
    return 'positive'

In [22]:
prediction_df = pd.DataFrame(list(zip(X_test,y_pred)), columns=['text','pred'])
prediction_df['pred'] =  prediction_df['pred'].map(lambda x: mappingtolabel(x))
prediction_df.head(10)

Unnamed: 0,text,pred
0,: د #أيمن_نور لـ #قناة_الشرق: رفضت لقاء #السيس...,neutral
1,كل عام وكل كادح في كل مكان في العالم بالف خير ...,neutral
2,انتهيت من أداء العمرة فإنه منظر عظيم أن تجد ال...,positive
3,"تأجيل محاكمة 77 من رافضي الانقلاب فى ""أحداث رم...",neutral
4,جانب من تجمع مسيرة #شبرا و #المطرية اليوم ضد ا...,neutral
5,"آخر كلام | أبو الفتوح: ""يجب أن يعلم أي نظام قا...",neutral
6,ألا يعلم الوزير المتدين أن الكذب حرام، وأن الل...,negative
7,شايفين العيب و الغلط ومبنسالش حتى ليه وكل واحد...,neutral
8,فوثق اللهم رابطتها وأدم ودها وأهدها سبلها وأمﻷ...,positive
9,الحل هوتوحيد المطالب إقالة النائب العام وتطهير...,neutral
