This notebook fine-tunes an [AraBERT variant](https://huggingface.co/aubmindlab/bert-large-arabertv02-twitter), pre-trained on dialectal Arabic tweets, for Arabic sentiment analysis.

The fine-tuned model achieved state-of-the-art performance and took first place in a university-wide Kaggle NLP competition.

![Kaggle competition leaderboard](leaderboard.png  "First place on a university-wide Kaggle competition")

### Install and import the necessary libraries

In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install farasapy
!pip install pyarabic
!pip install arabert

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from arabert.preprocess import ArabertPreprocessor
from torch.utils.data import Dataset, DataLoader 
from torch.utils.data import Dataset
import torch
import pandas as pd

### Select the device (GPU if available)

In [None]:
import os

# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous, which gives
# accurate stack traces when debugging.
# NOTE(review): this also slows GPU work down; consider dropping it for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Preparing the pre-trained model

In [None]:
# Hugging Face checkpoint shared by the preprocessor, tokenizer and classifier.
model_name = "aubmindlab/bert-large-arabertv02-twitter"

# Text preprocessor and tokenizer configured for the same checkpoint.
arabert_prep = ArabertPreprocessor(model_name=model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Three output classes, one per rating in Labels = {-1, 0, 1}.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = model.to(device)

In [None]:
class SimpleDataset(Dataset):
    """Index-aligned container of token ids, attention masks and optional labels.

    Each item is a dict with 'input_ids' and 'attention_mask', plus 'labels'
    when labels were supplied at construction time.
    """

    def __init__(self, input_ids, attention_masks, labels=None):
        """Store the three parallel sequences; ``labels`` may be None."""
        self.input_ids, self.attention_masks, self.labels = (
            input_ids,
            attention_masks,
            labels,
        )

    def __len__(self):
        """Number of examples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of model inputs."""
        item = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
        )
        if self.labels is None:
            # Unlabelled data: only the model inputs are returned.
            return item
        item['labels'] = self.labels[idx]
        return item

In [None]:
def prepare_dataset(df, tokenizer, max_len=64, include_labels=True):
    """Tokenize the reviews in ``df`` and wrap the tensors in a ``SimpleDataset``.

    Args:
        df: DataFrame with a 'review_description' text column and, when
            labels are wanted, a 'rating' column of integer class ids.
        tokenizer: Tokenizer exposing ``encode_plus`` that returns
            'input_ids' and 'attention_mask' tensors with a leading batch
            dimension of 1 (requested here via ``return_tensors='pt'``).
        max_len: Length every sequence is padded or truncated to.
        include_labels: Attach labels when True *and* ``df`` has a 'rating'
            column; otherwise the dataset is unlabelled.

    Returns:
        SimpleDataset holding the concatenated id and mask tensors (one row
        per review) and, if requested, a ``torch.long`` label tensor. An
        empty ``df`` yields an empty dataset instead of failing in
        ``torch.cat``.
    """
    # Plain column access avoids the per-row Series that iterrows() builds.
    texts = df['review_description'].tolist()
    with_labels = include_labels and 'rating' in df.columns

    if texts:
        encoded_rows = [
            tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            for text in texts
        ]
        # Each encoding has a leading batch dimension of 1; stack along it.
        input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
        attention_masks = torch.cat(
            [row['attention_mask'] for row in encoded_rows], dim=0
        )
    else:
        # torch.cat() rejects an empty list, so build zero-row tensors directly.
        input_ids = torch.empty((0, max_len), dtype=torch.long)
        attention_masks = torch.empty((0, max_len), dtype=torch.long)

    labels = None
    if with_labels:
        labels = torch.tensor(df['rating'].tolist(), dtype=torch.long)

    return SimpleDataset(input_ids, attention_masks, labels)

### Loading training dataset

In [None]:
# Load the training reviews; later cells read the 'review_description' and
# 'rating' columns of this frame.
# NOTE(review): reading .xlsx needs an Excel engine (e.g. openpyxl) that the
# install cell does not add — confirm it is available in the environment.
train_df = pd.read_excel("train.xlsx")

In [None]:
# It is required for the labels to start from 0, so the ratings are shifted
# by +1.  The untouched ratings are kept in 'rating_raw' and the shifted
# column is always derived from them, so re-running this cell cannot shift
# the labels a second time.
if 'rating_raw' not in train_df.columns:
    train_df['rating_raw'] = train_df['rating']
train_df['rating'] = train_df['rating_raw'] + 1

In [None]:
# Run every training review through `arabert_prep.preprocess`.
# NOTE(review): the column is overwritten in place, so re-running this cell
# preprocesses already-preprocessed text — confirm preprocess() is idempotent.
train_df['review_description'] = train_df['review_description'].apply(arabert_prep.preprocess)

In [None]:
# Tokenize the training reviews (prepare_dataset's default max_len=64) and
# attach the 'rating' column as labels.
train_dataset = prepare_dataset(train_df, tokenizer, include_labels=True)
# Shuffled mini-batches of 16 examples over the tokenized training set.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

### Finetuning the model

In [None]:
# Fine-tuning hyper-parameters handed to the Hugging Face Trainer.
training_args = TrainingArguments(
    # Where run outputs and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

In [None]:
# The Trainer batches the Dataset itself, so the tokenized training set is
# passed directly (it is the same object that `train_loader.dataset` exposes).
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

In [None]:
# Fine-tune `model` on the training set using the settings in `training_args`.
trainer.train()

### Loading test dataset

In [None]:
# Load the test reviews to be scored; the 'review_description' column is read
# by the following cells.
test_df = pd.read_csv("test.csv")

In [None]:
# Same preprocessing as the training reviews; the bound method is passed
# directly, so no wrapper lambda is needed.
test_df['review_description'] = test_df['review_description'].apply(arabert_prep.preprocess)

In [None]:
# Tokenize the test reviews without labels (include_labels=False).
test_dataset = prepare_dataset(test_df, tokenizer, include_labels=False)
# No shuffling, so predictions come back in the same order as the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=16)

### Rating prediction

In [None]:
def predict_sentiments(model, data_loader):
    """Return the predicted class index for every example in ``data_loader``.

    Args:
        model: ``torch.nn.Module`` whose forward accepts ``input_ids`` and an
            ``attention_mask`` keyword and returns an object with a
            ``.logits`` tensor whose dimension 1 indexes the classes.
        data_loader: Iterable of dict batches holding 'input_ids' and
            'attention_mask' tensors.

    Returns:
        list[int]: Predicted class indices in the model's label space, in
        iteration order (empty when ``data_loader`` yields no batches).

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()

    # Send batches to wherever the model's weights live instead of relying on
    # a notebook-level `device` variable being defined and in sync with it.
    first_param = next(model.parameters(), None)
    target_device = (
        first_param.device if first_param is not None else torch.device("cpu")
    )

    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            # Index of the highest-scoring class for each example.
            predictions.extend(logits.argmax(dim=1).tolist())

    return predictions

In [None]:
# Run inference over the test batches; yields one predicted class index per
# test example, in loader order.
test_predictions = predict_sentiments(model, test_loader)

In [None]:
# The classifier was trained on ratings shifted by +1 (class ids {0, 1, 2}),
# so shift the predicted class ids back to the original {-1, 0, 1} scale
# before storing them as ratings.
test_df['rating'] = [class_id - 1 for class_id in test_predictions]

In [None]:
# Quick visual check of the first rows, including the new 'rating' column.
test_df.head()

In [None]:
# Write the test frame with its predicted ratings to CSV, omitting the
# DataFrame index (index=False).
test_df.to_csv('predicted_arabert_twitter.csv', index=False)