In [None]:
!pip install -q transformers -U evaluate arabert #farasapy pyarabic emoji pystemmer optuna==2.3.0
! rm -rf BERT_hard
! mkdir -p BERT_hard

In [None]:
from arabert import preprocess 

In [None]:
import warnings, pandas as pd
warnings.simplefilter(action='ignore', category=FutureWarning)

# Read the raw HARD reviews (tab-separated, UTF-16, header on the first row) and discard
# the metadata columns that the rest of the notebook never touches.
# Alternative input kept for reference: "clean-HARD.csv" (comma-separated), which would need
# its 'Unnamed: 0' column dropped and 'sentences' renamed to 'review'.
UNUSED_COLUMNS = ['nights', 'room type', 'user type', 'Hotel name', 'no']
data = pd.read_csv(
    'HARD-reviews.tsv',
    sep='\t',
    header=0,
    encoding='utf-16',
).drop(columns=UNUSED_COLUMNS)
data.head(5)


In [None]:
import re
def clean(text):
    """Remove ASCII letters and '?' from ``text``, then trim surrounding whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; every other character (digits, inner whitespace, ...)
        is left untouched.
    """
    without_latin = re.sub(r'[a-zA-Z?]', '', text)
    return without_latin.strip()

# Cleaned copy of every review, stored next to the raw text.
data['sentences'] = data['review'].apply(clean)
data.head(3)

In [None]:
# Class balance check: number of rows per distinct rating value.
data['rating'].value_counts()

In [None]:
# Keep only the cleaned text and the target. The explicit copy makes this an independent
# frame, so adding the 'process_review' column later does not raise
# SettingWithCopyWarning or risk writing into a view of the wider frame.
data = data[['sentences', 'rating']].copy()
data

In [None]:
from arabert.preprocess import ArabertPreprocessor

# Model identifier handed to ArabertPreprocessor; presumably it selects the preprocessing
# rules matching that checkpoint — TODO confirm against the arabert documentation.
model_name="bert-base-arabertv2"
# Single shared preprocessor instance, reused by arabertPreprocessor() for every row.
arabert_prep = ArabertPreprocessor(model_name=model_name)


def arabertPreprocessor(text):
    """Run ``text`` through the shared ``arabert_prep`` instance and return its output."""
    return arabert_prep.preprocess(text)

# Preprocess each cleaned sentence; the result is the text that gets tokenized later.
data['process_review'] = data['sentences'].apply(arabertPreprocessor)
data

In [None]:
# (any nulls present?, how many) for the preprocessed text column.
missing_reviews = data['process_review'].isnull()
missing_reviews.values.any(), missing_reviews.sum()

In [None]:
# Drop rows lacking either the target or the preprocessed text. The explicit copy makes the
# result an independent frame, so the later `data['label'] = ...` assignment neither raises
# SettingWithCopyWarning nor risks writing into a view of the unfiltered frame.
data = data.dropna(subset=['rating', 'process_review']).copy()

In [None]:
# Same null check as before the dropna, repeated on the filtered frame.
remaining_missing = data['process_review'].isnull()
remaining_missing.values.any(), remaining_missing.sum()

In [None]:
# Encode ratings as integer codes following the sorted order of the distinct rating values.
rating_categories = pd.Categorical(data['rating'], ordered=True)
data['label'] = rating_categories.codes
data['label'].unique()

In [None]:
# One row per (rating, label) pair. The counts themselves are not used afterwards; the
# grouping only serves to enumerate the distinct pairs as the index.
pair_counts = data.groupby(['rating', 'label']).count()
mapLabels = pair_counts.drop(columns=['process_review'])

# Keys of this dict are the (rating, label) index tuples.
label2Index = mapLabels.to_dict(orient='index')

print(f"label2Index :{label2Index}")
print(type(label2Index))
#print (f"index2Label :{index2Label}")

In [None]:
index2label = {}

# Each key of label2Index is a (rating, label) tuple; turn it into label -> rating.
for rating_value, label_code in label2Index:
    print(f"{label_code} -> {rating_value}")
    index2label[label_code] = rating_value

In [None]:
# Rebuild label2Index as the inverse of index2label (rating -> label code).
label2Index = dict(zip(index2label.values(), index2label.keys()))

print(f'label2Index: {label2Index}')
print(f'index2label: {index2label}')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification ,BertModel
# Checkpoint shared by tokenizer and classifier (alternative noted by the author: UBC-NLP/MARBERT).
checkpoint = 'aubmindlab/bert-base-arabertv2'
# Size the classification head from the encoded labels rather than a hard-coded 4, so the
# head always matches the label codes produced from the ratings present in the data.
num_labels = int(data['label'].nunique())
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

In [None]:
# Display the model's repr as the cell output.
model

In [None]:
X = list(data["process_review"])
y = list(data["label"])
# Fixed seed so the stratified 90/10 split (and therefore the reported validation
# metrics) is the same on every Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)
# Truncate at 512 tokens; each split is tokenized in one call with padding enabled.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over pre-tokenized encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence; it must
            contain ``"input_ids"``, which defines the dataset length.
        labels: Optional per-example labels (list, NumPy array or tensor).
            Pass ``None`` for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None instead of relying on truthiness: `if labels:` raises
        # ValueError for multi-element arrays/tensors.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized splits and their labels for the Trainer.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [None]:
import evaluate
import numpy as np

def compute_metrics(eval_preds):
    """Compute classification accuracy for the Trainer's evaluation loop.

    The accuracy is computed directly with NumPy rather than calling
    ``evaluate.load("accuracy")`` on every evaluation, which re-resolved the
    metric each time and needs hub/cache access. The returned dict has the
    same single ``"accuracy"`` key as before.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

In [None]:
# Define Trainer
import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases and the
# old spelling is deprecated. The install cell upgrades transformers, so pick whichever
# keyword the installed version actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="BERT_hard",
    num_train_epochs=3,
    logging_steps=5000,
    save_steps=5000,
    save_total_limit=3,
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    # Evaluate on the validation set at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics)

In [None]:
# Run fine-tuning; whatever trainer.train() returns is kept in `history`.
history = trainer.train()

In [None]:
eval_results = trainer.evaluate()

# Build the one-row summary directly from the metrics dict: DataFrame.append was removed
# in pandas 2.0, so the previous `pd.DataFrame().append(...)` raises AttributeError there.
eval_ = pd.DataFrame([eval_results])
eval_ = eval_.rename(columns={"eval_loss": "Evaluation Loss", "eval_accuracy": "Evaluation Accuracy"})
eval_ = eval_[["Evaluation Loss", "Evaluation Accuracy"]]

# Styler.hide_index was likewise removed in pandas 2.0 in favour of Styler.hide(axis="index");
# fall back to the old method only on pandas versions that predate Styler.hide.
_eval_styler = eval_.style
_eval_styler.hide(axis="index") if hasattr(_eval_styler, "hide") else _eval_styler.hide_index()

In [None]:
# Save the model into "BERT_hard", the same directory used as the Trainer's output_dir.
trainer.save_model("BERT_hard")