# Sentiment analysis using araBERT
## Installing Prerequisites

In [None]:
# !git clone https://github.com/aub-mind/arabert
%pip install -r requirements.txt

In [None]:
#Checking for GPU
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

## Imports

In [None]:
import pandas as pd
import numpy as np

from farasa.segmenter import FarasaSegmenter
from arabert.preprocess import ArabertPreprocessor, NEVER_SPLIT_TOKENS

preprocessor = ArabertPreprocessor("bert-base-arabert")
preprocess = preprocessor.preprocess
never_split_tokens = NEVER_SPLIT_TOKENS

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, BertForSequenceClassification, AutoTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Reading and Preparing Data

In [None]:
farasa_segmenter = FarasaSegmenter(interactive=True) # the arabet was built to use this segmenter

df_AJGT = pd.read_excel('AJGT.xlsx',header=0)

DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

df_AJGT = df_AJGT[['Feed', 'Sentiment']]
df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]

label_map = {
    'Negative' : 0,
    'Positive' : 1
}

df_AJGT[DATA_COLUMN] = df_AJGT[DATA_COLUMN].apply(lambda x: preprocess(x))
df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(lambda x: label_map[x])


train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2,random_state=42)

In [None]:
test_AJGT

In [None]:
train_sentence_length = [ len(text.split()) for text in train_AJGT['text']]
test_sentence_length = [ len(text.split()) for text in test_AJGT['text']]

print("Max of train len: ",np.max(train_sentence_length))
print("Max of test len: ",np.max(test_sentence_length))
import matplotlib.pyplot as plt

plt.hist(train_sentence_length,bins= range(200),density=True)
plt.hist(test_sentence_length,bins= range(200),density=True)
plt.show()

In [None]:
train_df = pd.DataFrame({
    'id':range(len(train_AJGT)),
    'label':train_AJGT["label"],
    'alpha':['a']*train_AJGT.shape[0],
    'text': train_AJGT["text"].replace(r'\n', ' ', regex=True)
})

dev_df = pd.DataFrame({
    'id':range(len(test_AJGT)),
    'label':test_AJGT["label"],
    'alpha':['a']*test_AJGT.shape[0],
    'text': test_AJGT["text"].replace(r'\n', ' ', regex=True)
})

!mkdir data
train_df.to_csv("data/train.tsv",index=False,columns=train_df.columns,sep='\t',header=False)
dev_df.to_csv("data/dev.tsv",index=False,columns=dev_df.columns,sep='\t',header=False)

## Model

That's it!! Now let's build our model

In [None]:
model_name = 'aubmindlab/bert-base-arabert' 
num_labels = 2
task_name = 'classification'
max_length = 128

In [None]:
config = AutoConfig.from_pretrained(model_name,num_labels=num_labels, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, do_basic_tokenize=True, never_split=never_split_tokens)
tokenizer.max_len = max_length
model = BertForSequenceClassification.from_pretrained(model_name,config=config)

Now we need to convert the examples in the dataset to features that the model can understand

In [None]:
train_dataset = SingleSentenceClassificationProcessor(mode='classification')
test_dataset = SingleSentenceClassificationProcessor(mode='classification')

In [None]:
train_dataset.add_examples(texts_or_text_and_labels=train_df['text'],labels=train_df['label'],overwrite_examples = True)
test_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],labels=dev_df['label'],overwrite_examples = True)

print(train_dataset.examples[0])

In [None]:
train_features = train_dataset.get_features(tokenizer = tokenizer, max_length = max_length)
test_features = test_dataset.get_features(tokenizer = tokenizer, max_length = max_length)

In [None]:
print(train_features[0])

In [None]:
print(len(train_features))
print(len(test_features))

In [None]:
pos_train_labels = np.sum([feature.label for feature in train_features])
pos_test_labels = np.sum([feature.label for feature in test_features])

print(pos_train_labels)
print(pos_test_labels)

In [None]:
training_args = TrainingArguments("./train")

In [None]:
training_args.do_train = True
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 2e-5
training_args.warmup_steps = 0
training_args.per_gpu_train_batch_size = 16
training_args.per_gpu_eval_batch_size = 16
training_args.num_train_epochs= 5
training_args.logging_steps = (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
training_args.save_steps = training_args.logging_steps 
training_args.seed = 42

In [None]:
print(training_args.logging_steps)

In [None]:
def compute_metrics(p): #p should be of type EvalPrediction
  print("biiitch")
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  print(classification_report(p.label_ids,preds))
  print(confusion_matrix(p.label_ids,preds))

  f1_Positive = f1_score(p.label_ids,preds,pos_label=1,average='binary')
  f1_Negative = f1_score(p.label_ids,preds,pos_label=0,average='binary')
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'f1_pos': f1_Positive,
      'f1_neg': f1_Negative,
      'macro_f1' : macro_f1, 
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

In [None]:
trainer = Trainer(model=model,
                  args = training_args,
                  train_dataset = train_features,
                  eval_dataset = test_features,
                  compute_metrics = compute_metrics)

In [None]:
trainer.train()

In [1]:
trainer.train()


              precision    recall  f1-score   support

           0       0.93      0.90      0.92       167
           1       0.92      0.94      0.93       193

    accuracy                           0.92       360
   macro avg       0.92      0.92      0.92       360
weighted avg       0.92      0.92      0.92       360
      
