In [None]:
!pip install transformers
!pip install farasapy
!pip install pyarabic
!pip install arabert
!git clone https://github.com/aub-mind/arabert

: 

In [None]:
!pip install emoji 

: 

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from arabert.preprocess import ArabertPreprocessor

     

: 

In [None]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

: 

In [None]:
from transformers import AutoTokenizer, AutoModel

: 

# constants

In [None]:
model_name = "aubmindlab/bert-base-arabertv02-twitter"
hidden_state_size=50 # size of hidden layer in the classification layer above bert
n_classes= 3 # 3 for stance,make it 10 for cat
class_name="stance" # change to "category" for category classification

# loading and preprocessing train and validation data

In [7]:
df_train = pd.read_csv("train.csv")


In [8]:
# preprocess the train data using bert preprocessor
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=True)
df_train['text']=df_train['text'].apply(arabert_prep.preprocess)

In [9]:
#extract unique labels
possible_labels = df_train.stance.unique()

In [11]:
# map possible labels to +ve numbers from 0 to n_classes-1
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [12]:
# update with the new labels
df_train.stance = df_train['stance'].map(label_dict)

In [14]:
#store train tweets and labels in lists
y_train=df_train.stance.values
x_train=df_train.text.values

6988
6988


 # Repeat the previous steps for the validation data

In [15]:
df_val = pd.read_csv("dev.csv")

In [16]:
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=True)
df_val['text']=df_val['text'].apply(arabert_prep.preprocess)

In [17]:
df_val.stance = df_val['stance'].map(label_dict)

In [18]:
y_val=df_val.stance.values
x_val=df_val.text.values

1000
1000


# BERT CLASS

In [19]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    
    def __init__(self, freeze_bert=False):
        """
        input parameters
        bert: a BertModel object
        classifier: NN classifier layer above bert
        freeze_bert: boolean we make it false for bert fine tunig
        """
        super(BertClassifier, self).__init__()
        
        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02-twitter")
        # Instantiate an one-layer feed-forward classifier
        #----------- classifier ---------
        """ 
        this layer make it more complex to be able to behave better inshallah
        first NN linear layer take 768 bert's output and outs hidden state with size 50
        we can change this size but it was recommended to use it as 50,
        the 2 layers RELU AND dropout 
        then another linear layer take the hidden and out the classes 
        """
        self.classifier = nn.Sequential(

            nn.Linear(768,hidden_state_size),    
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_state_size, n_classes) # no of classes
        )

        # Freeze bert model to enable fine tuning and let the model train on our own data
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute predicted probabilities.
        input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        attention_mask (torch.Tensor):
         a tensor that hold attention mask(pay attention to the most important part in the input)
                      information with shape (batch_size, max_length)
        
        """
          # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # feed el hidden layer embedding to the classifier layer
        probs = self.classifier(last_hidden_state_cls)

        return probs

CPU times: user 32.6 ms, sys: 5.55 ms, total: 38.2 ms
Wall time: 38.8 ms


# get input IDs and attention masks


In [20]:
# download tokenizer of the model to use it to extarct ids and attention masks of data
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/476 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/751k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [21]:
def getIDs_attention(data):
 
  IDs = [] #store ids here
  attenMasks = [] # store attention masks here
  #tokenizer = AutoTokenizer.from_pretrained(model_name) if version == "mini" else AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")

  # loop on all tweets
  for i,sent in enumerate(data):
      # `encode_plus` will do the follwoing:
      #    Tokenize the sentence
      #    Pad sentence to max length
      #    Map tokens to their IDs
      #    Create attention mask
      #    Return a dictionary of outputs
      encoded_sent = tokenizer.encode_plus(
          text=sent,  # Preprocess sentence
          add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
          max_length=64,                  # Max length to truncate/pad
          padding='max_length',        # Pad sentence to max length  ( recommended 64)
          return_attention_mask=True,     # Return attention mask
          truncation = True 
          )
      
      # add ids of the tweet in the IDs list
      IDs.append(encoded_sent.get('input_ids'))
      # add attention masks of the tweet in the IDs list
      attenMasks.append(encoded_sent.get('attention_mask'))
    # make the lists tensors
  IDs = torch.tensor(IDs)
  attenMasks = torch.tensor(attenMasks)

  return IDs, attenMasks

        

In [22]:
# get ids and mask attentions for train and val data to feed them to the model
train_inputs, train_masks = getIDs_attention(x_train)
val_inputs, val_masks = getIDs_attention(x_val)

In [24]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#convert labels to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16

"""
use ids and attention masks to create dataLoader for training 
and validation sets
"""
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

6988
6988
6988
6988


# model

In [25]:
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.optim import SparseAdam, Adam
def initialize_model(epochs=4, version="mini"):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=True)
    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # optimizer that will perform back propagation and compute loss
    optimizer = AdamW(params=list(bert_classifier.parameters()),
                      lr=0.001,    # learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    #Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer ,scheduler

In [26]:
import random
import time
import torch
import torch.nn as nn
# use loss entropy as our loss function
loss_fn = nn.CrossEntropyLoss()

def train(model, optimizer,scheduler,train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
   
    # training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)            

            # feed ids and attention mash to the model
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            optimizer.zero_grad()
            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()
            
            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
      
        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
       
    print("Training complete!")

# Initialize and train model

In [46]:
bert_classifier, optimizer ,scheduler= initialize_model(epochs=40)
train(bert_classifier,optimizer,scheduler, train_dataloader, val_dataloader, epochs=40)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.739872   |     -      |     -     |   1.26   
   1    |   40    |   0.582242   |     -      |     -     |   1.17   
   1    |   60    |   0.567579   |     -      |     -     |   1.19   
   1    |   80    |   0.490767   |     -      |     -     |   1.19   
   1    |   100   |   0.529208   |     -      |     -     |   1.19   
   1    |   120   |   0.454230   |     -      |     -     |   1.20   
   1    |   140   |   0.491373   |     -      |     -     |   1.22   
   1    |   160   |   0.544625   |     -      |     -     |   1.22   
   1    |   180   |   0.534463   |     -      |     -     |   1.22   
   1    |   200   |   0.533159   |     -      |     -     |   1.22   
   1    |   220   |   0.509320   |     -      |     -     |   1.22   
   1    |   240   |   0.431907   |     -      |     -     |   1.22   


In [47]:
# store the model in pickle file
import pickle
filename = 'arabert_model.sav'
pickle.dump(bert_classifier, open(filename, 'wb'))

In [49]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    all_probabilities = []

    # loop on batches of test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute probabilities
        with torch.no_grad():
            probabilities = model(b_input_ids, b_attn_mask)
        
        all_probabilities.append(probabilities)
    # Concatenate logits from each batch
    all_probabilities = torch.cat(all_probabilities, dim=0)
  
    # take the class with highest score  
    pred_labels=[]
    for log in all_probabilities:
      highest_score=torch.argmax(log)
       
      pred_labels.append(highest_score)

    return pred_labels

In [51]:
# Compute predicted probabilities on the validation set
y_pred = bert_predict(bert_classifier, val_dataloader)
print(classification_report(y_val, torch.tensor(y_pred)))


1000
1000
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       804
           1       0.51      0.32      0.39       126
           2       0.61      0.27      0.38        70

    accuracy                           0.82      1000
   macro avg       0.66      0.51      0.56      1000
weighted avg       0.80      0.82      0.80      1000



# TEST TWEETS

In [35]:
df_test = pd.read_csv("test.csv")

In [36]:
#preprocess test data
df_test['text']=df_test['text'].apply(arabert_prep.preprocess)

In [37]:
x_test=df_test.text.values

In [38]:
test_inputs, test_masks = getIDs_attention(x_test)

In [39]:
# Create the DataLoader for our test data
test_data = TensorDataset(test_inputs, test_masks)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [40]:

test_labels_cat = bert_predict(bert_classifier, test_dataloader)


In [41]:
test_labels=[]
for label in test_labels_cat:
  test_labels.append(label.item())


In [42]:
#map labels
for i in range(0,len(test_labels)):
  if test_labels[i] == 0:
    test_labels[i]=1
  elif test_labels[i] == 1:
    test_labels[i]=0
  elif test_labels[i] ==2:
    test_labels[i]=-1

In [44]:
# write the predicted data to a csv file
d = {'stance': test_labels}
test_csv = pd.DataFrame(data=d, columns=['stance'])
test_csv.to_csv('test_result.csv')