In [1]:
!pip install transformers
!pip install farasapy
!pip install pyarabic
!pip install arabert
!git clone https://github.com/aub-mind/arabert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 34.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 81.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully inst

In [2]:
!pip install emoji 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from arabert.preprocess import ArabertPreprocessor

     

In [4]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [5]:
from transformers import AutoTokenizer, AutoModel

In [6]:
model_name = "aubmindlab/bert-base-arabertv02-twitter"

In [7]:
df_train = pd.read_csv("train.csv")


In [8]:
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=True)
df_train['text']=df_train['text'].apply(arabert_prep.preprocess)

In [9]:
possible_labels = df_train.stance.unique()

In [64]:
print(possible_labels)

[ 1  0 -1]


In [10]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [11]:
df_train.stance = df_train['stance'].map(label_dict)

In [56]:
print(label_dict)

{1: 0, 0: 1, -1: 2}


In [12]:
y_train=df_train.stance.values
print(len(y_train))
x_train=df_train.text.values
print(len(x_train))

6988
6988


In [13]:
df_val = pd.read_csv("dev.csv")

In [14]:
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=True)
df_val['text']=df_val['text'].apply(arabert_prep.preprocess)

In [15]:
df_val.stance = df_val['stance'].map(label_dict)

In [16]:
y_val=df_val.stance.values
print(len(y_val))
x_val=df_val.text.values
print(len(x_val))

1000
1000


# BERT CLASS

In [85]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        # D_in = 256 if version == "mini" else 768
        # H, D_out = 50, 2
        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02-twitter")
        # Instantiate an one-layer feed-forward classifier
        #----------- classifier ---------
        self.classifier = nn.Sequential(
            nn.Linear(768,50),    #768 bert output => linear input
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(50, 3) # no of classes
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
          # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask)
                              
          
          # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

          # Feed input to classifier to compute logits
          # feed el hidden layer embedding to the classifier layer
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 52 µs, sys: 0 ns, total: 52 µs
Wall time: 57 µs


# get input IDs and attention masks


In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/476 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/751k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [19]:
def getIDs_attention(data):
  # Create empty lists to store outputs
  input_ids = []
  attention_masks = []
  #tokenizer = AutoTokenizer.from_pretrained(model_name) if version == "mini" else AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")

  # For every sentence...
  for i,sent in enumerate(data):
      # `encode_plus` will:
      #    (1) Tokenize the sentence
      #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
      #    (3) Truncate/Pad sentence to max length
      #    (4) Map tokens to their IDs
      #    (5) Create attention mask
      #    (6) Return a dictionary of outputs
      encoded_sent = tokenizer.encode_plus(
          text=sent,  # Preprocess sentence
          add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
          max_length=64,                  # Max length to truncate/pad
          padding='max_length',        # Pad sentence to max length
          #return_tensors='pt',           # Return PyTorch tensor
          return_attention_mask=True,     # Return attention mask
          truncation = True 
          )
      
      # Add the outputs to the lists
      input_ids.append(encoded_sent.get('input_ids'))
      attention_masks.append(encoded_sent.get('attention_mask'))
    # Convert lists to tensors
  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)

  return input_ids, attention_masks

        

In [20]:
# get ids and mask attentions for train and val data to feed them to the model
train_inputs, train_masks = getIDs_attention(x_train)
val_inputs, val_masks = getIDs_attention(x_val)

In [21]:
print(len(train_masks))

6988


In [22]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)
print(len(train_data))
print(len(train_inputs))
print(len(train_masks))
print(len(train_labels))

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

6988
6988
6988
6988


# model

In [86]:
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.optim import SparseAdam, Adam
def initialize_model(epochs=4, version="mini"):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=True)
    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(params=list(bert_classifier.parameters()),
                      lr=2e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    # total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    # scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0, # Default value
    #                                             num_training_steps=total_steps)
    return bert_classifier, optimizer 

In [87]:
import random
import time
import torch
import torch.nn as nn
# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, optimizer,train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
             
           
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
          
            # Zero out any previously calculated gradients
            

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            optimizer.zero_grad()
            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            #scheduler.step()
            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
      
        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")

# evaluation

In [25]:
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

# Initialize and train model

In [None]:
set_seed(42) 
bert_classifier, optimizer = initialize_model(epochs=50)
train(bert_classifier,optimizer, train_dataloader, val_dataloader, epochs=50, evaluation=True)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   1.275965   |     -      |     -     |   1.26   
   1    |   40    |   1.114532   |     -      |     -     |   1.18   
   1    |   60    |   1.004542   |     -      |     -     |   1.17   
   1    |   80    |   0.850811   |     -      |     -     |   1.18   
   1    |   100   |   0.828939   |     -      |     -     |   1.19   
   1    |   120   |   0.745436   |     -      |     -     |   1.20   
   1    |   140   |   0.692210   |     -      |     -     |   1.20   
   1    |   160   |   0.690721   |     -      |     -     |   1.22   
   1    |   180   |   0.747411   |     -      |     -     |   1.22   
   1    |   200   |   0.705672   |     -      |     -     |   1.22   
   1    |   220   |   0.694242   |     -      |     -     |   1.22   
   1    |   240   |   0.550650   |     -      |     -     |   1.23   


In [90]:
# store the model in pickle file
import pickle
filename = 'arabert_model.sav'
pickle.dump(bert_classifier, open(filename, 'wb'))

In [28]:
# # Loading the model (to avoid retraining in reruns)

# import pickle
# filename = 'trained_model_mini_with_emojis.sav'
# f = open(filename, 'rb')
# bert_classifier = pickle.load(f)

In [91]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    #model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        
        all_logits.append(logits)
         # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)
    # print(all_logits.shape)
    # print(all_logits)
    pred_labels=[]
    for log in all_logits:
      #print(len(log))
      highest_score=torch.argmax(log)
      #print(highest_score.item())
       
      pred_labels.append(highest_score)
      
      
    # Apply softmax to calculate probabilities
    #probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return pred_labels

In [67]:
print(y_val==val_labels)

False


In [92]:
# Compute predicted probabilities on the validation set
y_pred = bert_predict(bert_classifier, val_dataloader)
print(len(y_val))
print(len(y_pred))
print(classification_report(y_val, torch.tensor(y_pred)))
# Evaluate the Bert classifier
#evaluate_roc(probs, y_val)

1000
1000
              precision    recall  f1-score   support

           0       0.85      0.97      0.90       804
           1       0.57      0.30      0.39       126
           2       0.46      0.09      0.14        70

    accuracy                           0.82      1000
   macro avg       0.63      0.45      0.48      1000
weighted avg       0.79      0.82      0.79      1000

