## Longformer [[Beltagy et al.]](https://arxiv.org/abs/2004.05150)

Longformer self attention employs self attention on both a "local" context and a "global" context. Most tokens only attend "locally" to each other meaning that each token attends to its w/2 previous tokens and w/2 succeeding tokens with w being the window length. To increase the receptive field, the authors also applied dilation to the local window so they can increase the size of w without incurring in additional memory costs. This allows us to attend more than 512 tokens. </br>
In this tutorial, we will see an example of how to use Longformer for text classification.


<div>
<img src="Images/longformer.png"  width="600"/>
</div>

### Import Libraries

In [1]:

import numpy as np
import pandas as pd
import json, sys, regex
import torch
#import GPUtil
import torch.nn as nn
import shutil
from glob import glob
from shutil import copyfile
from tqdm import tqdm, trange
import os
from torch.nn import functional as F

import random

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
##----------------------------------------------------
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, LongformerConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import datasets

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(device)


def set_seed(seed):
    # Set the random seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

cpu


### Function for Tokenizing Train & Test Datasets

In [2]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, lab2ind, text_col_1 = 'sentence', text_col_2 = None, label_col = 'labels'):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text_col_1 = self.data[text_col_1]
        if(text_col_2 is None):
            self.text_col_2 = None
        else:
            self.text_col_2 = self.data[text_col_2]
        self.labels = self.data[label_col]
        self.max_len = max_len
        self.lab2ind = lab2ind
        
        self.isPair = True
        if(self.text_col_2 is None):
            self.isPair = False
            self.text_col_2 = self.data[text_col_1]

    def __len__(self):
        return len(self.text_col_1)

    def __getitem__(self, index):
        text_1 = str(self.text_col_1[index])     
        text_2 = str(self.text_col_2[index]) 
        
        label = self.labels[index]
        label = self.lab2ind[label]
        try:
            label = self.lab2ind[label]
        except:
            pass
        
        if(self.isPair):
            inputs = self.tokenizer.batch_encode_plus(
            [text_1, text_2],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        else:
            inputs = self.tokenizer.batch_encode_plus(
            [text_1],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            #return_token_type_ids=False, !!!!!!!!!
            truncation=True
        )

        if(self.isPair):
            dic = {
            'ids': torch.tensor(inputs.input_ids, dtype=torch.long),
            'mask': torch.tensor(inputs.attention_mask, dtype=torch.long),
            'token': torch.tensor(inputs.token_type_ids, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        }
        else:
            dic = {
            'ids': torch.tensor(inputs.input_ids, dtype=torch.long),
            'mask': torch.tensor(inputs.attention_mask, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        }
        
        return dic

### Function for Encoding Dataset

In [3]:
# define a function for data preparation
def regular_encode(file_path, tokenizer, lab2ind, shuffle=True, num_workers = 1, batch_size=64, maxlen = 32, mode = 'train', text_col_1 = 'sentence', text_col_2 = None, label_col = 'labels'):
    
    # if we are in train mode, we will load two columns (i.e., text and label).
    delimiter = None
    if(str(file_path).endswith('tsv')):
        delimiter = '\t'
    if mode == 'train':
        # Use pandas to load dataset
        df = pd.read_csv(file_path, delimiter=delimiter)
        custom_set = CustomDataset(df, tokenizer, maxlen,lab2ind, text_col_1 = text_col_1, text_col_2 = text_col_2, label_col = label_col)
    
    # if we are in predict mode, we will load one column (i.e., text).
    elif mode == 'evaluate':
        df = pd.read_csv(file_path, delimiter=delimiter)
        custom_set = CustomDataset(df, tokenizer, maxlen,lab2ind, text_col_1 = text_col_1, text_col_2 = text_col_2, label_col = label_col)
    else:
        print("the type of mode should be either 'train' or 'predict'. ")

        return
        
    print("{} Dataset: {}".format(file_path, df.shape))
    
    dataset_params = {'batch_size': batch_size, 'shuffle': shuffle, 'num_workers': num_workers}

    batch_data_loader = DataLoader(custom_set, **dataset_params)
    
    return batch_data_loader

### Define Longformer Model and Tokenizer

In [4]:
# load tokenizer and model and define length of the text sequence
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', max_length = 128)

model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                           gradient_checkpointing=False,
                                                           attention_window = 20, num_labels=2)

print(model.config)

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "architectures": [
    "LongformerForSequenceClassification"
  ],
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    20,
    20,
    20,
    20,
    20,
    20,
    20,
    20,
    20,
    20,
    20,
    20
  ],
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



### Define Train Function for Longformer

In [5]:
def train(model, iterator, optimizer, scheduler, isPair = False):
    
    model.train()

    epoch_loss = 0.0

    for _, batch in enumerate(tqdm(iterator, desc="Iteration")):
        
        input_ids = batch['ids'].to(device, dtype = torch.long)
        attention_mask = batch['mask'].to(device, dtype = torch.long)
        try:
            token_type_ids = batch['token'].to(device, dtype=torch.long)
        except:
            token_type_ids = None
        labels = batch['targets'].to(device, dtype = torch.long)

        batch_size = input_ids.shape[0]
        num_sent = input_ids.shape[1]

        if (len(input_ids.shape) == 3):
            input_ids = input_ids.view((-1, input_ids.size(-1)))
            attention_mask = attention_mask.view((-1, input_ids.size(-1)))
        if (token_type_ids and (len(token_type_ids.shape) == 3) and (isPair==True)):
            token_type_ids = token_type_ids.view((-1, input_ids.size(-1)))

        outputs = model(input_ids=input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, labels = labels)
        loss, logits = outputs[:2]


        if torch.cuda.device_count() == 1:
            loss.backward()
            epoch_loss += loss.cpu().item()

        elif torch.cuda.device_count() > 1:
            loss.mean().backward()
            epoch_loss += loss.mean().cpu().item()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        model.zero_grad()
        scheduler.step()
        optimizer.zero_grad()

    # free GPU memory
    if device == 'cuda':
        torch.cuda.empty_cache()

    return epoch_loss / len(iterator)

### Define Evaluation

In [6]:
def evaluate(model, iterator, metric, is_regression = False, isPair = False):
    AvgRec=0.00
    Fpn=0.00
    model.eval()
    epoch_loss = 0

    all_pred=[]
    all_label = []

    with torch.no_grad():
        for _, batch in enumerate(iterator, 0):
        # Add batch to GPU
            input_ids = batch['ids'].to(device, dtype = torch.long)
            attention_mask = batch['mask'].to(device, dtype = torch.long)
            try:
                token_type_ids = batch['token'].to(device, dtype=torch.long)
            except:
                token_type_ids = None
            labels = batch['targets'].to(device, dtype = torch.long)
            
            if (len(input_ids.shape) == 3):
                input_ids = input_ids.view((-1, input_ids.size(-1)))
                attention_mask = attention_mask.view((-1, input_ids.size(-1)))
            if (token_type_ids and (len(token_type_ids.shape) == 3) and (isPair==True)):
                token_type_ids = token_type_ids.view((-1, input_ids.size(-1)))
            
            outputs = model(input_ids=input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, labels = labels)
            loss, logits = outputs[:2]

            # delete used variables to free GPU memory
            del batch, input_ids, attention_mask, token_type_ids

            if torch.cuda.device_count() == 1:
                epoch_loss += loss.cpu().item()
            else:
                epoch_loss += loss.sum().cpu().item()
            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(logits.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(logits.cpu() if is_regression else predicted)
            all_label.extend(labels.cpu())

   
    if(is_regression):
        result =  {"mse": (np.array((all_pred - all_label) ** 2)).mean().item()}
    else:
        result = metric(predictions=all_pred, references=all_label)
    return epoch_loss/len(iterator), result

### Create Optimizer and Metric

In [7]:
def create_optimizer_and_scheduler(total_params, num_training_steps, warmup_steps, weight_decay, learning_rate, is_constant_lr):
    """
    Setup the optimizer and the learning rate scheduler.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in total_params if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in total_params if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate
    )
    
    if is_constant_lr == True:
    	lr_scheduler = get_constant_schedule(optimizer)
    else:
	    lr_scheduler = get_linear_schedule_with_warmup(
	        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
	    )
    return optimizer, lr_scheduler

In [8]:
def simple_accuracy(preds, labels):
    return (preds == labels).mean().item()

def accuracy(predictions, references):
    acc = accuracy_score(references, predictions)
    return {
        "accuracy": acc,
    }

### Finetuning Function

In [9]:
def fine_tuning(model, tokenizer, config, seed):
    set_seed(seed)
    #---------------------------------------

    task_name = config["task_name"]
    text_col_1 = config["text_column_1"]
    isPair = False ## Not Pair Sentence Classification
    try:
        text_col_2 = config["text_column_2"]
        if(text_col_2 is not None):
            isPair = True
    except: 
        text_col_2 = None
    label_col = config["label_column"]

    train_file = os.path.join(config["data_dir"], config["train_file"])


    try:
        is_constant_lr = config["is_constant_lr"]
    except: 
        is_constant_lr = False


        
    dev_file = os.path.join(config["data_dir"], config["dev_file"])
    test_file = os.path.join(config["data_dir"], config["test_file"])


    max_seq_length= int(config["max_seq_length"])
    batch_size = int(config["batch_size"])

    try: 
        early_stop = config["early_stop"]
    except:
        early_stop = 5

    try:
        save_model = config["save_model"]
    except: 
        save_model = False


    learning_rate = float(config["lr"]) 
    model_path = config['pretrained_model_path']
    num_epochs = config['epochs']

    #---------------------------------------------------------
    print("[INFO] step (2) check checkpoit directory and report file:")
    ckpt_dir = config["ckpt_dir"] + "/"

    #-------------------------------------------------------
    print("[INFO] step (3) load label to number dictionary:")
    
    delimiter = None
    if(str(train_file).endswith('tsv')):
        delimiter = '\t'
    df = pd.read_csv(train_file, delimiter=delimiter)
    labels = df[label_col].tolist()
    is_regression = False
    if(isinstance(labels[0], float)):
        is_regression = True
        lab2ind = {'float':0}
    
    unique_labels = list(set(labels))
    lab2ind = {l:ind for ind,l in enumerate(unique_labels)}
    
    
    try:
        num_workers = config['num_workers']
    except:
        num_workers = 1
    
    
    print("[INFO] train_file", train_file)
    print("[INFO] dev_file", dev_file)
    print("[INFO] test_file", test_file)
    print("[INFO] num_epochs", num_epochs)
    print("[INFO] model_path", model_path)
    print("[INFO] max_seq_length", max_seq_length)
    print("[INFO] batch_size", batch_size)
    print("[INFO] Number of Classes", len(lab2ind))
    print("[INFO] Number of Workers", num_workers)
    print("[INFO] step (4) Use defined funtion to extract tokanize data")

    criterion = nn.CrossEntropyLoss()
    if(is_regression):
        criterion = nn.MSELoss()


    print("[INFO] step (5) Create an iterator of data with torch DataLoader.")

    train_dataloader = regular_encode(train_file, tokenizer, lab2ind, True, batch_size=batch_size, maxlen = max_seq_length, mode = "train",
                                     text_col_1 = text_col_1, text_col_2 = text_col_2, label_col = label_col)
    validation_dataloader = regular_encode(dev_file, tokenizer, lab2ind, True, batch_size=batch_size, maxlen = max_seq_length, mode = "evaluate",
                                     text_col_1 = text_col_1, text_col_2 = text_col_2, label_col = label_col)

    print("[INFO] step (6) run with parallel CPU/GPUs")
    if torch.cuda.is_available():
        if torch.cuda.device_count() == 1:
            print("Run",model_path, "with one GPU")
            model = model.to(device)


    #---------------------------------------------------
    print("[INFO] step (7) set Parameters, schedules, and loss function:")
    global max_grad_norm
    max_grad_norm = 1.0
    try:
        warmup_proportion = config["warmup_proportion"]
    except: 
        warmup_proportion = 0.06

    num_training_steps	= len(train_dataloader) * num_epochs
    num_warmup_steps = num_training_steps * warmup_proportion
    ### In Transformers, optimizer and schedules are instantiated like this:
    # Note: AdamW is a class from the huggingface library
    # the 'W' stands for 'Weight Decay"
    weight_decay = 0.01
    
    # schedules
    total_params = list(model.named_parameters())

    optimizer, scheduler = create_optimizer_and_scheduler(total_params, num_training_steps, num_warmup_steps, weight_decay, learning_rate, is_constant_lr)

    metric = accuracy
    
    print("[INFO] step (8) start fine_tuning")
    
    
    print("[INFO] step (8) start fine_tuning")
    
    for epoch in trange(num_epochs, desc="Epoch"):
        print(f'Epoch: {epoch+1}')
        train_loss = train(model, train_dataloader, optimizer, scheduler, isPair = isPair)	 
        eval_loss, eval_result = evaluate(model, validation_dataloader, metric, isPair=isPair)
        print(f'Train Loss: {train_loss}')
        print(eval_result)

    

### Run Model

In [10]:
config = {}

config['task_name'] = "Binary Sentiment Classification"
config['text_column_1'] = "sentence"
config['text_column_2'] = None
config['label_column'] = "label"
config['data_dir'] = "./"
config['train_file'] = "sst2_tiny.csv"
config['dev_file'] = "sst2_tiny.csv"
config['test_file'] = "sst2_tiny.csv"
config['is_constant_lr'] = False
config['max_seq_length'] = 4096
config['batch_size'] = 5
config['early_stop'] = 5
config['save_model'] = False
config['lr'] = 0.00005
config['pretrained_model_path'] = "allenai/longformer-base-4096"
config['epochs'] = 3
config['ckpt_dir'] = "ckpt"
config['warmup_proportion'] = 0.05
config['metric'] = "accuracy"
config['num_workers'] = 1


seed = 42

fine_tuning(model, tokenizer, config, seed)



[INFO] step (2) check checkpoit directory and report file:
[INFO] step (3) load label to number dictionary:
[INFO] train_file ./sst2_tiny.csv
[INFO] dev_file ./sst2_tiny.csv
[INFO] test_file ./sst2_tiny.csv
[INFO] num_epochs 3
[INFO] model_path allenai/longformer-base-4096
[INFO] max_seq_length 4096
[INFO] batch_size 5
[INFO] Number of Classes 2
[INFO] Number of Workers 1
[INFO] step (4) Use defined funtion to extract tokanize data
[INFO] step (5) Create an iterator of data with torch DataLoader.
./sst2_tiny.csv Dataset: (10, 3)
./sst2_tiny.csv Dataset: (10, 3)
[INFO] step (6) run with parallel CPU/GPUs
[INFO] step (7) set Parameters, schedules, and loss function:
[INFO] step (8) start fine_tuning
[INFO] step (8) start fine_tuning


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 1



Iteration:   0%|          | 0/2 [00:00<?, ?it/s][A
Iteration:  50%|█████     | 1/2 [00:58<00:58, 58.70s/it][A
Iteration: 100%|██████████| 2/2 [01:56<00:00, 58.42s/it][A
Epoch:  33%|███▎      | 1/3 [03:41<07:23, 221.90s/it]

Train Loss: 0.0
{'accuracy': 0.6}
Epoch: 2



Iteration:   0%|          | 0/2 [00:00<?, ?it/s][A
Iteration:  50%|█████     | 1/2 [00:59<00:59, 59.10s/it][A
Iteration: 100%|██████████| 2/2 [01:56<00:00, 58.43s/it][A
Epoch:  67%|██████▋   | 2/3 [07:22<03:41, 221.27s/it]

Train Loss: 0.0
{'accuracy': 0.6}
Epoch: 3



Iteration:   0%|          | 0/2 [00:00<?, ?it/s][A
Iteration:  50%|█████     | 1/2 [00:59<00:59, 59.50s/it][A
Iteration: 100%|██████████| 2/2 [01:57<00:00, 58.63s/it][A
Epoch: 100%|██████████| 3/3 [11:04<00:00, 221.40s/it]

Train Loss: 0.0
{'accuracy': 0.6}



