# 📖 W&B-tracked Shortformers experiments

![](https://storage.googleapis.com/kaggle-competitions/kaggle/31779/logos/header.png)


## This notebook defines a reduced version of the [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021) competition data in order to experiment quickly, logging the results to W&B.

## You can check the results of the current experiments here: https://wandb.ai/dataista/fp-models-exp1 and, in general, of all experiments, here: https://wandb.ai/dataista/. I'm making the results public.


There is a cell with various configuration parameters that affect the full experiment.
The keys `train_data_size` and `val_data_size` allow reducing the dataset size in order to run epochs faster. Hopefully, the results obtained with these smaller versions are correlated with the results on the full problem.

All the code is wrapped up in a `run` facade function, making it easier to copy everything into a python script and run it from somewhere else.

For using W&B you need to set up a user and configure the API key in the Add-ons menu here in the notebook editor. You can check the instructions provided for this, for example, in this great notebook: [[Pytorch + W&B] Jigsaw Starter](https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter) by [Debarshi Chanda](https://www.kaggle.com/debarshichanda).

This is a follow-up of the [📖 PyTorch- "ShortFormer" w/Chunks - Train [0.624]](https://www.kaggle.com/julian3833/pytorch-shortformer-w-chunks-train-0-624) that frames the problem as a token classification ("NER"-like) problem and uses small transformers (not longformer or bigbird) using the old-fashioned chunk, predict and regroup strategy.


# Please _DO_ upvote if you found this kernel useful or interesting! 🤗

&nbsp;
&nbsp;

Ok, let's go!

# Imports

In [None]:
import os
import gc
import ast
import time
import wandb
from tqdm import tqdm
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Run Configuration

This is the main configuration for the execution

In [None]:

# Hyper-parameters and data-reduction knobs read by the functions below.
config = dict(
    train_batch_size=4,
    valid_batch_size=2,
    epochs=5,
    # One learning rate per epoch: train() indexes this list by epoch number.
    learning_rates=[2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    max_grad_norm=10,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    model_name='roberta-large',
    # Chunk length and overlap (in tokens) handed to the tokenizer.
    max_length=512,
    doc_stride=128,
    # Fractions of the train / validation document ids that run() keeps.
    train_data_size=0.33,
    val_data_size=0.33,
    experiment_description='Roberta Large LR=Chris',
)

# Where the W&B run is logged.
WANDB_ENTITY = "dataista"
WANDB_PROJECT = "fp-roberta-lr"
RUN_NAME = "{}-lr=chris".format(config['model_name'])

# Code

In [None]:
def download_model():
    """Download tokenizer, config and backbone of ``config['model_name']`` into ``model/``.

    Does nothing when the ``model`` directory already exists. Everything is
    fetched *before* the directory is created, so a failed download does not
    leave an empty ``model/`` behind that would make later calls return early
    without a usable model.
    """
    # https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615
    if os.path.isdir('model'):
        return

    model_name = config['model_name']

    # Fetch all three artifacts first; any failure here leaves the disk untouched.
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

    config_model = AutoConfig.from_pretrained(model_name)
    # 15 = number of entries in the `output_labels` list built in run().
    config_model.num_labels = 15

    backbone = AutoModelForTokenClassification.from_pretrained(model_name,
                                                               config=config_model)

    # Only now materialize the local copy.
    os.makedirs('model', exist_ok=True)
    tokenizer.save_pretrained('model')
    config_model.save_pretrained('model')
    backbone.save_pretrained('model')
    print("Model downloaded to model/")
    

def get_labels(word_ids, word_labels):
    """Build the per-token label ids for one tokenized chunk.

    Args:
        word_ids: for every token, the index of the word it belongs to, or
            ``None`` when the token belongs to no word.
        word_labels: label string of every word, indexed by word index.

    Returns:
        A list as long as ``word_ids``: ``LABELS_TO_IDS[...]`` of the token's
        word, or ``-100`` for tokens without a word (the accuracy code in this
        file masks out positions holding ``-100``).
    """
    return [
        -100 if word_idx is None else LABELS_TO_IDS[word_labels[word_idx]]
        for word_idx in word_ids
    ]


# Tokenize texts, possibly generating more than one tokenized sample for each text
def tokenize(df, tokenizer, to_tensor=True, with_labels=True):
    """Tokenize ``df['text_split']`` into fixed-length, overlapping chunks.

    Args:
        df: DataFrame with a ``text_split`` column (list of words per row)
            and, when ``with_labels`` is true, an ``entities`` column (one
            label per word).
        tokenizer: callable tokenizer; its result must expose
            ``overflow_to_sample_mapping`` and ``word_ids(i)``.
        to_tensor: convert every field of the result with ``torch.as_tensor``.
        with_labels: also emit a ``labels`` field built with ``get_labels``.

    Returns:
        The tokenizer output extended with ``wids`` (word index per token,
        ``-1`` where the tokenizer reports ``None``) and optionally
        ``labels``; a plain dict of tensors when ``to_tensor`` is true.
    """
    # This is what's different from a longformer: long texts overflow into
    # extra chunks of `max_length` tokens that overlap by `doc_stride` tokens.
    encoded = tokenizer(
        df['text_split'].tolist(),
        is_split_into_words=True,
        return_overflowing_tokens=True,
        stride=config['doc_stride'],
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
    )

    if with_labels:
        encoded['labels'] = []
    encoded['wids'] = []

    # One iteration per chunk; `text_idx` is the row of `df` the chunk came from.
    for chunk_idx, text_idx in enumerate(encoded['overflow_to_sample_mapping']):
        # Word index of every token of this chunk (None for tokens without a word)
        chunk_word_ids = encoded.word_ids(chunk_idx)

        if with_labels:
            # Labels of the full un-chunked text, projected onto this chunk's tokens
            full_text_labels = df['entities'].iloc[text_idx]
            encoded['labels'].append(get_labels(chunk_word_ids, full_text_labels))

        encoded['wids'].append([-1 if w is None else w for w in chunk_word_ids])

    if not to_tensor:
        return encoded
    return {name: torch.as_tensor(values) for name, values in encoded.items()}


class FeedbackPrizeDataset(Dataset):
    """Dataset view over a mapping of field name -> per-sample sequence.

    ``tokenized_ds`` must contain an ``input_ids`` entry; its length defines
    the number of samples.
    """

    def __init__(self, tokenized_ds):
        self.data = tokenized_ds

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, index):
        # Slice the same position out of every field.
        return {name: column[index] for name, column in self.data.items()}
    
    
# from Rob Mulla @robikscube
# https://www.kaggle.com/robikscube/student-writing-competition-twitch
def calc_overlap(row):
    """
    Overlap ratios between one prediction and one ground truth.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`,
    both space-separated strings. Returns a two-element list:
    [shared / size of ground truth, shared / size of prediction],
    where sizes are counted on the sets of space-separated items.
    """
    pred_words = set(row.predictionstring_pred.split(' '))
    gt_words = set(row.predictionstring_gt.split(' '))
    shared = len(gt_words & pred_words)
    return [shared / len(gt_words), shared / len(pred_words)]


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: DataFrame with columns `id`, `class`, `predictionstring`
            (space-separated items).
        gt_df: DataFrame with columns `id`, `discourse_type`,
            `predictionstring`.

    Returns:
        TP / (TP + 0.5 * (FP + FN)) as a float; 0.0 when there are neither
        predictions nor ground truths.
    """
    gt_df = gt_df[['id', 'discourse_type', 'predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df = pred_df[['id', 'class', 'predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index

    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id', 'class'],
                           right_on=['id', 'discourse_type'],
                           how='outer',
                           suffixes=('_pred', '_gt'))
    # Rows that exist on one side only get a blank string so calc_overlap can split it.
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    # calc_overlap only reads the two predictionstring attributes, which
    # itertuples exposes under their column names. Building plain lists also
    # works when `joined` is empty.
    overlaps = [calc_overlap(row) for row in joined.itertuples(index=False)]
    joined['overlap1'] = pd.Series([o[0] for o in overlaps], index=joined.index, dtype=float)
    joined['overlap2'] = pd.Series([o[1] for o in overlaps], index=joined.index, dtype=float)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1', 'overlap2']].max(axis=1)
    matches = joined.query('potential_TP')
    tp_pred_ids = matches \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id', 'predictionstring_gt']).first()['pred_id'].values

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer join leaves NaN in pred_id / gt_id for one-sided rows; those
    # NaNs are not real predictions / ground truths and must not be counted.
    tp_lookup = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique() if p not in tp_lookup]

    matched_gt_ids = set(matches['gt_id'].unique())
    unmatched_gt_ids = [g for g in joined['gt_id'].dropna().unique()
                        if g not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)

    # calc microf1 (guard the degenerate "nothing to score" case)
    denominator = TP + 0.5 * (FP + FN)
    if denominator == 0:
        return 0.0
    return TP / denominator


def inference(dl, model):
    """Run the model over every chunk of `dl` and regroup predictions per text.

    The forward passes run under `torch.no_grad()` so no autograd graph is
    built while validating.

    Args:
        dl: iterable of batches holding `input_ids`, `attention_mask`,
            `labels`, `overflow_to_sample_mapping` and `wids` tensors.
        model: callable returning `(loss, logits)` when called with
            `return_dict=False`.

    Returns:
        `(final_predictions, val_loss, val_accuracy)`: one list of label
        strings per text (ordered by text index, one label per word), the
        mean batch loss and the mean batch accuracy over labelled tokens.
    """
    # These 2 dictionaries hold text-level data, accumulated through all the
    # chunks to merge them back into one prediction list per text.
    predictions = defaultdict(list)
    seen_words_idx = defaultdict(set)  # set: O(1) "already predicted?" lookups

    val_loss = 0.0
    val_accuracy = 0.0
    steps = 0

    with torch.no_grad():
        for batch in dl:
            ids = batch["input_ids"].to(config['device'])
            mask = batch["attention_mask"].to(config['device'])
            labels = batch['labels'].to(config['device'], dtype=torch.long)
            loss, logits = model(ids, attention_mask=mask, labels=labels, return_dict=False)

            del ids, mask

            val_loss += loss.item()
            steps += 1

            # Predicted label id of every token, shape (batch_size, seq_len)
            token_preds = torch.argmax(logits, dim=-1)

            # only compute accuracy at active labels (label != -100)
            active = labels != -100
            val_accuracy += accuracy_score(labels[active].cpu().numpy(),
                                           token_preds[active].cpu().numpy())

            batch_preds = token_preds.cpu().numpy()
            text_ids = batch['overflow_to_sample_mapping'].tolist()

            # Go over each chunk, getting the text_id reference
            for k, (chunk_preds, text_id) in enumerate(zip(batch_preds, text_ids)):

                # The word_ids are absolute references in the original text
                word_ids = batch['wids'][k].tolist()

                # Looking the text up here also registers texts whose chunks
                # contain no word at all, so every text gets an entry.
                text_preds = predictions[text_id]
                seen = seen_words_idx[text_id]

                for token_pred, word_idx in zip(chunk_preds, word_ids):
                    # Skip tokens without a word and words that already got a
                    # prediction (from an earlier token or an earlier chunk).
                    if word_idx == -1 or word_idx in seen:
                        continue
                    text_preds.append(IDS_TO_LABELS[token_pred])
                    seen.add(word_idx)

    # Avoid a ZeroDivisionError on an empty loader.
    val_loss = val_loss / max(steps, 1)
    val_accuracy = val_accuracy / max(steps, 1)

    final_predictions = [predictions[k] for k in sorted(predictions.keys())]
    return final_predictions, val_loss, val_accuracy


# https://www.kaggle.com/zzy990106/pytorch-ner-infer
# code has been modified from original
# I moved the iteration over the batches to inference because
# samples from the same text might have been split into different batches
def get_predictions(df, dl, model):
    """Turn per-word label predictions into competition-style span rows.

    Runs `inference(dl, model)` and, for every text, groups consecutive words
    sharing a class (a leading `B-` tag is treated as `I-`) into spans. Only
    spans of more than 7 words are kept.

    Args:
        df: DataFrame with an `id` column; row `i` corresponds to the
            `i`-th prediction list returned by `inference`.
        dl: passed through to `inference`.
        model: passed through to `inference`.

    Returns:
        `(df_pred, val_loss, val_accuracy)`, where `df_pred` has the columns
        `id`, `class`, `predictionstring`. It is an empty DataFrame with
        those columns when no span is kept.
    """
    all_labels, val_loss, val_accuracy = inference(dl, model)
    final_preds = []

    for i, idx in enumerate(df.id.values):
        pred = all_labels[i]
        j = 0

        while j < len(pred):
            cls = pred[j]
            # A span is opened by B-X and continued by I-X: look for I-X runs.
            if cls.startswith('B-'):
                cls = 'I-' + cls[2:]

            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1

            if cls not in ('O', '') and end - j > 7:
                final_preds.append((idx, cls.replace('I-', ''),
                                    ' '.join(map(str, range(j, end)))))
            j = end

    # Passing `columns` here (instead of assigning them afterwards) keeps this
    # working when `final_preds` is empty.
    df_pred = pd.DataFrame(final_preds, columns=['id', 'class', 'predictionstring'])
    return df_pred, val_loss, val_accuracy


def validate(model, df_all, df_val, dl_val, epoch, epoch_start):
    """Evaluate `model` on the validation data, print and log the metrics.

    The overall F1 is the mean of the per-class F1 scores over every class
    that appears in the predictions or in the validation ground truth, so a
    class the model never predicts scores 0 instead of being left out of the
    average.

    Args:
        model: model to evaluate; switched to eval mode here.
        df_all: DataFrame with at least `id`, `discourse_type` and
            `predictionstring`; filtered with the globals `IDS[valid_idx]`.
        df_val: validation texts, passed to `get_predictions`.
        dl_val: validation loader, passed to `get_predictions`.
        epoch: zero-based epoch number (used for the log prefix).
        epoch_start: `time.time()` value taken when the epoch started.
    """
    time_start = time.time()

    # Put model in eval mode
    model.eval()

    # Valid targets: needed because df_val has a subset of the columns
    df_valid = df_all.loc[df_all['id'].isin(IDS[valid_idx])]

    # OOF predictions
    oof, val_loss, val_accuracy = get_predictions(df_val, dl_val, model)

    # Score every class that was predicted OR is present in the ground truth.
    classes = sorted(set(oof['class'].unique()) | set(df_valid['discourse_type'].unique()))

    epoch_prefix = f"[Epoch {epoch+1:2d} / {config['epochs']:2d}]"

    # Compute F1-score per class
    f1s = []
    val_log = {}
    for c in classes:
        pred_df = oof.loc[oof['class'] == c].copy()
        gt_df = df_valid.loc[df_valid['discourse_type'] == c].copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        f1s.append(f1)
        val_log[f'F1 {c}'] = f1

    overall_f1 = np.mean(f1s) if f1s else 0.0

    elapsed = time.time() - time_start
    epoch_end = time.time() - epoch_start

    print(f"{epoch_prefix} Val. loss        : {val_loss:.4f}")
    print(f'{epoch_prefix} Val accuracy     : {val_accuracy:.4f}')
    print(f'{epoch_prefix} Validation F1    : {overall_f1:.4f}')
    print(f"{epoch_prefix} Validation time  : {elapsed/60:.2f} mins")
    print(f"{epoch_prefix} Epoch time       : {epoch_end/60:.2f} mins")
    print(epoch_prefix)

    val_log['Overall F1'] = overall_f1
    val_log['Val Loss'] = val_loss
    val_log['Val Accuracy'] = val_accuracy
    val_log['Validation Time'] = elapsed
    val_log['Epoch Time'] = epoch_end
    wandb.log(val_log)
    
    
# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
def train(model, optimizer, dl_train, epoch):
    """Train `model` for one epoch, log metrics to W&B and save a checkpoint.

    The learning rate of every parameter group is set to
    `config['learning_rates'][epoch]`. Gradients are clipped to
    `config['max_grad_norm']` *after* the backward pass and *before* the
    optimizer step, so the clipping applies to the gradients that are
    actually used for the update.

    Args:
        model: callable returning `(loss, logits)` when called with
            `return_dict=False`.
        optimizer: optimizer over `model`'s parameters.
        dl_train: iterable of batches holding `input_ids`,
            `attention_mask` and `labels` tensors.
        epoch: zero-based epoch number.

    Side effects:
        Writes `pytorch_model_e{epoch}.bin` to the working directory and
        calls `wandb.log` once per step and once at the end of the epoch.
    """
    time_start = time.time()

    # Set learning rate to the one in config for this epoch
    for g in optimizer.param_groups:
        g['lr'] = config['learning_rates'][epoch]
    lr = optimizer.param_groups[0]['lr']

    epoch_prefix = f"[Epoch {epoch+1:2d} / {config['epochs']:2d}]"
    print(f"{epoch_prefix} Starting epoch {epoch+1:2d} with LR = {lr}")

    # Put model in training mode
    model.train()

    # Accumulator variables
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    for idx, batch in enumerate(dl_train):

        ids = batch['input_ids'].to(config['device'], dtype=torch.long)
        mask = batch['attention_mask'].to(config['device'], dtype=torch.long)
        labels = batch['labels'].to(config['device'], dtype=torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels,
                                return_dict=False)
        tr_loss += loss.item()

        nb_tr_steps += 1
        loss_step = tr_loss / nb_tr_steps

        if idx % 200 == 0:
            print(f"{epoch_prefix}     Steps: {idx:4d} --> Loss: {loss_step:.4f}")

        # compute training accuracy, only at active labels (label != -100)
        token_preds = torch.argmax(tr_logits, dim=-1)  # (batch_size, seq_len)
        active = labels != -100
        tr_accuracy += accuracy_score(labels[active].cpu().numpy(),
                                      token_preds[active].cpu().numpy())

        wandb.log({'Train Loss (Step)': loss_step, 'Train Accuracy (Step)' : tr_accuracy / nb_tr_steps})

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must come after backward() (the gradients exist
        # only then) and before step() (which consumes them)
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=config['max_grad_norm']
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    torch.save(model.state_dict(), f'pytorch_model_e{epoch}.bin')
    torch.cuda.empty_cache()
    gc.collect()

    elapsed = time.time() - time_start

    print(epoch_prefix)
    print(f"{epoch_prefix} Training loss    : {epoch_loss:.4f}")
    print(f"{epoch_prefix} Training accuracy: {tr_accuracy:.4f}")
    print(f"{epoch_prefix} Training time    : {elapsed/60:.2f} mins")

    wandb.log({'Train Loss (Epoch)': epoch_loss, 'Train Accuracy (Epoch)' : tr_accuracy, 'Train Time' : elapsed})
    

def run():
    """Run the whole experiment: load data, split, tokenize, train and validate.

    Reads the competition CSVs, keeps the fractions of the train / validation
    document ids given by `config['train_data_size']` and
    `config['val_data_size']`, fine-tunes the token-classification model for
    `config['epochs']` epochs (validating after each one) and finally saves
    the weights to `pytorch_model.bin`.

    Side effects:
        Sets the module globals `LABELS_TO_IDS`, `IDS_TO_LABELS`, `IDS`,
        `valid_idx` and `train_idx`, which other functions in this file read.
    """
    time_start = time.time()
    global LABELS_TO_IDS, IDS_TO_LABELS, IDS, valid_idx, train_idx
    df_all = pd.read_csv('../input/feedback-prize-2021/train.csv')
    df_texts = pd.read_csv("../input/feedback-prize-train-ner-csv/train_NER.csv",
                           converters={'entities': ast.literal_eval, 'text_split': ast.literal_eval})

    # Create global dictionaries to use during training and inference
    # https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615
    output_labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim',
                     'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']

    LABELS_TO_IDS = {v: k for k, v in enumerate(output_labels)}
    IDS_TO_LABELS = {k: v for k, v in enumerate(output_labels)}

    # CHOOSE VALIDATION INDEXES
    IDS = df_all.id.unique()
    print(f'There are {len(IDS)} train texts. We will split 90% 10% for validation.')

    # TRAIN VALID SPLIT 90% 10% (fixed seed so the split is reproducible)
    np.random.seed(42)
    train_idx = np.random.choice(np.arange(len(IDS)), int(0.9*len(IDS)), replace=False)
    valid_idx = np.setdiff1d(np.arange(len(IDS)), train_idx)
    np.random.seed(None)

    # Keep only the configured fraction of each split to run epochs faster
    n_train_new = int(config['train_data_size'] * len(train_idx))
    n_val_new = int(config['val_data_size'] * len(valid_idx))

    train_idx = train_idx[:n_train_new]
    valid_idx = valid_idx[:n_val_new]

    # CREATE TRAIN SUBSET AND VALID SUBSET
    df_train = df_texts.loc[df_texts['id'].isin(IDS[train_idx])].reset_index(drop=True)
    df_val = df_texts.loc[df_texts['id'].isin(IDS[valid_idx])].reset_index(drop=True)

    print(f"FULL Dataset : {df_texts.shape}")
    print(f"Data size - Train: {config['train_data_size']} Val: {config['val_data_size']}")

    print(f"TRAIN Dataset: {df_train.shape}")
    print(f"TEST Dataset : {df_val.shape}")

    download_model()
    tokenizer = AutoTokenizer.from_pretrained('model')

    # Tokenize both training and validation dataframes
    tokenized_train = tokenize(df_train, tokenizer)
    tokenized_val = tokenize(df_val, tokenizer)

    # Create Datasets and DataLoaders for training and validation data
    ds_train = FeedbackPrizeDataset(tokenized_train)
    dl_train = DataLoader(ds_train, batch_size=config['train_batch_size'],
                          shuffle=True, num_workers=2, pin_memory=True)

    # Validation must not be shuffled: inference() regroups chunks in order.
    ds_val = FeedbackPrizeDataset(tokenized_val)
    dl_val = DataLoader(ds_val, batch_size=config['valid_batch_size'],
                        shuffle=False, num_workers=2, pin_memory=True)

    # Load from the directory written by download_model() rather than from a
    # hard-coded weight file name, so loading does not depend on which weight
    # file name save_pretrained() produced.
    config_model = AutoConfig.from_pretrained('model/config.json')
    model = AutoModelForTokenClassification.from_pretrained('model', config=config_model)
    model.to(config['device'])

    # Instantiate optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rates'][0])

    # Loop
    for epoch in range(config['epochs']):
        epoch_start = time.time()
        print()
        train(model, optimizer, dl_train, epoch)
        validate(model, df_all, df_val, dl_val, epoch, epoch_start)

    # Save first, then report, so the message is only printed once it is true.
    torch.save(model.state_dict(), 'pytorch_model.bin')

    elapsed = time.time() - time_start
    print(f"Final model saved as 'pytorch_model.bin' [{elapsed/60:.2f} mins]")
    
    
def start_wandb():
    """Log in to W&B with the Kaggle secret `wandb_api` and open the run.

    The run is created in `WANDB_PROJECT` under `WANDB_ENTITY`, named
    `RUN_NAME`, with `config` attached as its configuration.
    """
    # Imported here: the module is only needed by this function.
    from kaggle_secrets import UserSecretsClient

    wandb.login(key=UserSecretsClient().get_secret("wandb_api"))
    wandb.init(
        name=RUN_NAME,
        config=config,
        entity=WANDB_ENTITY,
        project=WANDB_PROJECT,
    )

In [None]:
# Entry point of the notebook: authenticate and open the W&B run,
# execute the full experiment, then close the W&B run.
start_wandb()
run()
wandb.finish()

# Please _DO_ upvote if you found this kernel useful or interesting! 🤗