This baseline is based on the following notebooks:

by Sylvain Gugger: https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb

by DAREK KŁECZEK: https://www.kaggle.com/thedrcat/feedback-prize-huggingface-baseline-training/notebook

For simplicity, I don't compute an evaluation metric here; if you want to use one, see the notebooks above.

In [None]:
%%capture
!pip install --no-index --find-links ../input/python-package-amulil/datasets/datasets datasets

In [None]:
# basic utils
import gc
import psutil
import random
import numpy as np
import pandas as pd
import torch
import os
from collections import defaultdict
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold

# for transformer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric, Dataset
from transformers.utils.logging import set_verbosity, WARNING, INFO

# system config
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# helper function
def dataset_size(dataset):
    """Print ``dataset.dataset_size`` expressed in MB with two decimals.

    The attribute is divided by 1024**2 before printing, i.e. it is treated
    as a byte count. Nothing is returned.
    """
    units_per_mb = 1024 ** 2
    size_mb = dataset.dataset_size / units_per_mb
    print(f"Dataset size (cache file) : {size_mb:.2f} MB")
    
def set_seed(seed=42):
    """Seed the NumPy, Python and PyTorch RNGs and pin cuDNN settings.

    Args:
        seed: Integer seed handed to every generator (default 42).

    Besides seeding, this switches cuDNN to deterministic mode, disables its
    benchmark mode, and stores the seed in the ``PYTHONHASHSEED`` environment
    variable as a string.
    """
    # Same order as always: NumPy, stdlib random, torch CPU, torch CUDA.
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    os.environ['PYTHONHASHSEED'] = str(seed)

def get_raw_text(ids):
    """Return the full text of one training essay.

    Args:
        ids: Essay id; the file ``../input/feedback-prize-2021/train/{ids}.txt``
            is read.

    Returns:
        The file content as a single string.

    Raises:
        FileNotFoundError: If no file exists for ``ids``.
    """
    # Decode explicitly as UTF-8 so the text (and therefore every character
    # offset computed against it) does not depend on the platform locale.
    with open(f"../input/feedback-prize-2021/train/{ids}.txt", 'r', encoding="utf-8") as file:
        return file.read()

def get_test_text(ids):
    """Return the full text of one test essay.

    Args:
        ids: Essay id; the file ``../input/feedback-prize-2021/test/{ids}.txt``
            is read.

    Returns:
        The file content as a single string.

    Raises:
        FileNotFoundError: If no file exists for ``ids``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(f"../input/feedback-prize-2021/test/{ids}.txt", 'r', encoding="utf-8") as file:
        return file.read()

def tokenize_and_align_labels(examples):
    """Tokenize a batch of essays and attach one BIO label id per token.

    Args:
        examples: Batch mapping with parallel lists under ``"text"``,
            ``"starts"``, ``"ends"`` and ``"classlist"``. For each text the
            last three hold, per annotated span, its start offset, end offset
            and discourse type. The offsets are compared against the
            tokenizer's character offsets.

    Returns:
        The tokenizer output (truncated to ``CONFIG["max_length"]``, including
        ``offset_mapping``) with an extra ``"labels"`` entry: one list per
        text holding -100 for special tokens, ``l2i["O"]`` for unannotated
        tokens, and ``l2i["B-<type>"]`` / ``l2i["I-<type>"]`` for the first /
        following tokens lying entirely inside an annotated span.

    Raises:
        KeyError: If a discourse type has no ``B-``/``I-`` entry in ``l2i``.
    """
    o = CONFIG["tokenizer"](examples["text"], truncation=True, max_length=CONFIG["max_length"],
                            return_offsets_mapping=True)

    outside = l2i["O"]
    labels = []
    for k, token_offsets in enumerate(o["offset_mapping"]):
        # word_ids() is None for special tokens. tokens() yields strings for
        # every position, so testing its items against None masked nothing.
        word_ids = o.word_ids(k)
        label = [-100 if word_id is None else outside for word_id in word_ids]

        spans = zip(examples["starts"][k], examples["ends"][k], examples["classlist"][k])
        for a, b, t in spans:
            beginning = True
            for idx, (c, d) in enumerate(token_offsets):
                if word_ids[idx] is None:
                    # Special tokens report a placeholder offset (0, 0 for HF
                    # fast tokenizers) and must keep their -100 label, even
                    # for a span that starts at character 0.
                    continue
                if c >= b:
                    # Tokens come in text order, so nothing after this one
                    # can still start inside the span.
                    break
                if c >= a and d <= b:
                    # Only tokens fully contained in [a, b] are tagged.
                    label[idx] = l2i[f"B-{t}" if beginning else f"I-{t}"]
                    beginning = False
        labels.append(label)

    o["labels"] = labels

    return o

def tokenize_for_test(examples):
    """Tokenize a batch of texts for inference, keeping character offsets.

    Uses the tokenizer stored in ``CONFIG`` with truncation enabled. No
    explicit ``max_length`` is passed and no labels are produced.
    """
    tokenizer = CONFIG["tokenizer"]
    return tokenizer(examples['text'], truncation=True, return_offsets_mapping=True)

def prepare_loaders(fold):
    """Build the tokenized train and validation datasets for one fold.

    Rows of the module-level ``df`` whose ``kfold`` equals ``fold`` become the
    validation split; every other row goes to the training split. Both splits
    are converted to ``Dataset`` objects and passed through
    ``tokenize_and_align_labels`` in batches of 5000, dropping all original
    columns.

    Args:
        fold: Fold number to hold out for validation.

    Returns:
        Tuple ``(tokenized_train_datas, tokenized_valid_datas)``.
    """
    def _tokenized(frame):
        # A fresh 0..n-1 index keeps the old index out of the dataset columns.
        raw = Dataset.from_pandas(frame.reset_index(drop=True))
        return raw.map(
            tokenize_and_align_labels,
            batched=True,
            batch_size=5000,
            remove_columns=raw.column_names,
        )

    train_part = _tokenized(df[df.kfold != fold])
    valid_part = _tokenized(df[df.kfold == fold])
    return train_part, valid_part

In [None]:
# Seed every RNG with set_seed's default before anything else in this cell.
# NOTE(review): CONFIG["seed"] below is not what gets passed here — confirm
# which of the two values is the intended seed.
set_seed()

# Use the first GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    devices = torch.device("cuda:0")
else:
    devices = torch.device("cpu")

# Local copies of the pretrained checkpoints; CONFIG selects one by index.
checkpoint = [
    "../input/amulil-huggingface/allenai/longformer-base-4096",
    "../input/amulil-huggingface/google/bigbird-roberta-base",
]

# Run-wide settings read by the helper functions and the training loop.
CONFIG = dict(
    debug=True,
    max_length=1024,
    seed=666,
    batch_size=4,
    checkpoint=checkpoint[0],
    task="ner",
    n_fold=2,
    epochs=1,
)

# The tokenizer is loaded once from the selected checkpoint and shared.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["checkpoint"])

In [None]:
# Load the training annotations table.
all_datas = pd.read_csv('../input/feedback-prize-2021/train.csv')

# The label vocabulary is collected before any debug subsampling, so it
# covers every discourse type present in the file.
classes = all_datas["discourse_type"].unique().tolist()

# Debug mode keeps only 10 randomly sampled annotation rows.
if CONFIG["debug"]:
    all_datas = all_datas.sample(n=10).reset_index(drop=True)

classes

In [None]:
# Collapse the annotation rows into one row per essay id; each annotation
# column becomes a list holding that essay's values.
by_id = all_datas.groupby('id')
df1 = by_id['discourse_type'].apply(list).reset_index(name='classlist')
df2 = by_id['discourse_start'].apply(list).reset_index(name='starts')
df3 = by_id['discourse_end'].apply(list).reset_index(name='ends')
df4 = by_id['predictionstring'].apply(list).reset_index(name='predictionstrings')

# Join the four per-id frames on 'id', then attach the raw essay text.
df = df1
for extra in (df2, df3, df4):
    df = pd.merge(df, extra, how='inner', on='id')
df['text'] = df['id'].apply(get_raw_text)

print(len(df))

# Shuffled K-fold split over essays; each row is tagged with the fold in
# which it serves as validation data.
skf = KFold(n_splits=CONFIG["n_fold"], shuffle=True, random_state=42)
fold_splits = skf.split(X=df, y=df.classlist)

for fold, (_, val_) in enumerate(fold_splits):
    df.loc[val_, "kfold"] = int(fold)

# Cast the fold ids to integers once every row has been assigned.
df["kfold"] = df["kfold"].astype(int)
df.head()

In [None]:
# BIO tag vocabulary: id 0 is "O"; discourse type number i (in `classes`
# order) gets 2*i + 1 for its "B-" tag and 2*i + 2 for its "I-" tag.
tags = {'O': 0}
for i, c in enumerate(classes):
    tags[f'B-{c}'] = 2*i + 1
    tags[f'I-{c}'] = 2*i + 2

# label -> id and id -> label lookups, as plain dicts.
l2i = dict(tags)
i2l = {v: k for k, v in l2i.items()}

# i2l holds only real labels (the -100 ignore index is never stored in it),
# so the number of labels is simply its length.
N_LABELS = len(i2l)
l2i

In [None]:
# Fine-tune one token-classification model per cross-validation fold.
for fold in range(0, CONFIG['n_fold']):
    print(f"====== Fold: {fold} ======")

    tokenized_train_datas, tokenized_valid_datas = prepare_loaders(fold)
    data_collator = DataCollatorForTokenClassification(tokenizer=CONFIG['tokenizer'])

    # Keep transformers logging at WARNING while the checkpoint is loaded.
    set_verbosity(WARNING)
    model = AutoModelForTokenClassification.from_pretrained(
        CONFIG["checkpoint"],
        id2label=i2l,
        label2id=l2i,
    )

    # Output directory name: <checkpoint dir>-finetuned-fold<k>-<task>.
    model_name = CONFIG['checkpoint'].split("/")[-1]
    task = CONFIG['task']
    set_verbosity(WARNING)
    args = TrainingArguments(
        f"{model_name}-finetuned-fold{fold}-{task}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        # Take the epoch count from CONFIG instead of a hard-coded literal,
        # so changing CONFIG["epochs"] actually takes effect.
        num_train_epochs=CONFIG["epochs"],
        per_device_train_batch_size=CONFIG["batch_size"],
        per_device_eval_batch_size=CONFIG["batch_size"],
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_train_datas,
        eval_dataset=tokenized_valid_datas,
        data_collator=data_collator,
        tokenizer=CONFIG["tokenizer"],
    )
    # Re-applied after building the Trainer — presumably because Trainer
    # setup can change the transformers log level; TODO confirm.
    set_verbosity(WARNING)
    trainer.train()

    # Drop this fold's large objects before the next fold allocates its own.
    del model, tokenized_train_datas, tokenized_valid_datas, trainer, args
    _ = gc.collect()