### This is a sample code snippet for fine-tuning a Longformer model.
### https://huggingface.co/docs/transformers/en/model_doc/longformer
### https://huggingface.co/allenai/longformer-base-4096

In [None]:
%%writefile fine_tune_LF.py


# Lib versions
# transformers_version='4.6'
# pytorch_version='1.6'
# py_version='py36'


import pandas as pd
import os
import torch
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import Trainer
from tqdm import tqdm
from datasets import Dataset
import warnings
import pickle as plk


# Load the data
# Provide the path to your data. Here, we assume the data is a pickled list of dicts, as shown in the example below
# [{'text': 'Large Transformer models routinely achieve state-of-the-art results on a number of tasks ...',
#   'id': 0}, 
#  {'text': 'The resulting model, the Reformer, performs on par with Transformer models while being ...',
#   'id': 1},]
data_location = "YOUR_DATA_PATH"
# NOTE: unpickling can execute arbitrary code, so only point this at a
# file you created yourself or otherwise trust.
with open(data_location, mode="rb") as pickle_file:
    all_texts = plk.load(pickle_file)
# Records -> DataFrame -> Hugging Face Dataset.
df_train = pd.DataFrame(data=all_texts)
dataset = Dataset.from_pandas(df=df_train)


# Load the model and tokenizer
MODEL_CKPT = 'allenai/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Load the masked-LM checkpoint, then move it to the selected device.
model = AutoModelForMaskedLM.from_pretrained(MODEL_CKPT)
model = model.to(device)


# Prepare the dataset
def tokenize_function(batched_data):
    """Tokenize a batch of examples into fixed-length model inputs.

    Uses the module-level ``tokenizer``.

    Args:
        batched_data: A batch as supplied by ``Dataset.map(..., batched=True)``;
            only its ``'text'`` entry is read.

    Returns:
        The tokenizer output, with attention masks requested and token type
        ids suppressed. Each sequence is truncated or padded to 4096 tokens.
    """
    # ``padding='max_length'`` already pads every sequence up to
    # ``max_length``; the deprecated ``pad_to_max_length=True`` flag was
    # redundant with it and has been dropped.
    return tokenizer(
        batched_data['text'],
        padding='max_length',
        truncation=True,
        max_length=4096,
        return_token_type_ids=False,
        return_attention_mask=True,
    )

# Collator used by the trainer to build masked-LM batches (mlm_probability=0.15).
data_collator = DataCollatorForLanguageModeling(mlm_probability=0.15, tokenizer=tokenizer)

# Tokenize in batches and drop the raw 'text' and 'id' columns.
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=['text', 'id'],
    batched=True,
)
# Expose only the model inputs, as torch tensors.
tokenized_datasets.set_format(columns=["input_ids", "attention_mask"], type="torch")

# Split into train and test sets, holding out 10% for test.
dataset_split = tokenized_datasets.train_test_split(test_size=0.1)


# Define the training args
batch_size = 1
# Number of batches whose gradients are accumulated before each optimizer update.
gradient_accumulation_steps = 4
# ``logging_steps`` is measured in optimizer updates, not batches. The
# accumulation factor must be included, otherwise the interval is larger than
# the number of updates in the single epoch and the training loss is never
# logged. ``max(1, ...)`` keeps the interval valid for very small datasets.
# NOTE(review): this assumes a single device; confirm for multi-GPU runs.
logging_steps = max(
    1,
    len(dataset_split["train"]) // (batch_size * gradient_accumulation_steps),
)
model_name = MODEL_CKPT.split("/")[-1]  # last path component of the checkpoint id
training_args = TrainingArguments(
    output_dir="YOUR_OUTPUT_PATH",
    overwrite_output_dir=True,
    num_train_epochs=1,
    evaluation_strategy="epoch",  # evaluate at the end of each epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    # fp16=True,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_steps=logging_steps,
    warmup_steps=0,
)


# Create the trainer
# Wire together the model, its hyper-parameters, the data splits, and the
# collator that builds the masked-LM batches.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=dataset_split["test"],
    train_dataset=dataset_split["train"],
)


# Train
trainer.train()


# Evaluate the model on the test split and report the metrics; previously
# the returned dict was computed and then discarded.
eval_result = trainer.evaluate(eval_dataset=dataset_split["test"])
print(f"Evaluation results: {eval_result}")


# Save the model
# Placeholder: replace with the directory the fine-tuned model should go to.
save_path = "YOUR_SAVE_PATH"
trainer.save_model(save_path)