# LMTrain-GPU

This notebook shows how we fine-tune authorial GPT-2 models on a GPU.

Install dependencies.

In [None]:
!pip install transformers datasets

Import packages.

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments,Trainer
from datasets import Dataset
import os,math,shutil

Set up variables here.

log_home: the directory containing two text files, done.txt and target.txt. To start, list the author tags in target.txt, one per line, and leave done.txt blank. After an authorial GPT-2 has been successfully fine-tuned and saved to disk, its tag is logged in done.txt, so it will be skipped when the notebook is restarted.

model_home: the directory for the fine-tuned model output.

data_path: the directory containing the dataset. It typically holds two CSV files, train.csv and test.csv; this notebook reads only train.csv.

author_tag_label: the name of the column that holds the author tag; in GEFA it is "author_tag". Change this only if you want to use this notebook with a dataset outside the GEFA series.

In [None]:
# Directory holding the task logs target.txt and done.txt.
log_home = "log"

# Directory that receives one sub-directory per fine-tuned model.
model_home = "model"

# Directory of the dataset; train.csv is read from here.
data_path = "corpus/GEFA-full"

# Name of the CSV column that stores the author tag.
author_tag_label = "author_tag"

Load working environment.

In [None]:
# Paths derived from the configuration cell.
train_data_path = os.path.join(data_path, "train.csv")
target_log = os.path.join(log_home, "target.txt")
done_log = os.path.join(log_home, "done.txt")

# Remove checkpoints left behind in the scratch directory by a previous run.
if os.path.isdir("gpt2-buffer"):
    shutil.rmtree("gpt2-buffer")

# Ask transformers not to report to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists (os.mkdir would raise in both cases).
os.makedirs(model_home, exist_ok=True)

Set up training tasks.

In [None]:
# Read the requested tasks; target.txt must exist.
with open(target_log, "r", encoding="utf-8") as f:
    target_content = f.read().strip()

# A missing done.txt simply means that nothing has been finished yet.
if os.path.isfile(done_log):
    with open(done_log, "r", encoding="utf-8") as f:
        done_content = f.read().strip()
else:
    done_content = ""

# Tasks are stored one per line. Splitting on line breaks (instead of on any
# whitespace) keeps entries that contain spaces intact; blank lines are ignored.
target_lines = [line.strip() for line in target_content.splitlines() if line.strip()]
done_lines = [line.strip() for line in done_content.splitlines() if line.strip()]

# Use a set for the membership test so filtering stays linear in the number of tasks.
done_set = set(done_lines)
to_do_lines = [target_line for target_line in target_lines if target_line not in done_set]

print("Fetched tasks:")
print("\n".join(to_do_lines))
print("******")

Load dataset.

In [None]:
total_dataset = Dataset.from_csv(train_data_path)

# Fail fast with a readable message if the CSV lacks a column the training
# loop relies on: the author-tag column used for filtering and the "text"
# column that gets tokenized.
missing_columns = [
    column
    for column in (author_tag_label, "text")
    if column not in total_dataset.column_names
]
if missing_columns:
    raise ValueError(
        f"{train_data_path} is missing required column(s): {missing_columns}"
    )

Run the script below to start training.

In [None]:
# Fixed seed so that each author's train/validation split is reproducible.
SPLIT_SEED = 42
# Scratch directory for Trainer checkpoints; wiped after every author.
BUFFER_DIR = "gpt2-buffer"

# The tokenizer and the collator do not depend on the author, so they are
# created once instead of once per task.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse EOS as the padding token so the collator is able to pad batches.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


def preprocess_function(examples):
    """Tokenize the ``text`` column of a batch of examples."""
    return tokenizer(examples["text"])


def group_texts(examples, block_size=128):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: mapping from feature name (e.g. ``input_ids``) to a list of
            per-example token lists, as passed by ``Dataset.map(batched=True)``.
        block_size: number of tokens per output block.

    Returns:
        A dict with the same keys as ``examples`` whose values are lists of
        blocks, plus a ``labels`` entry that is a copy of ``input_ids``. The
        trailing remainder that does not fill a block is dropped, unless the
        whole batch is shorter than ``block_size``, in which case it is kept
        as a single short block.
    """
    # Flatten with a comprehension: sum(list_of_lists, []) copies the growing
    # list on every step and is quadratic in the batch size.
    concatenated_examples = {
        key: [token for sequence in examples[key] for token in sequence]
        for key in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        key: [tokens[i : i + block_size] for i in range(0, total_length, block_size)]
        for key, tokens in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


for to_do_line in to_do_lines:
    # A task line may carry comma-separated fields; the author tag is the first.
    to_do_line_cells = to_do_line.split(",")
    author_tag = to_do_line_cells[0]

    print(f"Start processing:{author_tag}")

    # Keep only the rows written by the current author.
    target_train_set = total_dataset.filter(
        lambda daton: str(daton[author_tag_label]) == str(author_tag)
    )

    # train_test_split needs at least one row on each side; report the actual
    # cause instead of letting the split fail with an opaque error.
    if len(target_train_set) < 2:
        raise ValueError(
            f"Author tag {author_tag!r} matches {len(target_train_set)} row(s) in "
            f"{train_data_path}; at least 2 are required for a train/validation split."
        )

    dataset = target_train_set.train_test_split(
        test_size=0.2, shuffle=True, seed=SPLIT_SEED
    )

    # Tokenize, dropping the original columns so only token features remain.
    lm_dataset = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns=dataset["train"].column_names,
    )
    lm_dataset = lm_dataset.map(group_texts, batched=True, num_proc=8)

    # Every author starts from a fresh copy of the pretrained checkpoint.
    model_checkpoint = "gpt2"
    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=BUFFER_DIR,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        num_train_epochs=100,
        gradient_accumulation_steps=64,
        fp16=True,
        push_to_hub=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=lm_dataset["train"],
        eval_dataset=lm_dataset["test"],
        data_collator=data_collator,
    )

    trainer.train()
    output_dir = os.path.join(model_home, author_tag)
    trainer.save_model(output_dir)
    eval_results = trainer.evaluate()
    eval_str = f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}"
    print(eval_str)

    # The file is opened in append mode, so terminate the entry with a newline;
    # otherwise results of repeated runs end up glued together on one line.
    with open(os.path.join(output_dir, "eval.txt"), "a", encoding="utf-8") as f:
        f.write(eval_str + "\n")
    # Record the task as done only after the model and its evaluation are saved.
    with open(done_log, "a", encoding="utf-8") as f:
        f.write(to_do_line + "\n")

    # Drop the checkpoints of this author before moving on to the next one.
    if os.path.isdir(BUFFER_DIR):
        shutil.rmtree(BUFFER_DIR)

    print(f"Finished processing:{author_tag}")