# Finetuning an LLM to Generate Book Plots

### Initial Setup

In [4]:
model_name = "distilgpt2"
device = "cuda"
dataset_path = "./goodreads_top100.csv"

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### Import a Dataset

In [5]:
import pandas as pd

df = pd.read_csv(dataset_path)
df

Unnamed: 0.1,Unnamed: 0,isbn,title,series_title,series_release_number,authors,publisher,language,description,num_pages,format,genres,publication_date,rating_score,num_ratings,num_reviews,current_readers,want_to_read,price,url
0,0,9780689830594,Summer Story,Brambly Hedge,2,Jill Barklem,Atheneum,English,It was such a hot summer. The sky was deep blu...,32,Hardcover,"['Picture Books', 'Childrens', 'Fiction', 'Ani...","January 1, 1980",4.45,1017.0,74.0,7.0,512.0,3.49,https://www.goodreads.com/book/show/421572.Sum...
1,1,9780375704970,The Lake of Darkness,,,Ruth Rendell,Vintage Crime/Black Lizard,English,Martin Urban is a quiet bachelor with a comfor...,210,Paperback,"['Mystery', 'Fiction', 'Crime', 'Thriller', 'B...","January 1, 1980",3.76,1388.0,114.0,77.0,623.0,4.99,https://www.goodreads.com/book/show/83394.The_...
2,2,9780345446671,Beyond the Blue Event Horizon,Heechee Saga,2,Frederik Pohl,Ballantine Books,English,"In Book Two of the Heechee Saga, Robinette Bro...",336,Paperback,"['Science Fiction', 'Fiction', 'Space Opera', ...","January 1, 1980",3.95,13307.0,339.0,181.0,3961.0,11.99,https://www.goodreads.com/book/show/373399.Bey...
3,3,9780446403016,St. Peter's Fair,Chronicles of Brother Cadfael,4,Ellis Peters,Mysterious Press,English,A pause in the civil war offers Shrewsbury's t...,217,Mass Market Paperback,"['Mystery', 'Historical Fiction', 'Fiction', '...","May 1, 1981",4.12,10493.0,593.0,1298.0,2502.0,0.00,https://www.goodreads.com/book/show/751755.St_...
4,4,9780425198773,Twice Shy,,,Dick Francis,G.P. Putnam's Sons,English,A computerized horse-betting system falls into...,304,Mass Market Paperback,"['Mystery', 'Fiction', 'Thriller', 'Crime', 'S...","January 1, 1981",3.92,4188.0,174.0,162.0,642.0,8.99,https://www.goodreads.com/book/show/103250.Twi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4395,4395,9781451648218,Murder Your Employer: The McMasters Guide to H...,,,Rupert Holmes,Avid Reader Press / Simon & Schuster,English,A devilish thriller with a killer concept: The...,389,Hardcover,"['Mystery', 'Fiction', 'Thriller', 'Mystery Th...","February 21, 2023",3.89,20992.0,3479.0,4033.0,84900.0,12.99,https://www.goodreads.com/book/show/61272658-m...
4396,4396,9781250826978,System Collapse,The Murderbot Diaries,7,Martha Wells,Tor Publishing Group/Tordotcom,English,Everyone's favorite lethal SecUnit is back.\r\...,245,Hardcover,"['Science Fiction', 'Fiction', 'Audiobook', 'A...","November 13, 2023",4.24,26566.0,3479.0,3609.0,36600.0,11.99,https://www.goodreads.com/book/show/65211701-s...
4397,4397,9781496737298,"Georgie, All Along",,,Kate Clayborn,Kensington,English,A wise and witty new novel that echoes with ti...,340,Paperback,"['Romance', 'Fiction', 'Contemporary', 'Contem...","January 24, 2023",3.82,48031.0,6590.0,3180.0,76300.0,0.00,https://www.goodreads.com/book/show/60604190-g...
4398,4398,9780525619994,Sword Catcher,Sword Catcher,1,Cassandra Clare,Del Rey Books,English,"In the vibrant city-state of Castellane, the r...",624,Hardcover,"['Fantasy', 'Romance', 'Adult', 'Young Adult',...","October 10, 2023",3.84,14991.0,3523.0,4485.0,149000.0,14.99,https://www.goodreads.com/book/show/36679274-s...


In [6]:
df = df[:1000]
df

Unnamed: 0.1,Unnamed: 0,isbn,title,series_title,series_release_number,authors,publisher,language,description,num_pages,format,genres,publication_date,rating_score,num_ratings,num_reviews,current_readers,want_to_read,price,url
0,0,9780689830594,Summer Story,Brambly Hedge,2,Jill Barklem,Atheneum,English,It was such a hot summer. The sky was deep blu...,32,Hardcover,"['Picture Books', 'Childrens', 'Fiction', 'Ani...","January 1, 1980",4.45,1017.0,74.0,7.0,512.0,3.49,https://www.goodreads.com/book/show/421572.Sum...
1,1,9780375704970,The Lake of Darkness,,,Ruth Rendell,Vintage Crime/Black Lizard,English,Martin Urban is a quiet bachelor with a comfor...,210,Paperback,"['Mystery', 'Fiction', 'Crime', 'Thriller', 'B...","January 1, 1980",3.76,1388.0,114.0,77.0,623.0,4.99,https://www.goodreads.com/book/show/83394.The_...
2,2,9780345446671,Beyond the Blue Event Horizon,Heechee Saga,2,Frederik Pohl,Ballantine Books,English,"In Book Two of the Heechee Saga, Robinette Bro...",336,Paperback,"['Science Fiction', 'Fiction', 'Space Opera', ...","January 1, 1980",3.95,13307.0,339.0,181.0,3961.0,11.99,https://www.goodreads.com/book/show/373399.Bey...
3,3,9780446403016,St. Peter's Fair,Chronicles of Brother Cadfael,4,Ellis Peters,Mysterious Press,English,A pause in the civil war offers Shrewsbury's t...,217,Mass Market Paperback,"['Mystery', 'Historical Fiction', 'Fiction', '...","May 1, 1981",4.12,10493.0,593.0,1298.0,2502.0,0.00,https://www.goodreads.com/book/show/751755.St_...
4,4,9780425198773,Twice Shy,,,Dick Francis,G.P. Putnam's Sons,English,A computerized horse-betting system falls into...,304,Mass Market Paperback,"['Mystery', 'Fiction', 'Thriller', 'Crime', 'S...","January 1, 1981",3.92,4188.0,174.0,162.0,642.0,8.99,https://www.goodreads.com/book/show/103250.Twi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,9783857911538,Schweiz Ohne Armee?: Ein Palaver,,,Max Frisch,Limmat Verlag,German,German,93,Hardcover,"['Nonfiction', 'Politics', 'Plays']","January 1, 1989",3.68,68.0,4.0,,33.0,,https://www.goodreads.com/book/show/6325183-sc...
996,996,9781933397269,Death Of A Hollow Man,Chief Inspector Barnaby,2,Caroline Graham,Felony & Mayhem,English,"Actors do love their dramas, and the members o...",306,Paperback,"['Mystery', 'Fiction', 'Crime', 'Mystery Thril...","January 1, 1989",3.87,3635.0,306.0,630.0,1759.0,1.99,https://www.goodreads.com/book/show/454571.Dea...
997,997,9780099742005,Total Recall,,,Piers Anthony,Arrow Books,English,"Implanted with a synthetic memory, a man begin...",224,Paperback,"['Science Fiction', 'Fiction', 'Fantasy', 'Sci...","January 1, 1989",3.93,8684.0,106.0,22.0,2618.0,,https://www.goodreads.com/book/show/15512.Tota...
998,998,9780912333397,Around the World in 80 Days: Companion to the ...,,,Michael Palin,Bay Books,English,The British actor and humorist describes his e...,262,Paperback,"['Travel', 'Nonfiction', 'Audiobook', 'Adventu...","February 1, 1990",4.03,7326.0,262.0,126.0,3859.0,5.99,https://www.goodreads.com/book/show/221304.Aro...


### Tokenize the Dataset

In [7]:
from datasets import Dataset

col_name = "description"

def map_dataset(batch):
    texts = [str(x) if x is not None else "" for x in batch[col_name]]
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )


dataset = Dataset.from_pandas(df)
dataset = dataset.map(
    map_dataset,
    batched=True,
    batch_size=8,
    remove_columns=list(df.columns)
)
dataset = dataset.remove_columns(["overflow_to_sample_mapping"])
dataset = dataset.train_test_split(test_size=0.2)
dataset

Map: 100%|██████████| 1000/1000 [00:00<00:00, 2599.12 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1414
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 354
    })
})

In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
), mlm=False, mlm_probability=0.15, mask_replace_prob=0.8, random_replace_prob=0.1, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt', seed=None)

### Training the Model
(the output directory is not included in the repo as the files are quite large)

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./output/model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
)

trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_args,
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,4.151721
2,No log,4.045654
3,4.141300,4.027944
4,4.141300,4.023036
5,4.141300,4.029499
6,3.809600,4.034629
7,3.809600,4.036207
8,3.809600,4.039639
9,3.686300,4.042989
10,3.686300,4.043684


TrainOutput(global_step=1770, training_loss=3.844014933957892, metrics={'train_runtime': 251.1512, 'train_samples_per_second': 56.301, 'train_steps_per_second': 7.048, 'total_flos': 461419440832512.0, 'train_loss': 3.844014933957892, 'epoch': 10.0})

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
finetuned_model_path = "./output/model/checkpoint-1750"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(finetuned_model_path)

prompt = "A thrilling mystery novel about"
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95
)

output_string = tokenizer.batch_decode(outputs)

output_string

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['A thrilling mystery novel about a young girl and her obsession with adventure, suspense, and violence; one of two masterpieces in the world of comic book fiction; and one of the most successful and influential comics ever. It is a novel of the mind at large. And it has earned the Academy Award.\nThe author of The Man in the White Stripes, Raymond K. Heinlein, is the author of "Universe #1 and #2." In addition to his work as the writer of "The Man in']