In [2]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [63]:
#loading GPT-2 model
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import math
import torch
from datasets import load_dataset

model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

input_text = "What did Daemon Targaryen say to Rhaenyra about living life in fear?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids, max_length=70, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What did Daemon Targaryen say to Rhaenyra about living life in fear?

"I'm not afraid of death. I'm afraid of death. I'm afraid of death. I'm afraid of death. I'm afraid of death. I'm afraid of death. I'm afraid of death. I'm afraid of death


Model does not have a lot of context, puts attention on "fear" and associates it with death

In [64]:
#fine tuning GPT-2

#dataset https://huggingface.co/datasets/oidlabs/simpleQA
dataset = load_dataset("oidlabs/simpleQA")['train'].select(range(500))

def format_example(example):
    problem = example['problem'].strip()
    answer = example['answer'].strip()
    return {"text": f'{problem} {answer}'}

formatted_dataset = dataset.map(format_example, remove_columns=["metadata", "problem", "answer"])

for i in range(3):
    print(formatted_dataset[i]['text'])

Who received the IEEE Frank Rosenblatt Award in 2010? Michio Sugeno
Who was awarded the Oceanography Society's Jerlov Award in 2018? Annick Bricaud
What's the name of the women's liberal arts college in Cambridge, Massachusetts? Radcliffe College


In [65]:
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=128)

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)

'''def chunk_example(example, chunk_length=256):
    input_ids = example['input_ids']
    return {
        'input_ids': [input_ids[i:i + chunk_length] for i in range(0, len(input_ids), chunk_length)],
        'attention_mask': [example['attention_mask'][i:i + chunk_length] for i in range(0, len(example['attention_mask']), chunk_length)]
    }

chunked_dataset = tokenized_dataset.map(
    chunk_example,
    batched=True,
    remove_columns=["text"]
)

chunked_dataset = chunked_dataset.map(
    lambda x: {"input_ids": list(chain(*x["input_ids"])), "attention_mask": list(chain(*x["attention_mask"]))},
    batched=True
)'''

In [67]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=10,
    report_to="none",
    save_steps=50,
    save_total_limit=0,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
10,3.5805
20,3.5782
30,3.662
40,3.8493
50,4.068
60,4.0642
70,3.7515
80,3.7365
90,3.5989
100,3.6694


TrainOutput(global_step=124, training_loss=3.7345210506070043, metrics={'train_runtime': 2856.6564, 'train_samples_per_second': 0.35, 'train_steps_per_second': 0.043, 'total_flos': 64539131904000.0, 'train_loss': 3.7345210506070043, 'epoch': 1.976})

In [68]:
model.eval()

def compute_perplexity(example):
    input_ids = tokenizer(example["text"], return_tensors="pt", truncation=True, max_length=512).input_ids.to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        return {"perplexity": math.exp(loss.item())}

perplexity_scores = formatted_dataset.select(range(10)).map(compute_perplexity)
for i, example in enumerate(perplexity_scores):
    print(f"{i+1}. Perplexity: {example['perplexity']:.2f}")
    print(f"   Text: {example['text']}\n")

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

1. Perplexity: 54.45
   Text: Who received the IEEE Frank Rosenblatt Award in 2010? Michio Sugeno

2. Perplexity: 61.25
   Text: Who was awarded the Oceanography Society's Jerlov Award in 2018? Annick Bricaud

3. Perplexity: 9.31
   Text: What's the name of the women's liberal arts college in Cambridge, Massachusetts? Radcliffe College

4. Perplexity: 100.26
   Text: In whose honor was the Leipzig 1877 tournament organized? Adolf Anderssen

5. Perplexity: 63.95
   Text: According to Karl Küchler, what did Empress Elizabeth of Austria's favorite sculpture depict, which was made for her villa Achilleion at Corfu? Poet Henrich Heine.

6. Perplexity: 32.82
   Text: How much money, in euros, was the surgeon held responsible for Stella Obasanjo's death ordered to pay her son? 120,000

7. Perplexity: 8.76
   Text: What were the month and year when Obama told Christianity Today, "I am a Christian, and I am a devout Christian. I believe in the redemptive death and resurrection of Jesus Christ"?

In [74]:
input_text = "What did Daemon Targaryen say to Rhaenyra about living life in fear?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids, max_length=70, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What did Daemon Targaryen say to Rhaenyra about living life in fear? "I am afraid of death, and I am afraid of the day when I die."

The first episode of Season 5 of Game of Thrones was written by David Benioff and directed by J.R. R. Martin. The episode was


In [81]:
input_text = "What is the name of the kibbutz that Simon Le Bon lived on in 1978?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids, max_length=70, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the name of the kibbutz that Simon Le Bon lived on in 1978? Kibbutz Kibbutz.

The name of the kibbutz that Simon Le Bon lived on in 1978? Kibbutz Kibbutz.

The name of the kibbutz that Simon Le Bon


In [83]:
input_text = "Why did David Chow come to Genoa City on 'The Young and the Restless'?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids, max_length=70, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Why did David Chow come to Genoa City on 'The Young and the Restless'? He was born in Genoa City, Italy, on May 24, 1823. He was the son of the late Dr. David Chow, who was the first Italian physician to study the effects of alcohol on the brain. He was educated at the University of
