In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Step 1: Load the dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [34]:
dataset.keys()

dict_keys(['test', 'train', 'validation'])

In [35]:
# Access the train dataset
train_dataset = dataset['train']
    
# Shuffle the train dataset and display 5 random rows
shuffled_train = train_dataset.shuffle(seed=42)

# Show random rows
for i in range(5):
    print(shuffled_train[i]['text'])

 Continuous , short @-@ arc , high pressure xenon arc lamps have a color temperature closely approximating noon sunlight and are used in solar simulators . That is , the chromaticity of these lamps closely approximates a heated black body radiator that has a temperature close to that observed from the Sun . After they were first introduced during the 1940s , these lamps began replacing the shorter @-@ lived carbon arc lamps in movie projectors . They are employed in typical 35mm , IMAX and the new digital projectors film projection systems , automotive HID headlights , high @-@ end " tactical " flashlights and other specialized uses . These arc lamps are an excellent source of short wavelength ultraviolet radiation and they have intense emissions in the near infrared , which is used in some night vision systems . 

 Field Marshal Antonio José de Sucre is portrayed as an intimate friend of the General . The historical Antonio José de Sucre , the Field Marshal of Ayacucho , had been the 

In [36]:
# Step 2: Load pre-trained GPT-2 tokenizer and model
model_name = "gpt2"  # You can change this to another model if needed (like GPT-Neo)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [37]:
# Test generation before fine-tuning
def generate_text_before(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs["input_ids"], max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [38]:
prompt = "Once upon a time a boy"
print("Before fine-tuning:")
print(generate_text_before(prompt))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Before fine-tuning:
Once upon a time a boy was born, he was called a boy. He was called a boy because he was born with a boy's head. He was called a boy because he was born with a boy's head. He was called a boy


In [39]:
# Step 3: Tokenize the text
def tokenize_function(examples):
    return tokenizer(examples['text'], return_special_tokens_mask=True)

In [40]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

In [41]:
# Step 4: Group the text into chunks and create labels
block_size = 128
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
              for k, t in concatenated_examples.items()}
    
    # Add labels which are the same as input_ids
    result["labels"] = result["input_ids"].copy()
    return result

In [42]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

In [43]:
# Step 5: Training arguments for fine-tuning
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=7,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Enables mixed precision if GPU supports it
    save_steps=500
)



In [44]:
# Step 6: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [45]:
# Step 7: Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.4614,3.417959
2,3.2702,3.417857
3,3.0814,3.442904
4,2.9669,3.471915
5,2.8423,3.499979
6,2.6502,3.530347
7,2.611,3.552998


TrainOutput(global_step=32669, training_loss=2.941487056740702, metrics={'train_runtime': 2973.0793, 'train_samples_per_second': 43.951, 'train_steps_per_second': 10.988, 'total_flos': 8535692132352000.0, 'train_loss': 2.941487056740702, 'epoch': 7.0})

In [46]:
# Save the fine-tuned model
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json',
 './gpt2-finetuned/tokenizer.json')

In [5]:
# Load the fine-tuned GPT-2 model and tokenizer
fine_tuned_model_path = "./gpt2-finetuned"  # Path to your fine-tuned model

tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_path)

In [13]:
# Test generation after fine-tuning
def generate_text_after(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs["input_ids"], max_length=max_length, num_return_sequences=1, attention_mask=inputs["attention_mask"], eos_token_id=None)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [11]:
print("After fine-tuning:")
prompt = "Once upon a time a boy"
print(generate_text_after(prompt))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


After fine-tuning:
Once upon a time a boy named John, who had been sent to fetch the boy, was abducted by the Templars. John was taken to the Templars'hideout in the mountains, where he was tortured and executed. 
 = =
