In [None]:
from datasets import load_dataset, DatasetDict

## Parameters & Setup

In [None]:
# Source data and base checkpoint, both given as Hugging Face Hub identifiers.
dataset_path = 'omgbobbyg/spock'
pretrained_model = "distilbert/distilgpt2"

# Name of the fine-tuned model and the "<user>/<model>" Hub repository id
# assembled from it (passed later as hub_model_id).
huggingface_username = "omgbobbyg"
finetuned_modelname = "distilgpt2-spock"
huggingface_reponame = f"{huggingface_username}/{finetuned_modelname}"

In [None]:
# Set up parameters describing GPU availability on this machine and reclaim
# memory left over from earlier work in the same kernel.
import torch
import gc

is_gpu_available = torch.cuda.is_available()
device = 'cuda' if is_gpu_available else 'cpu'

# Run the Python garbage collector first: tensors that are only kept alive by
# unreachable objects must be released before emptying the CUDA cache can
# hand their memory back. The return value is discarded so the cell does not
# echo the collected-object count.
gc.collect()

if is_gpu_available:
    print("GPU available for notebook")
    torch.cuda.empty_cache()
    print("GPU Memory cleaned")
else:
    print("No GPU available for notebook")


In [None]:
# Set up the notebook logger: INFO level, written to training.log.
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the file handler only once; re-running this cell would otherwise add
# a second handler and duplicate every log line.
if not logger.handlers:
    # Generated text is logged later and may contain non-ASCII characters, so
    # pin the file encoding instead of relying on the platform default.
    handler = logging.FileHandler('training.log', encoding='utf-8')
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)



## Dataset

In [None]:
# Load the dataset identified by dataset_path and print its summary.
dataset = load_dataset(path=dataset_path)
print(dataset)

## Tokenization

In [None]:
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    AutoConfig,
    AutoModelForCausalLM,
)

# Load the pretrained causal language model that the rest of the notebook uses.
model = AutoModelForCausalLM.from_pretrained(pretrained_model)

# Read the maximum context size from the model configuration; it is reused
# below as the tokenizer's max_length.
model_config = model.config
max_length = model_config.max_position_embeddings
print(f"Maximum context size: {max_length}")



In [None]:
from transformers import GPT2Tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

def tokenize_function(examples):
    """Tokenize the "dialogue" column of a batch of examples.

    Args:
        examples: A batch from the dataset (the map call below uses
            batched=True) containing a "dialogue" entry.

    Returns:
        The tokenizer output for the batch, with each sequence truncated to
        at most ``max_length`` tokens.
    """
    # Request truncation explicitly: passing max_length on its own relies on
    # the tokenizer's warn-and-fall-back behaviour rather than a stated intent.
    return tokenizer(examples["dialogue"], truncation=True, max_length=max_length)


# Apply the tokenization function to the entire dataset, dropping the
# original columns so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=10,
    remove_columns=dataset["train"].column_names
)




In [None]:
# Encode a short phrase to inspect the token ids the tokenizer produces.
sample_text = "Live long and prosper."
sample_token = tokenizer.encode(sample_text)
print(sample_token)

## Data Collator

In [None]:
# A data collator assembles examples into batches; DataCollatorForLanguageModeling
# with mlm=False is the causal (non-masked) language-modeling variant.
from transformers import DataCollatorForLanguageModeling

# "<pad>" is not a token in the pretrained GPT-2 vocabulary, so it has no
# embedding of its own. Reuse the tokenizer's existing end-of-sequence token
# as the padding token instead, which needs no vocabulary or model changes.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Smoke-test the collator on the first training example and print the shape
# of every tensor it returns.
out = data_collator([tokenized_dataset["train"][i] for i in range(1)])
for key in out:
    print(f"{key} shape: {out[key].shape}")



## Setup the Trainer

In [None]:
# Configure the Trainer API: hyperparameters first, then the Trainer itself.
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir=finetuned_modelname,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Mixed precision is only requested when a GPU was detected.
    fp16=is_gpu_available,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Publish to the Hugging Face Hub repository configured earlier.
    push_to_hub=True,
    hub_model_id=huggingface_reponame,
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

## Evaluate the Performance of the Base Model

In [None]:
import math

# Evaluate before any training so the numbers describe the base model.
initial_results = trainer.evaluate()
print(initial_results)

# Perplexity is exp(eval_loss); send the same summary line to the log file
# and to the cell output.
baseline_summary = f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}"
logger.info(baseline_summary)
print(baseline_summary)


In [None]:
# Prompts fed to the model both before and after fine-tuning.
test_prompt, test_prompt2, test_prompt3 = (
    "What is the meaning of life?",
    "Where did that planet go??",
    "What is the best way to cook a turkey?",
)

In [None]:
# Wrap the model and tokenizer in a text-generation pipeline, then generate a
# baseline completion for the first prompt.
from transformers import pipeline

text_generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

result = text_generator(
    test_prompt,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
)
baseline_message = f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}"
print(baseline_message)
logger.info(baseline_message)

In [None]:
# Baseline completion for the second prompt, printed and logged.
result = text_generator(
    test_prompt2,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
)
baseline_message = f"Baseline {pretrained_model} generated result: {test_prompt2}...{result[0]['generated_text']}"
print(baseline_message)
logger.info(baseline_message)

In [None]:
# Baseline completion for the third prompt, printed and logged.
result = text_generator(
    test_prompt3,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
)
baseline_message = f"Baseline {pretrained_model} generated result: {test_prompt3}...{result[0]['generated_text']}"
print(baseline_message)
logger.info(baseline_message)

## Fine-Tune the Model

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output

## Evaluate the Performance of the Fine-Tuned Model

In [None]:
# Evaluate again after training and derive perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
eval_results.update({'perplexity': perplexity})

# Same summary line goes to the log file and to the cell output.
finetuned_summary = f"Fine-tuned {finetuned_modelname} Results: Perplexity: {perplexity:.2f}"
logger.info(finetuned_summary)
print(finetuned_summary)



In [None]:
#Prompt Test 1
# Rebuild the pipeline around the trained model, then generate with the same
# settings as the baseline run of this prompt (temperature=1). The earlier
# value of 0.7 differed from every other generation call, so any change in
# the output could not be attributed to fine-tuning alone.
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
result = text_generator(test_prompt, max_length=100, num_return_sequences=1, temperature=1)
print(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}")
logger.info(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}")

In [None]:
# Prompt test 2: completion from the fine-tuned model, printed and logged.
result = text_generator(
    test_prompt2,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
)
finetuned_message = f"Fine-tuned {finetuned_modelname} generated result: {test_prompt2}...{result[0]['generated_text']}"
print(finetuned_message)
logger.info(finetuned_message)

In [None]:
# Prompt test 3: completion from the fine-tuned model, printed and logged.
result = text_generator(
    test_prompt3,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
)
finetuned_message = f"Fine-tuned {finetuned_modelname} generated result: {test_prompt3}...{result[0]['generated_text']}"
print(finetuned_message)
logger.info(finetuned_message)