In [1]:
from datasets import load_dataset, DatasetDict

  ## Parameters & Setup

In [2]:
# Set up the notebook logger: INFO-and-above records are written to training.log.
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the file handler only once. Re-running this cell returns the same
# logger object, and a second handler would write every record twice.
if not logger.handlers:
    # Pin the file encoding to UTF-8: this notebook logs model-generated text,
    # which can contain characters the platform default encoding cannot store.
    handler = logging.FileHandler('training.log', encoding='utf-8')
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)



In [3]:
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModelForCausalLM

# Dataset repository id passed to load_dataset() later in the notebook.
dataset_path = 'omgbobbyg/spock'

# ---------------------------------------------------------------------------
# Candidate base checkpoints, with the author's YES/NO notes per Auto class.
# Exactly one `pretrained_model` assignment is left uncommented.
# ---------------------------------------------------------------------------

# mobilebert -- https://huggingface.co/google/mobilebert-uncased
#   AutoModelForCausalLM:    NO
#   AutoModelForPreTraining: YES
# pretrained_model = "google/mobilebert-uncased"

# tinybert -- https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D
#   AutoModelForCausalLM:    YES
#   AutoModelForPreTraining: YES
# pretrained_model = "huawei-noah/TinyBERT_General_4L_312D"

# gemma-7b (RESTRICTED) -- https://huggingface.co/google/gemma-7b
#   AutoModelForCausalLM:    NO
#   AutoModelForPreTraining: NO
# pretrained_model = "google/gemma-7b"

# zephyr-7b-beta -- https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
#   AutoModelForCausalLM:    YES
#   AutoModelForPreTraining: NO
pretrained_model = "HuggingFaceH4/zephyr-7b-beta"

# TinyLlama -- https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0
#   AutoModelForCausalLM:    YES
#   AutoModelForPreTraining: ?
# pretrained_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Name of the fine-tuned model and the "<user>/<model>" repository id built from it.
finetuned_modelname = "zephyr-spock"
huggingface_username = "PrimeLens"
huggingface_reponame = "/".join((huggingface_username, finetuned_modelname))

# Probe: try to load the selected checkpoint and report whether it worked.
# Any exception is reported rather than raised, so `model` stays undefined
# when the load fails.
try:
    model = AutoModelForCausalLM.from_pretrained(pretrained_model)
    # model = AutoModelForPreTraining.from_pretrained(pretrained_model)
    print(f"The model {pretrained_model} is available in the Hugging Face model hub.")
except Exception as load_error:
    print(f"The model {pretrained_model} is not available in the Hugging Face model hub.")
    print(f"Error message: {load_error!s}")
    

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

The model HuggingFaceH4/zephyr-7b-beta is available in the Hugging Face model hub.


In [4]:
# Set up parameters describing GPU availability on this machine and recycle
# memory that is no longer in use.
import gc
import torch

is_gpu_available = torch.cuda.is_available()
if is_gpu_available:
    device = 'cuda'
    print("GPU available for notebook")
    torch.cuda.empty_cache()
    print("GPU Memory cleaned")
else:
    device = 'cpu'
    print("No GPU available for notebook")

# Last expression of the cell, so the value returned by gc.collect() is displayed.
gc.collect()


No GPU available for notebook


109

## Dataset

In [5]:
# Load the dataset identified by `dataset_path`, then print its summary.
dataset = load_dataset(path=dataset_path)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'original_airdate', 'production_number', 'dialogue'],
        num_rows: 3476
    })
    validation: Dataset({
        features: ['title', 'original_airdate', 'production_number', 'dialogue'],
        num_rows: 869
    })
})


  ## Tokenization

In [None]:

# Load the tokenizer, make sure the causal-LM model is in memory, and read the
# model's maximum context size (used later as the tokenization length limit).
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForPreTraining, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Reuse the model when this exact checkpoint is already loaded in the notebook.
# Loading it again repeats the full checkpoint load and keeps two copies of the
# weights alive until the name `model` is rebound.
# To force a fresh copy of the pretrained weights (for example after training),
# run `del model` before this cell.
_loaded_model = globals().get("model")
_loaded_name = getattr(getattr(_loaded_model, "config", None), "name_or_path", None)
if _loaded_name != pretrained_model:
    # Nothing is loaded yet, or a different checkpoint is in memory: load it now.
    model = AutoModelForCausalLM.from_pretrained(pretrained_model)

# Get the maximum context size
max_length = model.config.max_position_embeddings
print(f"Maximum context size: {max_length}")



tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
from transformers import GPT2Tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

def tokenize_function(examples):
    """Tokenize the "dialogue" column of a batch of dataset rows.

    Args:
        examples: A batch handed over by ``dataset.map(..., batched=True)``;
            only its ``"dialogue"`` entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence limited to
        ``max_length`` tokens.
    """
    # Request truncation explicitly. Passing max_length on its own relies on
    # the tokenizer's implicit fallback and triggers a warning.
    return tokenizer(examples["dialogue"], truncation=True, max_length=max_length)


# Apply the tokenization function to the entire dataset. The original text
# columns are dropped so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=10,
    remove_columns=dataset["train"].column_names
)




In [None]:
# Sanity check: encode a short sentence and show the resulting token ids.
sample_text = "Live long and prosper."
sample_token = tokenizer.encode(sample_text)
print(sample_token)

## Data Collator

In [None]:
# A data collator is needed to assemble batches. DataCollatorForLanguageModeling
# is used with mlm=False, i.e. with masked-language-modelling switched off.
from transformers import DataCollatorForLanguageModeling

# NOTE(review): "<pad>" is assigned as a literal string; confirm that it exists
# in this tokenizer's vocabulary. The alternative the author left in the
# trailing comment is tokenizer.eos_token.
tokenizer.pad_token = "<pad>"  # tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Smoke test: collate the first training example and print each tensor's shape.
first_example_batch = [tokenized_dataset["train"][0]]
out = data_collator(first_example_batch)
for key, value in out.items():
    print(f"{key} shape: {value.shape}")



## Setup the Trainer

In [None]:
# Configure fine-tuning with the Trainer API.
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir=finetuned_modelname,
    # Schedule: three epochs, evaluating and saving at the end of each one.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Half precision is requested only when a GPU was detected earlier.
    fp16=is_gpu_available,
    # Hugging Face Hub settings.
    push_to_hub=True,
    hub_model_id=huggingface_reponame,
)

# The same Trainer instance is used for the baseline evaluation, the
# fine-tuning run and the final evaluation further down.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

## Evaluate the Performance of the Base Model

In [None]:
import math

# Evaluate the model before any fine-tuning on the Trainer's eval dataset
# (the "validation" split) and show the raw metrics.
initial_results = trainer.evaluate()
print(initial_results)

# Perplexity is reported as exp(eval_loss). The same line goes to the log file
# first and is then printed.
baseline_perplexity = math.exp(initial_results['eval_loss'])
baseline_message = f"Baseline {pretrained_model} Results: Perplexity: {baseline_perplexity:.2f}"
logger.info(baseline_message)
print(baseline_message)


In [None]:
# Prompts used to compare generations before and after fine-tuning.
test_prompt, test_prompt2, test_prompt3 = (
    "What is the meaning of life?",
    "Where did that planet go??",
    "What is the best way to cook a turkey?",
)

In [None]:
# Use the model in a text-generation pipeline and record the baseline output
# for the first test prompt.
from transformers import pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

# return_full_text=False makes the pipeline return only the newly generated
# continuation. With the default, generated_text already begins with the
# prompt, so the "{prompt}...{generated_text}" message would repeat the prompt.
result = text_generator(
    test_prompt,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
    return_full_text=False,
)
generated_message = f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}"
print(generated_message)
logger.info(generated_message)

In [None]:
# Baseline generation for the second test prompt.
# return_full_text=False makes the pipeline return only the newly generated
# continuation. With the default, generated_text already begins with the
# prompt, so the "{prompt}...{generated_text}" message would repeat the prompt.
result = text_generator(
    test_prompt2,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
    return_full_text=False,
)
generated_message = f"Baseline {pretrained_model} generated result: {test_prompt2}...{result[0]['generated_text']}"
print(generated_message)
logger.info(generated_message)

In [None]:
# Baseline generation for the third test prompt.
# return_full_text=False makes the pipeline return only the newly generated
# continuation. With the default, generated_text already begins with the
# prompt, so the "{prompt}...{generated_text}" message would repeat the prompt.
result = text_generator(
    test_prompt3,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
    return_full_text=False,
)
generated_message = f"Baseline {pretrained_model} generated result: {test_prompt3}...{result[0]['generated_text']}"
print(generated_message)
logger.info(generated_message)

## Fine-Tune the Model

In [None]:
# Run the training loop configured on `trainer`. As the last expression of the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()

## Evaluate the Performance of the Fine-Tuned Model

In [None]:
# Evaluate again after training and derive perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
# Keep the derived metric alongside the raw evaluation metrics.
eval_results.update(perplexity=perplexity)

# The same line goes to the log file first and is then printed.
finetuned_message = f"Fine-tuned {finetuned_modelname} Results: Perplexity: {perplexity:.2f}"
logger.info(finetuned_message)
print(finetuned_message)



In [None]:
#Prompt Test 1
# Build the pipeline around `model` again now that training has run.
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

# return_full_text=False makes the pipeline return only the newly generated
# continuation. With the default, generated_text already begins with the
# prompt, so the "{prompt}...{generated_text}" message would repeat the prompt.
# NOTE(review): temperature is 0.7 here, while the baseline run of this prompt
# and the other fine-tuned prompts use 1. Confirm this is intentional.
result = text_generator(
    test_prompt,
    max_length=100,
    num_return_sequences=1,
    temperature=0.7,
    return_full_text=False,
)
generated_message = f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}"
print(generated_message)
logger.info(generated_message)

In [None]:
#Prompt Test 2
# return_full_text=False makes the pipeline return only the newly generated
# continuation. With the default, generated_text already begins with the
# prompt, so the "{prompt}...{generated_text}" message would repeat the prompt.
result = text_generator(
    test_prompt2,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
    return_full_text=False,
)
generated_message = f"Fine-tuned {finetuned_modelname} generated result: {test_prompt2}...{result[0]['generated_text']}"
print(generated_message)
logger.info(generated_message)

In [None]:
#Prompt Test 3
# return_full_text=False makes the pipeline return only the newly generated
# continuation. With the default, generated_text already begins with the
# prompt, so the "{prompt}...{generated_text}" message would repeat the prompt.
result = text_generator(
    test_prompt3,
    max_length=100,
    num_return_sequences=1,
    temperature=1,
    return_full_text=False,
)
generated_message = f"Fine-tuned {finetuned_modelname} generated result: {test_prompt3}...{result[0]['generated_text']}"
print(generated_message)
logger.info(generated_message)