In [13]:
!pip install transformers
!pip install torch
!pip install accelerate -U
!pip install datasets



In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from datasets import load_dataset
import torch
from transformers import Trainer, TrainingArguments
import ipywidgets as widgets
from IPython.display import display

In [4]:
# Define your widgets
prompt_input = widgets.Textarea(
    value='',
    placeholder='Type your prompt here',
    description='Prompt:',
    layout={'width': '100%', 'height': '100px'}
)
batch_size_input = widgets.IntText(value=1, description='Batch Size:')
epoch_num_input = widgets.IntText(value=1, description='Epoch Num:')
run_button = widgets.Button(description='Set Values')

# Display your widgets
display(prompt_input, batch_size_input, epoch_num_input, run_button)

# Placeholder variables to store the values
prompt = ''
batch_size = 1
epoch_num = 1

# Define a function to update the variables with current widget values
def on_run_button_clicked(b):
    global prompt, batch_size, epoch_num
    prompt = prompt_input.value
    batch_size = batch_size_input.value
    epoch_num = epoch_num_input.value
    with output:
        print(f"Values set. Prompt: {prompt}, Batch Size: {batch_size}, Epoch Num: {epoch_num}")

# Create an output widget to display the status messages
output = widgets.Output()

run_button.on_click(on_run_button_clicked)
display(output)

Textarea(value='', description='Prompt:', layout=Layout(height='100px', width='100%'), placeholder='Type your …

IntText(value=1, description='Batch Size:')

IntText(value=1, description='Epoch Num:')

Button(description='Set Values', style=ButtonStyle())

Output()

In [37]:
# Load the dataset
dataset = load_dataset("microsoft/orca-math-word-problems-200k")

# Access the training split of the dataset
training_examples = dataset["train"]

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the EOS token

# Specify the maximum length for padding or truncation
max_length = 100

# Example of processing a subset of the data
subset_size = 5000  # Change this as needed
subset_data = training_examples.select(range(subset_size))

# Tokenize the subset of data
tokenized_data = tokenizer(training_examples["question"], padding="max_length", truncation=True, max_length=max_length, return_tensors="pt") # change training_examples to subset_data to only do a certain amount


In [45]:
# Assuming tokenized_data is the result of the tokenizer call on your dataset
input_ids = tokenized_data["input_ids"]
attention_mask = tokenized_data["attention_mask"]

# Convert input_ids and attention_mask to tensors if they are not already
# This step may be redundant if 'return_tensors="pt"' was specified in the tokenizer call
input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_mask)

# Labels for language modeling: typically, you would use the input_ids as the labels for such tasks
labels = input_ids.clone()

print("Keys of tokenized_data:", tokenized_data.keys())
# Print lengths of input_ids and attention_mask
print(len(tokenized_data["input_ids"]))
print(len(tokenized_data["attention_mask"]))

# Create a GPT-2 configuration
config = GPT2Config.from_pretrained("gpt2")

# Initialize a GPT-2 model with the specified configuration
model = GPT2LMHeadModel(config)

  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)


Keys of tokenized_data: dict_keys(['input_ids', 'attention_mask'])
200035
200035


In [67]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# Define a custom Trainer class
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs):
        labels = inputs["input_ids"]
        outputs = model(**inputs, labels=labels)
        return outputs.loss

# Load the model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the EOS token
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tokenize the dataset
dataset = load_dataset("microsoft/orca-math-word-problems-200k")
max_length = 100

def preprocess_data(examples):
    return tokenizer(examples["question"], padding="max_length", truncation=True, max_length=max_length)

# Apply the preprocessing function to the dataset
tokenized_data = dataset.map(preprocess_data, batched=True)

# Initialize Trainer with appropriate TrainingArguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-model",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=10,
    save_steps=10_000,
    save_total_limit=2,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),  # Enable FP16 if compatible GPU is available
)

# Initialize CustomTrainer with the custom compute_loss method
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")


Step,Training Loss
500,1.3482
1000,1.1792
1500,1.1095
2000,1.056
2500,1.0219
3000,0.9913
3500,0.9691
4000,0.9494
4500,0.9227
5000,0.9169


('./fine-tuned-model\\tokenizer_config.json',
 './fine-tuned-model\\special_tokens_map.json',
 './fine-tuned-model\\vocab.json',
 './fine-tuned-model\\merges.txt',
 './fine-tuned-model\\added_tokens.json')

In [None]:
# Specify the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model and tokenizer
model_name = "gpt2"  # Update with the appropriate model name if different
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Your input question
question = prompt

# Tokenize the input question
input_ids = tokenizer.encode(question, return_tensors="pt")

# Get the length of the input sequence
input_length = len(input_ids[0])

# Generate output with the fine-tuned model
model.config.force_bos_token_to_be_generated = False

# Ensure position_ids are within the valid range
position_ids = torch.arange(input_length, dtype=torch.long, device=device)

# Set a different temperature value
temperature = 0.8

# Mask invalid positions in the attention mask
attention_mask = torch.ones_like(input_ids)
attention_mask[:, input_length:] = 0

# Generate output with the fine-tuned model
generated_ids = model.generate(
    input_ids=input_ids,
    max_length=input_length + 200,
    do_sample=True,
    pad_token_id=model.config.eos_token_id,
    eos_token_id=model.config.eos_token_id,
    attention_mask=attention_mask,
    num_beams=3,
    use_cache=True,
    position_ids=position_ids,
    temperature=temperature,
)

# Extract the generated sequence without the EOS token
generated_sequence = generated_ids[0].tolist()


# Decode the generated sequence
generated_text = tokenizer.decode(generated_sequence, skip_special_tokens=True)
print("Input Question:", question)
print("Generated Answer:", generated_text)
