In [1]:
!pip install transformers



In [2]:
import math
from transformers import Trainer, TrainingArguments
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling

In [9]:
# Load the dataset from a .txt file using the tokenizer
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset

# Load the data collator for language modeling (handles masking, etc.)
def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

# Main function to fine-tune GPT-2
def train(train_file_path, eval_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    # Load tokenizer and datasets
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    eval_dataset = load_dataset(eval_file_path, tokenizer) if eval_file_path else None  # Load evaluation dataset if provided
    data_collator = load_data_collator(tokenizer)

    # Load pre-trained GPT-2 model
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        logging_dir='./logs',  # Log directory
        logging_steps=500,     # Log every 500 steps
        evaluation_strategy="epoch" if eval_dataset else "no",  # Only evaluate if eval dataset is provided
        learning_rate=3e-5,    # Adjust the learning rate here
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,  # Add evaluation dataset here
    )

    # Train the model
    trainer.train()

    # Evaluate the model if evaluation dataset is provided
    if eval_dataset:
        eval_results = trainer.evaluate()
        perplexity = math.exp(eval_results['eval_loss'])
        print(f'Perplexity: {perplexity}')

    # Save the trained model and tokenizer
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)


# Define parameters for training
train_file_path = "/content/Music_Data_CLEANED[standardized].txt"
eval_file_path = "/content/eval.txt"  # Path to your evaluation dataset
model_name = 'gpt2-medium'
output_dir = '/content'
overwrite_output_dir = True
per_device_train_batch_size = 16
num_train_epochs = 10  # Start with 10 epochs and adjust as needed
save_steps = 15  # Adjust based on the dataset size

# Train the model
train(
    train_file_path=train_file_path,
    eval_file_path=eval_file_path,  # Pass the evaluation dataset
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss


In [10]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model from the specified path
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.eval()  # Set the model to evaluation mode
    return model

# Load the tokenizer from the specified path
def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

# Generate text using the loaded model and tokenizer
def generate_text(model, tokenizer, sequence, max_length):
    # Encode the input sequence
    ids = tokenizer.encode(sequence, return_tensors='pt')

    # Generate text
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,  # Use tokenizer's eos_token_id
        top_k=50,
        top_p=0.95,
    )

    # Decode the generated output
    generated_text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)
    return generated_text

# Define model and tokenizer paths
model_path = "/content"
tokenizer_path = model_path  # Assuming the tokenizer is in the same path as the model

# Load model and tokenizer
model = load_model(model_path)
tokenizer = load_tokenizer(tokenizer_path)

# Interactive loop for user input
print("Type 'exit' to quit the loop.")
while True:
    user_input = input("\nEnter your prompt: ")
    if user_input.strip().lower() == 'exit':
        print("Exiting the loop. Goodbye!")
        break

    max_len = 100  # Adjust max length as needed
    response = generate_text(model, tokenizer, user_input, max_len)

    # Display response in a presentable manner
    print("\n--- Generated Response ---")
    print(response)
    print("\n---------------------------")


Type 'exit' to quit the loop.

Enter your prompt: [Q]: What is a musical scale? 


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generated Response ---
[Q]: What is a musical scale? _______________________________________________________________________________ [A]: A musical scale, dear scholar, is a grouping of notes which produce a gradual ascending progression, most musical in its sound.
[Q]: How does a harpsichord work? _______________________________________________________________________________ [A]: A harpsichord, gentle student, is a device by which chords are introduced and played, creating an atmosphere of majesty and depth within a work.
[Q]: What is a tr

---------------------------

Enter your prompt: What is a musical scale? 

--- Generated Response ---
What is a musical scale? 
[Q]: What is a musical scale? [A]: A musical scale, gentle listener, is a sequence of intervals, each step up or down, with a duple tonic at the end, which harmonises with the preceding chord.
[Q]: What is a triad? [A]: A triad, fair maiden, is a grouping of three adjacent notes that form an octave, thus forming the