<a href="https://colab.research.google.com/github/Priyadharshini-cseAI/PRODIGY_GA/blob/main/PRODIGY_GA_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

MODEL_NAME = "gpt2"
OUTPUT_DIR = "./results"
TRAIN_FILE = "train_data.txt"
NUM_TRAIN_EPOCHS = 1
PER_DEVICE_TRAIN_BATCH_SIZE = 2
BLOCK_SIZE = 32
SAVE_STEPS = 50
SAVE_TOTAL_LIMIT = 2


def create_sample_train_file():
    """Creates a dummy training file if it doesn't exist."""
    if not os.path.exists(TRAIN_FILE):
        print(f"'{TRAIN_FILE}' not found. Creating sample training data...")
        with open(TRAIN_FILE, "w", encoding="utf-8") as f:
            f.write("Hello, this is a sample sentence for training.\n")
            f.write("GPT-2 is a powerful language model.\n")
            f.write("Fine-tuning helps adapt it to specific tasks.\n")
            f.write("This is another example line of text.\n")
            f.write("The model will learn from these examples.\n")
        print("Sample training file created.")


def load_and_tokenize_dataset(file_path, tokenizer):
    """Loads and tokenizes text dataset using ðŸ¤— datasets."""
    dataset = load_dataset("text", data_files=file_path)

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=BLOCK_SIZE
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )

    return tokenized_dataset["train"]


def main():
    create_sample_train_file()

    print(f"Loading tokenizer and model: {MODEL_NAME}")
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
        model.resize_token_embeddings(len(tokenizer))

    print("Loading and tokenizing dataset...")
    train_dataset = load_and_tokenize_dataset(TRAIN_FILE, tokenizer)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    print("Setting up training arguments...")
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        save_steps=SAVE_STEPS,
        save_total_limit=SAVE_TOTAL_LIMIT,
        logging_steps=10,
        logging_dir="./logs",
        report_to="none"
    )

    print("Initializing Trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator
    )

    print("Starting fine-tuning...")
    trainer.train()

    print(f"Saving model to {OUTPUT_DIR}")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print("\n--- Text Generation ---")
    prompt = "Once upon a time"
    print(f"Prompt: {prompt}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_length=100,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_k=50,
        top_p=0.95
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\nGenerated Text:")
    print(generated_text)


if __name__ == "__main__":
    main()




Loading tokenizer and model: gpt2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading and tokenizing dataset...
Setting up training arguments...
Initializing Trainer...
Starting fine-tuning...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Saving model to ./results


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



--- Text Generation ---
Prompt: Once upon a time

Generated Text:
Once upon a time, you will be able to see the world through your eyes.

TheWorld is a game that is designed to be played in a way that you can play with your friends. It is also designed for the player to play. You can also play it with friends and play together. The game is for you to learn and learn.
