In [None]:
# part1_prepare_dataset_CLEANED.py

import os
import zipfile
import re # Import the regular expressions library

def clean_gutenberg_text(text):
    """
    Remove common Project Gutenberg boilerplate and noise from a text.

    Steps, in order:
      1. If a ``*** START OF THE/THIS PROJECT GUTENBERG EBOOK ... ***`` marker
         is present, everything up to and including it is dropped.
      2. If the matching ``*** END OF ... ***`` marker is present, it and
         everything after it is dropped.
      3. Chapter headings at the start of a line (``CHAPTER <word>`` or
         ``Chapter <digits>``, with an optional trailing period) are removed.
      4. ``[Illustration ...]`` tags are removed.
      5. Runs of three or more newlines are collapsed to two.

    Args:
        text (str): Raw file contents.

    Returns:
        str: The cleaned text with leading/trailing whitespace stripped.
             Texts without Gutenberg markers only go through steps 3-5.
    """
    # Drop the header BLOCK (licence preamble etc.), not just the marker line.
    start_marker = re.search(
        r'\*\*\*\s*START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK[^\n]*\*\*\*',
        text,
        flags=re.IGNORECASE,
    )
    if start_marker:
        text = text[start_marker.end():]

    # Drop the footer BLOCK: the END marker and the licence text that follows it.
    end_marker = re.search(
        r'\*\*\*\s*END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK[^\n]*\*\*\*',
        text,
        flags=re.IGNORECASE,
    )
    if end_marker:
        text = text[:end_marker.start()]

    # Remove chapter headings (e.g., Chapter 1, CHAPTER II). Anchored to the
    # start of a line so that a mid-sentence "see Chapter 3" is left intact.
    text = re.sub(
        r'^[ \t]*(?:CHAPTER[ \t]+\w+|Chapter[ \t]+\d+)\b\.?',
        '',
        text,
        flags=re.MULTILINE,
    )

    # Remove illustration tags. The character class stops at the first closing
    # bracket, so text between two bracketed spans on one line is preserved,
    # and captions that wrap across lines are still removed.
    text = re.sub(r'\[Illustration[^\[\]]*\]', '', text)

    # Normalize whitespace: replace 3 or more newlines with just two
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

def process_poetry_folders(zip_path="forms.zip", output_file="poetry_dataset_new.txt"):
    """
    Unzip the dataset and merge every cleaned ``.txt`` file into one corpus file.

    Args:
        zip_path (str): Path of the zip archive holding the poem text files.
        output_file (str): Path of the combined corpus; overwritten on each run.

    The archive is extracted into the ``poetry_forms_dataset`` folder. Each
    ``.txt`` file found below it is cleaned with ``clean_gutenberg_text`` and,
    if non-empty, appended to ``output_file`` followed by an ``<|endoftext|>``
    separator. Files that cannot be read are reported and skipped.
    """
    extract_dir = "poetry_forms_dataset"
    print(f"Unzipping '{zip_path}' into '{extract_dir}' folder...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    poem_count = 0
    print(f"Processing and CLEANING all text files inside '{extract_dir}'...")

    with open(output_file, "w", encoding="utf-8") as f_out:
        for root, dirs, files in os.walk(extract_dir):
            # os.walk yields entries in filesystem order; sorting (in place, so
            # the walk itself follows it) makes the corpus byte-for-byte
            # reproducible. '__MACOSX' is the metadata folder macOS adds to
            # zip archives and never contains real poems.
            dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
            for file in sorted(files):
                # '._name' files are macOS resource-fork stubs, not text.
                if not file.endswith(".txt") or file.startswith("._"):
                    continue

                file_path = os.path.join(root, file)
                try:
                    # errors="ignore" drops undecodable bytes instead of failing.
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f_in:
                        raw_text = f_in.read()
                except OSError as e:
                    # Best effort: report the unreadable file and move on.
                    print(f"Could not read file {file_path}. Error: {e}")
                    continue

                # Cleaning and writing happen outside the try so that a failure
                # writing the OUTPUT is not misreported as an unreadable input.
                cleaned_text = clean_gutenberg_text(raw_text)

                # Only write non-empty poems
                if cleaned_text:
                    f_out.write(cleaned_text)
                    f_out.write("\n\n<|endoftext|>\n\n")
                    poem_count += 1

    print(f"\n✅ Processing complete. Combined and cleaned {poem_count} poems into '{output_file}'.")

# --- Run the improved dataset preparation ---
# Called with the defaults: reads 'forms.zip' and writes 'poetry_dataset_new.txt'.
process_poetry_folders()

Unzipping 'forms.zip' into 'poetry_forms_dataset' folder...
Processing and CLEANING all text files inside 'poetry_forms_dataset'...

✅ Processing complete. Combined and cleaned 6321 poems into 'poetry_dataset_new.txt'.


In [None]:
# fine_tune_script_updated.py

import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

def fine_tune_poetry_model(
    dataset_path="poetry_dataset_new.txt",
    base_model_name="gpt2",
    output_dir="./gpt2-poetry-finetuned-v2", # Saving to a new directory
    block_size=128,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    report_to="none",
):
    """
    Fine-tunes a GPT-2 model on a text dataset of poems and saves the result.

    Args:
        dataset_path (str): Plain-text training corpus.
        base_model_name (str): Pretrained checkpoint name passed to
            ``from_pretrained`` for both tokenizer and model.
        output_dir (str): Where checkpoints, the final model and the
            tokenizer are written. Existing contents may be overwritten.
        block_size (int): Token length of each training example built by
            ``TextDataset``.
        num_train_epochs (int): Number of passes over the dataset.
        learning_rate (float): Initial learning rate passed to the trainer.
        per_device_train_batch_size (int): Batch size per device; lower it
            if you hit out-of-memory errors.
        report_to (str | list[str]): Logging integrations handed to
            ``TrainingArguments``. Defaults to ``"none"`` so that training
            does not stop at an interactive Weights & Biases API-key prompt;
            pass e.g. ``"wandb"`` to opt in.

    Returns:
        None. The model and tokenizer are saved to ``output_dir``.
    """
    # Step 1: Load the tokenizer and model
    print(f"Loading tokenizer for '{base_model_name}'...")
    tokenizer = GPT2Tokenizer.from_pretrained(base_model_name)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    print(f"Loading model '{base_model_name}'...")
    model = GPT2LMHeadModel.from_pretrained(base_model_name)

    # Step 2: Prepare the dataset
    print(f"Loading and preparing the dataset from '{dataset_path}'...")
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=dataset_path,
        block_size=block_size
    )

    # mlm=False selects causal (next-token) language modelling, not masked LM.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Step 3: Define the Training Arguments
    print("Setting up optimized training arguments...")

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,

        # --- Convergence-related settings ---
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        warmup_steps=500,                  # Ramp the learning rate up over the first 500 steps.

        # --- Other important parameters ---
        per_device_train_batch_size=per_device_train_batch_size,
        weight_decay=0.01,                 # Regularization to limit overfitting.

        # --- Logging and Saving ---
        logging_steps=500,                 # Log the training loss every 500 steps.
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
        report_to=report_to,               # "none" keeps the run non-interactive.
    )

    # Step 4: Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    # Step 5: Start Fine-Tuning
    print("\n--- Starting Enhanced Fine-Tuning ---")
    trainer.train()
    print("--- Fine-Tuning Complete ---")

    # Step 6: Save the final model (and the tokenizer next to it, so the
    # output directory can be loaded on its own later)
    print(f"Saving the fine-tuned model to {output_dir}...")
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    print(f"\n✅ Model fine-tuning complete! Your new, improved poetry model is saved in '{output_dir}'.")

# --- Run the fine-tuning process ---
# REMINDER: This will take longer than before due to more epochs. A GPU is essential.
# Called with the defaults: fine-tunes 'gpt2' on 'poetry_dataset_new.txt' and
# saves the result to './gpt2-poetry-finetuned-v2'.
fine_tune_poetry_model()

Loading tokenizer for 'gpt2'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loading model 'gpt2'...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading and preparing the dataset from 'poetry_dataset_new.txt'...




Setting up optimized training arguments...

--- Starting Enhanced Fine-Tuning ---


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabhigyan-sharma6091[0m ([33mabhigyan-sharma6091-indian-institute-of-technology-bhilai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,3.976
1000,3.7574
1500,3.7122
2000,3.6477
2500,3.6236
3000,3.5652
3500,3.5497
4000,3.5157
4500,3.4971
5000,3.4783


--- Fine-Tuning Complete ---
Saving the fine-tuned model to ./gpt2-poetry-finetuned-v2...

✅ Model fine-tuning complete! Your new, improved poetry model is saved in './gpt2-poetry-finetuned-v2'.


In [None]:
import os
import zipfile

def zip_folder(folder_path, output_zip_path):
    """
    Creates a zip archive of the specified folder.

    Files are stored under paths relative to ``folder_path`` and are added in
    sorted order so repeated runs list entries identically.

    Args:
        folder_path (str): The path to the folder to be zipped.
        output_zip_path (str): The path and filename for the output zip file.

    Raises:
        NotADirectoryError: If ``folder_path`` is not an existing directory.
            Checked before the archive is opened, so no empty zip is left
            behind and no false success message is printed.
    """
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(
            f"Cannot zip '{folder_path}': it is not an existing directory"
        )

    # Used to recognise the archive itself when it is written inside the folder.
    archive_abs_path = os.path.abspath(output_zip_path)

    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort steers os.walk into a stable order
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # Never add the (still being written) archive to itself.
                if os.path.abspath(file_path) == archive_abs_path:
                    continue
                # Calculate the relative path of the file within the folder
                # This ensures the correct directory structure inside the zip
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)
    print(f"Folder '{folder_path}' successfully zipped to '{output_zip_path}'")

# Example usage:
# Demo scaffold: on first run, build a small 'my_folder' tree with two text
# files. Note that this tree is NOT the folder archived below.
if not os.path.exists("my_folder"):
    os.makedirs("my_folder/sub_folder")
    demo_files = (
        ("my_folder/file1.txt", "This is file 1."),
        ("my_folder/sub_folder/file2.txt", "This is file 2 in a subfolder."),
    )
    for demo_path, demo_text in demo_files:
        with open(demo_path, "w") as demo_file:
            demo_file.write(demo_text)

# Archive the fine-tuned model directory.
# NOTE(review): these are absolute '/content/...' paths — presumably a Colab
# working directory; confirm before running elsewhere.
folder_to_zip = "/content/gpt2-poetry-finetuned-v2"
output_zip_file = "/content/gpt2-poetry-finetuned_v2.zip"

zip_folder(folder_path=folder_to_zip, output_zip_path=output_zip_file)

Folder '/content/gpt2-poetry-finetuned-v2' successfully zipped to '/content/gpt2-poetry-finetuned_v2.zip'


In [None]:
import os

# Report the size of the model archive written by the zip_folder cell.
# The path must match that cell's output ('..._v2.zip'); the previously used
# '..._new.zip' is not created by any cell shown in this notebook, so a fresh
# run would fail with FileNotFoundError.
file_path = "/content/gpt2-poetry-finetuned_v2.zip"
file_size_bytes = os.path.getsize(file_path)
file_size_mb = file_size_bytes / (1024 * 1024) # Convert bytes to megabytes

print(f"The size of '{file_path}' is {file_size_mb:.2f} MB")

The size of '/content/gpt2-poetry-finetuned_new.zip' is 3065.89 MB
