# **Colab Notebook for generating a synthetic dataset and finetuning t5 on it.**

**Install Dependencies**

In [None]:
!pip install -q transformers datasets sentencepiece accelerate evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

**Step 2**: Generate Our synthetic dataset, bible verses will almost never be said alone, theres always some sorrounding context, the point of this is to artificially generate such sorrounding context.

In [None]:
import random
import json
import pandas as pd
from datasets import Dataset

# 1. Setup Data for Generation
BIBLE_BOOKS = [
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth",
    "1 Samuel", "2 Samuel", "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra",
    "Nehemiah", "Esther", "Job", "Psalms", "Proverbs", "Ecclesiastes", "Song of Solomon",
    "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea", "Joel", "Amos",
    "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah",
    "Malachi", "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians",
    "2 Corinthians", "Galatians", "Ephesians", "Philippians", "Colossians",
    "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon",
    "Hebrews", "James", "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude", "Revelation"
]

# Random context sentences to wrap around the reference
PREFIXES = [
    "Let's open our bibles to", "The pastor read from", "I love the verse",
    "Consider what is written in", "Have you read", "Specifically", "It says in",
    "Reference:", "My favorite passage is", "Look at"
]

SUFFIXES = [
    "which is beautiful.", "today.", "for inspiration.", ".", "!",
    "and meditate on it.", "in the KJV.", "before we pray."
]

def generate_synthetic_sample():
    """Generates a random sentence and its corresponding JSON label."""

    book = random.choice(BIBLE_BOOKS)
    chapter = random.randint(1, 150) # Max 150 cover Psalms
    verse = random.randint(1, 176)   # Max cover Ps 119

    # Decide on format variation
    format_type = random.choice(['standard', 'verbose', 'short', 'chapter_only'])

    text_ref = ""
    label_dict = {"Book": book, "Chapter": str(chapter), "Verse": str(verse)}

    if format_type == 'standard':
        # Ex: John 3:16
        text_ref = f"{book} {chapter}:{verse}"

    elif format_type == 'verbose':
        # Ex: John Chapter 3 verse 16
        text_ref = f"{book} Chapter {chapter} verse {verse}"

    elif format_type == 'short':
        # Ex: 1 Cor 3 v 16 (Abbreviated/Loose)
        # Simple fuzzy abbreviation logic
        short_book = book[:3] if len(book) > 4 else book
        text_ref = f"{short_book} {chapter} v {verse}"
        # Update label to full book name despite input being short
        label_dict['Book'] = book

    elif format_type == 'chapter_only':
        # Ex: Psalm 23
        text_ref = f"{book} {chapter}"
        label_dict['Verse'] = None # No verse mentioned

    # Construct full sentence
    input_text = f"{random.choice(PREFIXES)} {text_ref} {random.choice(SUFFIXES)}"

    # Format Target as JSON string
    target_text = json.dumps(label_dict)

    return input_text, target_text

# Generate Dataset
DATASET_SIZE = 5000  # Number of examples
data = []

print(f"Generating {DATASET_SIZE} synthetic examples...")
for _ in range(DATASET_SIZE):
    inp, out = generate_synthetic_sample()
    data.append({"input_text": f"extract bible reference: {inp}", "target_text": out})

# Convert to Pandas then HuggingFace Dataset
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# Split into Train/Test
dataset = dataset.train_test_split(test_size=0.1)

print("Sample Data:")
print(dataset['train'][0])

Generating 5000 synthetic examples...
Sample Data:
{'input_text': 'extract bible reference: Specifically Matthew 82 for inspiration.', 'target_text': '{"Book": "Matthew", "Chapter": "82", "Verse": null}'}


**Step 3**: Ready our base model (**T5 small**)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# 1. Load Model & Tokenizer
MODEL_CHECKPOINT = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# 2. Preprocessing Function
def preprocess_function(examples):
    inputs = examples["input_text"]
    targets = examples["target_text"]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # Tokenize targets
    labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 3. Apply Tokenization
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 4. Data Collator (Handles padding dynamically)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

**Step 4**: Train the model on our synthetic dataset

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define Hyperparameters
training_args = Seq2SeqTrainingArguments(
    output_dir="./bible_extractor_t5",
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10, # 3 Epochs is usually enough for synthetic data
    predict_with_generate=True,
    logging_steps=100,
    fp16=True, # Enable mixed precision for GPU speedup
    report_to="none" # Disable logging to wandb
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start Training
trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,0.0241,0.00288
2,0.0055,0.001106
3,0.0036,0.001363
4,0.0027,0.001394
5,0.0033,0.000708
6,0.0023,0.000505
7,0.0018,0.000676
8,0.0018,0.000882
9,0.0014,0.000704
10,0.0014,0.000781


TrainOutput(global_step=2820, training_loss=0.020817871346858377, metrics={'train_runtime': 299.9424, 'train_samples_per_second': 150.029, 'train_steps_per_second': 9.402, 'total_flos': 283633064214528.0, 'train_loss': 0.020817871346858377, 'epoch': 10.0})

Step 5: Okay, now we've trained, the training loss looks promising!, it's time to test our model.

In [None]:
import json
import torch

# 1. Setup Device (Ensure we use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def extract_reference(text):
    # Add our instruction prompt
    input_text = f"extract bible reference: {text}"

    # Tokenize & Move to GPU/CPU
    inputs = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    # Generate
    outputs = model.generate(inputs, max_length=128)

    # Decode
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return decoded_output

# --- TEST CASES ---
test_sentences = [
    "Yes! God loves us so much, lets look at John 3 verse 16",
    "Open your bibles to Genesis 1:1 right now.",
    "The pastor was reading from 1 Kings Chapter 12 verse 4",
    "I was reading Matthew 5 yesterday.",
    "Check out Rev 20:10",
    "Sometimes people mention random things without verses."
    "I belive we covered this last Sunday, lets look at Genesis, I think it was 1 verse 23" ## a missing , ended up resulting in a valuable test case!
]

print("-" * 60)
for sentence in test_sentences:
    # Get the raw string from the model
    raw_output = extract_reference(sentence)

    print(f"Input:      {sentence}")
    print(f"Raw Output: {raw_output}") # <--- Prints regardless of validity

    # Optional: We try to parse it just to see if it worked,
    # but we won't hide the output if it fails.
    try:
        parsed = json.loads(raw_output)
        print(f"Parsed Object: {parsed}")
    except json.JSONDecodeError:
        print(">> Note: Output is not valid JSON yet.")

    print("-" * 60)

------------------------------------------------------------
Input:      Yes! God loves us so much, lets look at John 3 verse 16
Raw Output: "Book": "John", "Chapter": "3", "Verse": "16"
>> Note: Output is not valid JSON yet.
------------------------------------------------------------
Input:      Open your bibles to Genesis 1:1 right now.
Raw Output: "Book": "Genesis", "Chapter": "1", "Verse": null
>> Note: Output is not valid JSON yet.
------------------------------------------------------------
Input:      The pastor was reading from 1 Kings Chapter 12 verse 4
Raw Output: "Book": "1 Kings", "Chapter": "12", "Verse": "4"
>> Note: Output is not valid JSON yet.
------------------------------------------------------------
Input:      I was reading Matthew 5 yesterday.
Raw Output: "Book": "Matthew", "Chapter": "5", "Verse": null
>> Note: Output is not valid JSON yet.
------------------------------------------------------------
Input:      Check out Rev 20:10
Raw Output: "Book": "Revelati

# **Conclusion(s)**


**Training**

Our training loss

Epoch 1 > 0.024100


Epoch 2 > 0.005500


Epoch 3 > 0.003600


Epoch 4 > 0.002700


Epoch 5 > 0.003300


Epoch 6 > 0.002300


Epoch 7 > 0.001800


Epoch 8 > 0.001800


Epoch 9 > 0.001400


Epoch 10 > 0.001400

Our training loss **reaches its lowest on Epoch 9**, meaning that the model had already converged at this point and the last epoch was unecessary(but not destructive!).


**Testing**

Our model performs excellently on the test cases, easily smoking out all the bible book/chapters/verses from the text in our test cases.

**But**, you'll notice our model completely ignores the curly brackets in our training data and as such, its ouput is not directly valid json, apperently this token trimming is a common behaviour in small models attempting to be ultra efficient.

In [None]:
# Save the final model and tokenizer
save_directory = "./bible_t5_model_v1"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

# Optional: Zip it to download to your local machine
!zip -r bible_model.zip {save_directory}
from google.colab import files
files.download("bible_model.zip")