### 1. Install dependencies

In [23]:
# Uncomment on a fresh environment to install the dependencies into this kernel:
# %pip install -q transformers datasets evaluate accelerate

### 2. Imports


In [24]:
# Keep this cell ahead of every other import so the flag is already present
# in the process environment when those imports run.
# NOTE(review): presumably meant to keep `transformers` from loading
# TensorFlow -- confirm the installed version actually reads this variable.
import os

os.environ.update({"TRANSFORMERS_NO_TF": "1"})


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
from pathlib import Path
import json
import torch

### 3. Load synthetic dataset

In [26]:
# Read the synthetic examples. The encoding is pinned so the file is decoded
# the same way on every platform instead of following the locale default.
with open("../data/synthetic_dataset_v1.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Format for supervised fine-tuning: one prompt/target pair per record.
# Each record must provide "business_description" and "category".
# NOTE(review): the target is "<category>-<row index>.com", so the model is
# trained to emit the row index -- confirm this placeholder label is intended.
pairs = [
    {
        "input": f"Business: {entry['business_description']} \n Domain Name:",
        "output": f"{entry['category']}-{i}.com"
    }
    for i, entry in enumerate(raw_data)
]


### 4. Convert to Hugging Face Dataset

In [27]:
# Hold out 10% of the pairs for evaluation. The seed is fixed so that the
# split (and therefore the reported validation loss) is the same on every
# Restart & Run All; without it the shuffle differs from run to run.
dataset = Dataset.from_list(pairs).train_test_split(test_size=0.1, seed=42)

### 5. Load tokenizer and model

In [28]:
# Single checkpoint id, used for both the tokenizer and the seq2seq weights
# so the two always come from the same pretrained release.
model_checkpoint = "google/flan-t5-base"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
)

### 6. Tokenize the data


In [29]:
def preprocess(example):
    """Tokenize a batch of prompt/target pairs for seq2seq fine-tuning.

    This is applied through ``dataset.map(..., batched=True)``, so ``example``
    carries whole columns rather than a single record.

    Args:
        example: Batch with an "input" column (prompt text) and an "output"
            column (target text).

    Returns:
        The tokenized prompts (truncated to 128 tokens) with the target token
        ids (truncated to 64 tokens) stored under "labels".
    """
    model_inputs = tokenizer(example["input"], max_length=128, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. The targets are encoded in their own call so they keep the
    # shorter 64-token limit instead of inheriting the prompt's 128.
    labels = tokenizer(text_target=example["output"], max_length=64, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

### 7. Training setup


In [None]:
# One root directory for checkpoints and logs, created up front.
model_dir = Path("../models/model_v1")
model_dir.mkdir(parents=True, exist_ok=True)

args = TrainingArguments(
    output_dir=str(model_dir.resolve()),
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument (requires transformers >= 4.41).
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Checkpoint once per epoch, matching the evaluation cadence.
    save_strategy="epoch",
    logging_dir=str((model_dir / "logs").resolve()),
    logging_steps=10,
    # Keep only the two most recent checkpoints on disk.
    save_total_limit=2,
)

### 8. Trainer


In [31]:
# The collator must exist before the Trainer is built; it was previously
# referenced without ever being defined, which fails on a fresh kernel.
# DataCollatorForSeq2Seq pads inputs and labels per batch (label padding
# uses the collator's ignore index, so padded positions do not affect the loss).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `processing_class` supersedes the deprecated `tokenizer` argument
    # (requires transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


### 9. Train


In [32]:
# Run fine-tuning; the summary object is shown as the cell's output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,1.7148,1.067017
2,1.1289,1.012311
3,1.0388,1.007025


TrainOutput(global_step=69, training_loss=1.4878929248754529, metrics={'train_runtime': 113.7409, 'train_samples_per_second': 4.748, 'train_steps_per_second': 0.607, 'total_flos': 14085672726528.0, 'train_loss': 1.4878929248754529, 'epoch': 3.0})

### 10. Save model manually


In [33]:
# Write the final weights and the tokenizer files into the same directory so
# the folder can be loaded on its own later.
final_model_dir = "../models/model_v1"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('../models/model_v1\\tokenizer_config.json',
 '../models/model_v1\\special_tokens_map.json',
 '../models/model_v1\\tokenizer.json')