In [2]:
!pip install transformers==4.39.3 accelerate==0.27.2 peft==0.10.0 bitsandbytes datasets -q

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import json

In [3]:
dataset_path = "/content/synthetic_dataset_v1.json"

with open(dataset_path, "r") as f:
    data = json.load(f)

print(f"Loaded {len(data)} samples")
print("Sample:", data[0])

Loaded 1000 samples
Sample: {'business_description': 'startup for pets in Singapore', 'expected_domain_names': ['startuppets.biz', 'startuppets.io', 'singaporeio.co']}


In [4]:
def extract_domains(text):
    return re.findall(r"\b[a-zA-Z0-9-]+\.[a-z]{2,}\b", text)

def relevance_score(description, domains):
    keywords = [w.lower() for w in description.split() if len(w) > 3]
    match_count = sum(any(k in d for d in domains) for k in keywords)
    return match_count / max(1, len(keywords))

def creativity_score(description, domains):
    return 1.0 if all(description.lower() not in d for d in domains) else 0.0

In [5]:
train_data = [
    {
        "input_text": f"Generate 3 domain names for this business. Business: {item['business_description']}\nDomains:",
        "target_text": ", ".join(item['expected_domain_names'])
    }
    for item in data
]

dataset = Dataset.from_list(train_data)
dataset = dataset.train_test_split(test_size=0.1)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 900
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 100
    })
})


In [6]:
model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
print("Loaded Phi-2 for lora fine tuning")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded Phi-2 for lora fine tuning


In [10]:
def tokenize(batch):
    # Tokenize inputs and targets separately
    inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=128) # Ensure same max_length

    # The 'labels' are the tokenized target text
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["input_text", "target_text"])
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 5,242,880 || all params: 2,784,926,720 || trainable%: 0.1882591725788749


In [12]:
training_args = TrainingArguments(
    output_dir="./phi2_lora",
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
10,6.4322
20,1.6515
30,0.9353
40,0.7533
50,0.7691
60,0.6797
70,0.6441
80,0.6109
90,0.6419
100,0.5881


  return fn(*args, **kwargs)


TrainOutput(global_step=900, training_loss=0.6200552214516534, metrics={'train_runtime': 408.1955, 'train_samples_per_second': 4.41, 'train_steps_per_second': 2.205, 'total_flos': 3668688764928000.0, 'train_loss': 0.6200552214516534, 'epoch': 2.0})

In [13]:
model.save_pretrained("./phi2_lora")
tokenizer.save_pretrained("./phi2_lora")

('./phi2_lora/tokenizer_config.json',
 './phi2_lora/special_tokens_map.json',
 './phi2_lora/vocab.json',
 './phi2_lora/merges.txt',
 './phi2_lora/added_tokens.json',
 './phi2_lora/tokenizer.json')