In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
from datasets import Dataset,load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_pt = 'google-t5/t5-small'
# model_pt = 'FacebookAI/roberta-base'
# model_pt = 'google/flan-t5-xxl'
# model_pt = 'google-t5/t5-large'
tg = pipeline(task='text-generation',model=model_pt)

print(tg('How to cook something?'))




The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMe

[{'generated_text': 'How to cook something?omething? cook something? cooking cooking cooking? cooking cooking? cooking cooking?'}]


In [3]:
df = pd.read_csv('./datasets/train_data.csv')
df = df.iloc[:100,:]
dataset = Dataset.from_pandas(df)

# Split the dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['Input', 'Output'],
        num_rows: 80
    })
    test: Dataset({
        features: ['Input', 'Output'],
        num_rows: 20
    })
})

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_pt)
def tokenize_function(examples):
    inputs = tokenizer(examples['Input'], padding="max_length", truncation=True, max_length=512)
    outputs = tokenizer(examples['Output'], padding="max_length", truncation=True, max_length=512)
    inputs['labels'] = outputs['input_ids']
    return inputs

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 80/80 [00:00<00:00, 2376.46 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 1828.10 examples/s]


In [5]:
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pt)

# Remove columns that the model doesn't expect and set the format for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(["Input", "Output"])
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8, collate_fn=data_collator)

In [6]:
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(model_pt)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvishwateja2684[0m ([33mvishwa-teja[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 40/40 [07:05<00:00, 10.64s/it]

{'train_runtime': 429.0224, 'train_samples_per_second': 0.932, 'train_steps_per_second': 0.093, 'train_loss': 10.405975341796875, 'epoch': 5.0}





TrainOutput(global_step=40, training_loss=10.405975341796875, metrics={'train_runtime': 429.0224, 'train_samples_per_second': 0.932, 'train_steps_per_second': 0.093, 'total_flos': 54136720588800.0, 'train_loss': 10.405975341796875, 'epoch': 5.0})

In [17]:
# import torch
# model.eval()

# # Function to generate text from input
# def generate_text(input_text):
#     # Tokenize the input text
#     input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids
#     # Generate the output
#     with torch.no_grad():
#         generated_ids = model.generate(input_ids, max_length=512)
#     # Decode the generated tokens to text
#     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
#     return generated_text

# # Load a few examples from the test dataset
# test_examples = tokenized_datasets["test"].shuffle().select(range(5))

# # Generate outputs for these examples
# for example in test_examples:
#     input_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
#     generated_text = generate_text(input_text)
#     reference_output = tokenizer.decode(example['labels'], skip_special_tokens=True)
    
#     print(f"Input: {input_text}")
#     print(f"Generated Output: {generated_text}")
#     print(f"Reference Output: {reference_output}")
#     print("-" * 50)

model.eval()

# Function to generate text from input
def generate_text(input_text, max_length=512, min_length=100, num_beams=5, repetition_penalty=2.5):
    # Tokenize the input text
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=1024).input_ids
    # Generate the output
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids, 
            max_length=max_length, 
            min_length=min_length, 
            num_beams=num_beams, 
            repetition_penalty=repetition_penalty,
            early_stopping=True  # To stop when an EOS token is generated
        )
    # Decode the generated tokens to text
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_text

# Test the model on some examples
test_examples = tokenized_datasets["test"].shuffle().select(range(5))

for example in test_examples:
    input_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
    generated_text = generate_text(input_text,repetition_penalty=5.0)
    reference_output = tokenizer.decode(example['labels'], skip_special_tokens=True)
    
    print(f"Input: {input_text}")
    print(f"Generated Output: {generated_text}")
    print(f"Reference Output: {reference_output}")
    print("-" * 50)

Input: Kesar Mango Lassi Recipe - Saffron Mango Lassi Recipe
Generated Output: Kesar Mango Lassi Recipe - Saffron Mango Lassi Recipe - Kesar Mango Lassi Recipe - Saffron Mango Lassi Recipe - Kesar Mango Lassi Recipe - Kesar Mango Lassi Recipe - Saffron Mango Lassi Recipe - Kesar Mango Lassi Recipe - Kesar Mango Lassi Recipe - Saffron Mango Lassi Recipe
Reference Output: 3/4 cup Mango Pulp (Puree),1 cup Curd (Dahi / Yogurt) - (low fat),1 tablespoon Sugar - or honey,2 pinch Saffron strands,3 tablespoons Badam (Almond) - roughly chopped,3 tablespoons Milk - luke warm+more cold milk as needed,Ice cubes - a few To begin making Kesar Mango Lassi Recipe, soak one pinch kesar in 2 tablespoons warm milk and keep it aside for few minutes.Get prep with other ingredients as well. Take out the pulp from the mangoes and keep aside.In a blender, add mango pulp, curd/yogurt, cold milk and blend till combined.Once it is done, add chopped almonds (2 tablespoons), honey or sugar, some ice cubes and blend