In [2]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load CSV file
data = pd.read_csv(r"C:\Users\Raunak Jha\House_of_Codes\Python\transformer_fine_tuning_dataset.csv")

# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data)

# Load T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenization function
def tokenize_data(batch):
    input_encodings = tokenizer(batch["Input"], padding="max_length", truncation=True, max_length=50)
    output_encodings = tokenizer(batch["Output"], padding="max_length", truncation=True, max_length=50)
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": output_encodings["input_ids"],
    }

# Tokenize the dataset
tokenized_dataset = hf_dataset.map(tokenize_data, batched=True)

# Load T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Training arguments
training_args = TrainingArguments(
    output_dir="./t5_fine_tuned",
    evaluation_strategy="no",  # No separate evaluation dataset
    logging_steps=100,
    save_steps=500,
    per_device_train_batch_size=8,
    num_train_epochs=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    warmup_steps=200,
    logging_dir="./logs",
    push_to_hub=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Use the whole dataset for training
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Example test input
test_input = "I want to create a new sheet."
encoded_input = tokenizer(test_input, return_tensors="pt", truncation=True, padding=True)

# Generate output
outputs = model.generate(input_ids=encoded_input["input_ids"], attention_mask=encoded_input["attention_mask"])
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Command:", decoded_output)


Map: 100%|██████████| 100/100 [00:00<00:00, 2305.39 examples/s]
  trainer = Trainer(


Step,Training Loss
100,12.081
200,1.8057
300,0.1987
400,0.038
500,0.0219
600,0.0177


Generated Command: create_sheet
