In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pandas as pd  # Add this line to import pandas
# Define your CSV file path
csv_file = "toxic.csv"


# Create a custom dataset class
class SentenceConversionDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        uncivil, civil = self.data.loc[idx, "uncivil_comment"], self.data.loc[idx, "civil_comment"]
        inputs = self.tokenizer.encode_plus(
            f"convert uncivil to civil: {uncivil}",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        labels = self.tokenizer.encode(
            civil,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels.squeeze()
        }

# Load T5 model and tokenizer
model_name = "t5-large"  # or "t5-large", "t5-3b", "t5-11b", "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Define your batch size and maximum sequence length
batch_size = 8
max_length = 256

# Create DataLoader for training
train_dataset = SentenceConversionDataset(csv_file, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-4)

# Fine-tuning loop
num_epochs = 15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.cuda.empty_cache()
model.train()

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}, Batch loss: {loss.item()}")


# Save the fine-tuned model
model.save_pretrained("fine_tuned_t5")
tokenizer.save_pretrained("fine_tuned_t5")


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 9.77 GiB total capacity; 8.48 GiB already allocated; 35.19 MiB free; 9.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Load the fine-tuned model and tokenizer
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("fine_tuned_t5")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("fine_tuned_t5")

# Test the model
test_sentence = "Your code is sexy as you are.I love it."
inputs = fine_tuned_tokenizer.encode_plus(
    f"convert uncivil to civil: {test_sentence}",
    return_tensors="pt"
)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Ensure tensors are on the same device as the model (cuda or cpu)
fine_tuned_model.to(device)

output_ids = fine_tuned_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=50,  # Adjust the max_length as needed
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=True
)

decoded_output = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Input sentence:", test_sentence)
print("Generated civil sentence:", decoded_output)
