In [1]:
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, tokenizer, max_length):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        source_text = self.source_sentences[idx]
        target_text = self.target_sentences[idx]

        source_encoding = self.tokenizer.encode_plus(
            source_text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            truncation=True
        )

        target_encoding = self.tokenizer.encode_plus(
            target_text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': source_encoding['input_ids'].flatten(),
            'attention_mask': source_encoding['attention_mask'].flatten(),
            'labels': target_encoding['input_ids'].flatten()
        }

# Load the dataset
df = pd.read_csv("toxic.csv")
source_sentences = df["uncivil_comment"].tolist()
target_sentences = df["civil_comment"].tolist()

# Load pretrained model and tokenizer
model_name = "facebook/bart-base"  # Use a valid BART model identifier bart-base, bart-large
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Tokenize and encode the data
inputs = tokenizer(source_sentences, padding=True, truncation=True, return_tensors="pt", max_length=256)
targets = tokenizer(target_sentences, padding=True, truncation=True, return_tensors="pt", max_length=256)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bart_output",  # Specify the output directory where the model will be saved
    num_train_epochs=20,
    per_device_train_batch_size=8,
    save_total_limit=2,
    logging_dir="./bart_logs",
)

# Create the custom dataset
train_dataset = CustomDataset(source_sentences, target_sentences, tokenizer, max_length=256)

# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()

# Save the model explicitly after training
trainer.save_model("./bart_output")




Step,Training Loss
500,0.8181
1000,0.2019
1500,0.1735
2000,0.1483
2500,0.1419
3000,0.1205
3500,0.1123
4000,0.1037
4500,0.0837
5000,0.0825


In [2]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the fine-tuned BART model
fine_tuned_bart_model = BartForConditionalGeneration.from_pretrained("./bart_output")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# Test the model
uncivil_sentence = "Thanks for your contributions. Are you free tonight?"
inputs = tokenizer(uncivil_sentence, return_tensors="pt", max_length=128)
generated_ids = fine_tuned_bart_model.generate(**inputs)
generated_civil_sentence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Civil Sentence:", generated_civil_sentence)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Civil Sentence: Thank you for your contributions. Let's keep this conversation productive.
