In [None]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Define your custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset of (source, target) sentence pairs for seq2seq training.

    Each item is tokenized on access and returned as a dict with the keys
    ``input_ids``, ``attention_mask`` and ``labels``, every value being a
    1-D tensor of length ``max_length``.

    Args:
        source_sentences: Indexable sequence of input texts.
        target_sentences: Indexable sequence of reference texts, aligned
            index-by-index with ``source_sentences``.
        tokenizer: Object exposing ``encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If the two sentence sequences differ in length.
    """

    # Label value that the cross-entropy loss skips (PyTorch's default
    # ``ignore_index``); used so padding does not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, source_sentences, target_sentences, tokenizer, max_length):
        if len(source_sentences) != len(target_sentences):
            raise ValueError(
                "source_sentences and target_sentences must have the same length "
                f"(got {len(source_sentences)} and {len(target_sentences)})"
            )
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def _encode(self, text):
        """Tokenize one text, padded/truncated to exactly ``max_length`` tokens."""
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            truncation=True,
        )

    def __getitem__(self, idx):
        source_encoding = self._encode(self.source_sentences[idx])
        target_encoding = self._encode(self.target_sentences[idx])

        # Mask padded label positions via the target attention mask rather
        # than by comparing against the pad id, so a real token that happens
        # to share the pad id is never masked by accident.
        target_ids = target_encoding['input_ids'].flatten()
        target_mask = target_encoding['attention_mask'].flatten()
        labels = target_ids.masked_fill(target_mask == 0, self.IGNORE_INDEX)

        return {
            'input_ids': source_encoding['input_ids'].flatten(),
            'attention_mask': source_encoding['attention_mask'].flatten(),
            'labels': labels,
        }

# Fine-tuning script: load sentence pairs from CSV, fine-tune a pretrained
# MarianMT model on them, and save the result.
MAX_LENGTH = 256
OUTPUT_DIR = "./marian_output"

# Load the dataset
df = pd.read_csv("toxic.csv")
# A row with a missing comment loads as NaN (a float), which the tokenizer
# cannot encode; drop such rows so training does not fail part-way through.
df = df.dropna(subset=["uncivil_comment", "civil_comment"])
source_sentences = df["uncivil_comment"].astype(str).tolist()
target_sentences = df["civil_comment"].astype(str).tolist()

# Load pretrained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"  # Use a valid MarianMT model identifier
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Tokenization happens per example inside CustomDataset.__getitem__, so the
# corpus is not pre-encoded here (doing so only built large unused tensors).

# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,  # Specify the output directory where the model will be saved
    num_train_epochs=20,
    per_device_train_batch_size=8,
    save_total_limit=2,  # Keep at most two checkpoints on disk
    logging_dir="./marian_logs",
)

# Create the custom dataset
train_dataset = CustomDataset(source_sentences, target_sentences, tokenizer, max_length=MAX_LENGTH)

# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()

# Save the model explicitly after training
trainer.save_model(OUTPUT_DIR)


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Inference cell: load the fine-tuned model and rewrite one sample sentence.

# Load the fine-tuned MarianMT model
fine_tuned_marian_model = MarianMTModel.from_pretrained("./marian_output")
# The tokenizer is loaded from the base checkpoint, not from ./marian_output
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ROMANCE")

# Test the model
uncivil_sentence = "Fuck! What have you done?"
# max_length is only enforced when truncation is enabled; without it the
# tokenizer ignores the limit and lets over-long inputs through.
inputs = tokenizer(uncivil_sentence, return_tensors="pt", max_length=128, truncation=True)
generated_ids = fine_tuned_marian_model.generate(**inputs)
generated_civil_sentence = tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

# Split on any leftover SentencePiece word-boundary markers ('▁'), drop the
# empty pieces, and join the rest into words.
subword_units = [unit for unit in generated_civil_sentence.split('▁') if unit]
cleaned_sentence = ' '.join(subword_units)

print("Generated Civil Sentence:", cleaned_sentence)
