In [None]:
import torch
# Last expression of the cell, so its boolean result (whether PyTorch can use
# a CUDA device) is rendered as the cell output.
torch.cuda.is_available()

# Declare a T5 model

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint name passed to both from_pretrained calls below, so the tokenizer
# and the model are loaded from the same checkpoint.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Loading preprocessed Data
Here the preprocessed CSV is loaded and wrapped in a `ToxicDetoxDataset`, which is used later during training.

In [None]:
# pandas is not imported by any earlier cell, so bring it into scope here;
# without this the cell fails with a NameError on a fresh kernel.
import pandas as pd

# The interim file is tab-separated despite its .csv extension.
refined_df = pd.read_csv('../data/interim/refined.csv', delimiter='\t')

In [None]:
from torch.utils.data import Dataset, DataLoader

class ToxicDetoxDataset(Dataset):
    """Dataset of (toxic, non-toxic) sentence pairs tokenized for T5 fine-tuning.

    Args:
        data: pandas DataFrame with string columns ``'toxic'`` and ``'non_toxic'``.
        tokenizer: Hugging Face tokenizer; it is called directly and must
            expose ``pad_token_id``.
        max_length: Every source and target sequence is truncated or padded
            to exactly this many tokens.
    """

    def __init__(self, data, tokenizer, max_length=32):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` into fixed-length ``(1, max_length)`` tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the model inputs for the pair at position ``idx``.

        Returns:
            dict with 1-D tensors of length ``max_length``:
            ``'input_ids'`` and ``'attention_mask'`` for the prefixed toxic
            sentence, and ``'labels'`` for the non-toxic target, where padded
            positions are set to -100.
        """
        row = self.data.iloc[idx]
        input_text = f"paraphrase: {row['toxic']}"
        target_text = row['non_toxic']

        inputs = self._encode(input_text)

        # squeeze(0) drops only the batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence axis if max_length == 1.
        labels = self._encode(target_text)['input_ids'].squeeze(0)

        # Padding must not contribute to the loss: the cross-entropy used by
        # T5ForConditionalGeneration ignores label positions equal to -100,
        # whereas raw pad ids would be learned as targets.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels
        }

# Wrap the loaded frame in the dataset (max_length stays at its default), then
# serve it in shuffled mini-batches of 8 examples.
dataset = ToxicDetoxDataset(data=refined_df, tokenizer=tokenizer)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
)

# Train the network

In [None]:
from torch.optim import Adam
from tqdm import tqdm
import time

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = Adam(model.parameters(), lr=5e-5)

epochs = 3

for epoch in range(epochs):
    model.train()
    start_time = time.time()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", position=0, leave=True)
    # Count batches explicitly: tqdm only refreshes `progress_bar.n` at its own
    # display interval, so using it as the denominator can lag behind the real
    # number of processed batches and inflate the displayed average loss.
    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()

        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Running mean of the per-batch loss over the current epoch.
        progress_bar.set_postfix({'loss': total_loss / step}, refresh=True)

    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    print(f"Time: {epoch_mins}m {epoch_secs}s")

# Persist the fine-tuned weights as soon as training finishes. The
# visualization cell further down reloads the model from this directory and
# rebinds `model`, so on a top-to-bottom run the trained weights would
# otherwise never reach disk.
model_save_path = "../models/model/"
model.save_pretrained(model_save_path)

# Visualizing cross attention
The x-axis represents the input tokens (the sequence fed into the encoder).
The y-axis represents the tokens of the output sequence, one row per token that the model generates.
The colors in the heatmap correspond to the weights of the attention mechanism.
If a cell in the heatmap is bright (toward yellow in the viridis colormap), the output token (y-axis) attends strongly to that input token (x-axis) at that step of the generation process.

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import matplotlib.pyplot as plt
import seaborn as sns

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the fine-tuned model saved under ../models/model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('../models/model')
model.to(device)
model.eval()
model.config.output_attentions = True

phrase = "paraphrase: Shut the fuck up your mouth"

inputs = tokenizer(
    phrase,
    return_tensors="pt",
).to(device)

# Generate the output and ask for the attention weights of every step.
# num_beams=1 keeps a single hypothesis, so the attention rows collected below
# belong to the sequence that is actually returned.
output = model.generate(
    **inputs,
    num_beams=1,
    return_dict_in_generate=True,
    output_attentions=True,
)

# Tokens for the x-axis (encoder input) and the y-axis (generated output).
encoder_text = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
# sequences[0][0] is the decoder start token: it is fed to the decoder rather
# than generated, so it has no attention row of its own. Special tokens are
# kept so that every attention row (including the one for </s>) has a label.
decoded_ids = output.sequences[0][1:]
decoder_text = tokenizer.convert_ids_to_tokens(decoded_ids.tolist())

# Decoder layer whose cross-attention is visualized (0 = first layer)
layer = 0

# output.cross_attentions is indexed as [generation step][decoder layer]; each
# tensor has shape (batch, num_heads, generated_length, source_length).
# For every step: take the chosen layer and the only batch item, average over
# the heads, and keep the row of the most recently generated position.
step_rows = [
    step_attentions[layer][0].mean(dim=0)[-1]
    for step_attentions in output.cross_attentions
]
cross_attention = torch.stack(step_rows).detach().cpu().numpy()

# One row per generated token, one column per input token.
assert cross_attention.shape == (len(decoder_text), len(encoder_text))

# Plotting the heatmap
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cross_attention, annot=True, cmap='viridis', xticklabels=encoder_text, yticklabels=decoder_text, fmt=".2f", ax=ax)
ax.set_xlabel('Input Sequence')
ax.set_ylabel('Output Sequence')
ax.set_title('Cross-Attention Weights for the Tokens of Output')
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=10, rotation=90)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize=10)
plt.show()


In [None]:
# Save whatever `model` currently refers to into ../models/model/ using
# save_pretrained.
# NOTE(review): in top-to-bottom order this cell runs after the visualization
# cell, which rebinds `model` to the checkpoint loaded from this same
# directory -- so here it would write that reloaded checkpoint back rather
# than freshly trained weights. Confirm the intended execution order.
model_save_path = "../models/model/"
model.save_pretrained(model_save_path)