In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random, math, time
from sklearn.model_selection import train_test_split


# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


cuda


In [17]:
import os
import matplotlib.pyplot as plt

In [3]:
# Load the raw email corpus; the file is decoded as latin1.
df = pd.read_csv(
    'Variant_Apologetic_Emails_with_Abbreviation_Expansion.csv',
    encoding="latin1",
)

In [4]:
# Preview the first rows of the raw frame to check the available columns.
df.head()

Unnamed: 0,Informal,Apologetic_Variant_Informal,Formal,Apologetic_Variant_Formal
0,dear all due to the delay in ubs windows accou...,I sincerely apologize for the inconvenience. D...,dear all due to the delay in ubs windows accou...,I sincerely apologize for the inconvenience. D...
1,i wanna put this old dog out of its misery bef...,I sincerely apologize for the inconvenience. I...,i would like to put this old dog out of its mi...,I sincerely apologize for the inconvenience. I...
2,hi there want to see some nasty hot action cum...,Sorry for the disruption. Hi there want to see...,hi there want to see some nasty hot action cum...,My apologies for any trouble caused. Hi there ...
3,hello mark i have attached an outline that wil...,I sincerely apologize for the inconvenience. H...,hello mark i have attached an outline that wil...,I sincerely apologize for the inconvenience. H...
4,cst to brett r peter n tracee cc kelly h sarah...,I regret any inconvenience caused. Central sta...,cst to brett r peter n tracee cc kelly h sarah...,I regret any inconvenience caused. Central sta...


In [5]:
# Keep just the source (apologetic) and target (formal) text columns.
columns_to_keep = ["Apologetic_Variant_Formal", "Formal"]
selected_df = df[columns_to_keep]

# Persist the reduced table, without the index, for the following cells.
selected_df.to_csv("apologetic_formal_only.csv", index=False)

In [6]:
# Reload the reduced CSV written by the previous cell. Note that this rebinds
# `df`: from here on it holds only the two selected columns, not the raw frame.
df = pd.read_csv("apologetic_formal_only.csv")

# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,Apologetic_Variant_Formal,Formal
0,I sincerely apologize for the inconvenience. D...,dear all due to the delay in ubs windows accou...
1,I sincerely apologize for the inconvenience. I...,i would like to put this old dog out of its mi...
2,My apologies for any trouble caused. Hi there ...,hi there want to see some nasty hot action cum...
3,I sincerely apologize for the inconvenience. H...,hello mark i have attached an outline that wil...
4,I regret any inconvenience caused. Central sta...,cst to brett r peter n tracee cc kelly h sarah...


In [10]:
# 80/10/10 split: hold out 20% of the rows first, then cut that hold-out
# in half to obtain the validation and test sets. Both splits are seeded.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Write each split to its own CSV (index omitted), in train/val/test order.
for split_frame, split_path in (
    (train_df, "train_formal_ap.csv"),
    (val_df, "val_formal_ap.csv"),
    (test_df, "test_formal_ap.csv"),
):
    split_frame.to_csv(split_path, index=False)


In [11]:
class EmailToneDataset(Dataset):
    """Paired (apologetic -> formal) email texts tokenized for seq2seq training.

    Every CSV row provides an ``Apologetic_Variant_Formal`` source text, which
    is prefixed with ``"Apologetic: "``, and a ``Formal`` target text. Both are
    padded/truncated to ``max_len`` tokens.

    Args:
        filepath: Path of a CSV file containing the two columns named above.
        tokenizer: Callable tokenizer; when called with ``return_tensors="pt"``
            it must return ``input_ids`` and ``attention_mask`` tensors with a
            leading batch axis of size 1. If it exposes ``pad_token_id``, that
            id is masked out of the labels.
        max_len: Fixed token length used for both source and target.
    """

    # Label id that Hugging Face seq2seq models exclude from the loss.
    IGNORE_INDEX = -100

    def __init__(self, filepath, tokenizer, max_len=100):
        self.data = pd.read_csv(filepath)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (examples) in the CSV."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_len`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row.

        Padding positions in ``labels`` are replaced by ``IGNORE_INDEX`` so the
        loss is computed on real target tokens only.
        """
        row = self.data.iloc[index]
        apology = "Apologetic: " + row['Apologetic_Variant_Formal']
        formal = row['Formal']

        source = self._encode(apology)
        target = self._encode(formal)

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # drop the sequence axis when max_len == 1.
        labels = target['input_ids'].squeeze(0).clone()

        # Without this mask the model is also trained to predict pad tokens,
        # which dominates the loss for short targets.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            labels[labels == pad_token_id] = self.IGNORE_INDEX

        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels
        }


In [13]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer and model are loaded from the same pretrained checkpoint; the
# model is moved to the selected device straight away.
checkpoint_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name).to(device)


2025-04-24 17:08:22.128096: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745514502.149745 1445714 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745514502.156650 1445714 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745514502.174202 1445714 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745514502.174216 1445714 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745514502.174218 1445714 computation_placer.cc:177] computation placer alr

In [14]:
from torch.optim import AdamW

# A single optimizer instance is shared by every call to train_epoch below.
optimizer = AdamW(model.parameters(), lr=5e-5)


def train_epoch(model, dataloader):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch must provide ``input_ids``, ``attention_mask`` and ``labels``
    tensors. The module-level ``optimizer`` and ``device`` are used.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        # Move the three tensors the model consumes onto the training device.
        inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        }
        batch_loss = model(**inputs).loss

        # Backpropagate, apply the update, then clear gradients for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)




In [16]:
def evaluate_loss(model, dataloader):
    """Compute the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Each batch
    must provide ``input_ids``, ``attention_mask`` and ``labels`` tensors,
    which are moved to the module-level ``device``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = {
                name: batch[name].to(device)
                for name in ('input_ids', 'attention_mask', 'labels')
            }
            running_loss += model(**inputs).loss.item()
    return running_loss / len(dataloader)


In [15]:
# Build one dataset per split file, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    EmailToneDataset(csv_path, tokenizer)
    for csv_path in ("train_formal_ap.csv", "val_formal_ap.csv", "test_formal_ap.csv")
)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)


In [18]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Per-epoch loss history, stored as {"epoch": int, "loss": float} records.
train_losses = []
val_losses = []

# Early stopping parameters
best_val_loss = float('inf')
patience = 3  # epochs without validation improvement tolerated before stopping
counter = 0   # consecutive epochs without improvement so far

# Create the output directory BEFORE training: the best-model checkpoint is
# written inside the loop, so the directory must exist by the end of epoch 1.
os.makedirs("outputs3", exist_ok=True)

for epoch in range(50):  # You can increase this if needed
    train_loss = train_epoch(model, train_loader)
    val_loss = evaluate_loss(model, val_loader)

    train_losses.append({"epoch": epoch + 1, "loss": train_loss})
    val_losses.append({"epoch": epoch + 1, "loss": val_loss})

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # Checkpoint the weights of the best epoch seen so far
        torch.save(model.state_dict(), "outputs3/best_model.pt")
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

# Save the loss history to CSV
pd.DataFrame(train_losses).to_csv("outputs3/train_loss.csv", index=False)
pd.DataFrame(val_losses).to_csv("outputs3/val_loss.csv", index=False)

# Plot both curves on one figure, save it, then display it
plt.plot([d['epoch'] for d in train_losses], [d['loss'] for d in train_losses], label='Train Loss')
plt.plot([d['epoch'] for d in val_losses], [d['loss'] for d in val_losses], label='Val Loss')
plt.title("Training vs Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.savefig("outputs3/loss_comparison.png")
plt.show()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 