In [11]:
from datasets import load_dataset
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [7]:
# Read the combined dataset CSV into a DataFrame.
data = pd.read_csv('data_preprocess/datasets_combine.csv')

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [8]:
# Show the loaded DataFrame as this cell's output (quick sanity check of the columns and rows).
data

Unnamed: 0,sentence,label
0,Claims she suffered catalogue of abuse at hand...,0
1,Six crew and 158 passengers evacuated from Ame...,0
2,ISABELLA:O just but severe law!I had a brother...,1
3,"Prosecutors say the two claimed $340,000 inten...",0
4,Moyes was sacked by Premier League club in Apr...,0
...,...,...
15769,YORK:'Twas by rebellion against his king,1
15770,"Second Conspirator:Most noble sir,If you do ho...",1
15771,FRIAR THOMAS:May your grace speak of it?DUKE V...,1
15772,Arsenal threw away a 3-0 loss to draw 3-3 with...,0


In [9]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Iterable of input strings (list, tuple, pandas Series, ...).
        labels: Iterable of integer class labels, aligned one-to-one with ``texts``.
        tokenizer: Hugging Face-style tokenizer. It is called as
            ``tokenizer(text, ...)`` and must return a mapping that contains an
            ``'input_ids'`` tensor (and ``'attention_mask'`` when requested).
        max_length: Each item is padded or truncated to this many tokens.
        return_attention_mask: When True, every item additionally carries an
            ``'attention_mask'`` entry marking real tokens versus padding.
            Defaults to False, which yields only ``'input_ids'`` and ``'label'``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, return_attention_mask=False):
        # Materialize both inputs as lists so that __getitem__ is strictly
        # positional. A pandas Series would otherwise be indexed by label, which
        # breaks (KeyError or wrong row) once the index has been shuffled/subset.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.return_attention_mask = return_attention_mask

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return it with its label.

        Returns:
            dict with ``'input_ids'`` (1-D tensor, flattened from the tokenizer's
            batched output), ``'label'`` (scalar ``torch.long`` tensor) and, when
            ``return_attention_mask`` is True, ``'attention_mask'`` (1-D tensor).
        """
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=self.return_attention_mask,
            return_tensors='pt',
        )
        item = {
            # return_tensors='pt' yields a leading batch dimension; drop it so the
            # DataLoader can stack items into (batch, max_length).
            'input_ids': encoding['input_ids'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }
        if self.return_attention_mask:
            item['attention_mask'] = encoding['attention_mask'].flatten()
        return item
        

In [13]:
# Reproducibility and shared hyper-parameters for this cell.
SEED = 42          # fixes the train/validation split and torch's global RNG
MAX_LENGTH = 128   # tokens per example after padding/truncation
BATCH_SIZE = 32
# Seed torch's global generator, which the shuffling DataLoader draws from,
# so that Restart & Run All produces the same batch order.
torch.manual_seed(SEED)

# Initialize the pretrained T5 tokenizer and model (model is moved to `device`)
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
VOCAB_SIZE = tokenizer.vocab_size

# Prepare the datasets: hold out 20% of the rows for validation. `random_state`
# keeps the split identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['sentence'], data['label'], test_size=0.2, random_state=SEED
)
# `.tolist()` discards the shuffled pandas index so the datasets index by position.
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_length=MAX_LENGTH)
val_dataset = TextDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, max_length=MAX_LENGTH)

# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
for epoch in range(10):  # Number of epochs
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)