In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        inp, hyp, label = row['inp'], row['hyp'], row['label']
        prompt = f'PROMPT: {inp} \n\n PARTIAL RESPONSE: {hyp}'

        inputs = self.tokenizer.encode_plus(
            prompt,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

class T5BinaryClassifier(pl.LightningModule):
    def __init__(self, model_name, tokenizer, learning_rate, max_len=128):
        super().__init__()

        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
        self.max_len = max_len

    def forward(self, input_ids, attention_mask, labels=None):
        if labels is not None:
            return self.model(input_ids, attention_mask=attention_mask, labels=labels)
        else:
            return self.model.generate(input_ids, attention_mask=attention_mask)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['label']
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['label']
        logits = self(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=-1)
        accuracy = (preds == labels).float().mean()
        self.log('val_accuracy', accuracy)
        return accuracy

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.learning_rate)

def balance_dataframe(dataframe):
    label_counts = dataframe['label'].value_counts()
    min_count = label_counts.min()
    balanced_data = dataframe.groupby('label').apply(lambda grp: grp.sample(min_count)).reset_index(drop=True)
    return balanced_data

def train_val_split(dataframe, test_size=0.2, random_state=42):
    unique_inp = dataframe['inp'].unique()
    train_inp, test_inp = train_test_split(unique_inp, test_size=test_size, random_state=random_state)
    
    train_df = dataframe[dataframe['inp'].isin(train_inp)].reset_index(drop=True)
    test_df = dataframe[dataframe['inp'].isin(test_inp)].reset_index(drop=True)
    
    return train_df, test_df

def train(dataframe, model_name='t5-small', epochs=5, batch_size=8, learning_rate=3e-5, max_len=128, val_interval=1):
    # Balance DataFrame and split into train and test
    dataframe = balance_dataframe(dataframe)
    train_df, test_df = train_val_split(dataframe)

    tokenizer = T5Tokenizer.from_pretrained(model_name)
    train_dataset = CustomDataset(train_df, tokenizer, max_len)
    test_dataset = CustomDataset(test_df, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = T5BinaryClassifier(model_name, tokenizer, learning_rate, max_len)
    trainer = pl.Trainer(
        max_epochs=epochs,
        # gpus=torch.cuda.device_count(),
        log_every_n_steps=val_interval,
        check_val_every_n_epoch=val_interval,
    )
    trainer.fit(model, train_loader, val_loader)

In [8]:
# Replace with your actual DataFrame
inpdf = pd.read_json("output/traingoodclassif.json", lines=True, orient="records")

In [11]:
inpdf['label'] = (inpdf['sco']>.85).astype(int)

In [16]:
# Train the model
train(inpdf, model_name='stanfordnlp/SteamSHP-flan-t5-large', epochs=5, batch_size=8, learning_rate=3e-5, val_interval=1)

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 4 processes
----------------------------------------------------------------------------------------------------

You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

ProcessExitedException: process 3 terminated with signal SIGSEGV