In [2]:
#pip install datasets
#pip install scikit-learn
#pip install torch
#pip install numpy
#pip install pandas
#pip install transformers
#pip install torch torchvision
#pip install SentencePiece
#pip install ipywidgets
#pip install --upgrade pip setuptools wheel
#pip install wandb

In [3]:
pip install tdqm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [7]:
train_file = "/home1/kmayya/Pipeline/synthetic_dataset.csv"
df = pd.read_csv(train_file)

In [8]:
import sklearn
from sklearn.model_selection import train_test_split


X = df['dense_frame_descriptions']
y = df['description']

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.1,  random_state=42)

In [9]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [14]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.dense_frame_descriptions
        self.ctext = self.data.description

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [15]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in tqdm(enumerate(loader, 0)):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [16]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [18]:
import wandb
from tqdm import tqdm

def main():
    wandb.init(project="transformers_tutorials_summarization")

    config = wandb.config        
    config.TRAIN_BATCH_SIZE = 2    
    config.VALID_BATCH_SIZE = 2    
    config.TRAIN_EPOCHS = 2       
    config.VAL_EPOCHS = 1 
    config.LEARNING_RATE = 1e-4    
    config.SEED = 42              
    config.MAX_LEN = 512
    config.SUMMARY_LEN = 150 

    torch.manual_seed(config.SEED) 
    np.random.seed(config.SEED) 
    torch.backends.cudnn.deterministic = True

    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    
    df = pd.read_csv(train_file)
    df = df[['dense_frame_descriptions','description']]
    df.dense_frame_descriptions = 'summarize keyframe captions: ' + df.dense_frame_descriptions
    print(df.head())

    
    train_size = 0.8
    train_dataset=df.sample(frac=train_size,random_state = config.SEED)
    val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))


    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    train_params = {
        'batch_size': config.TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    val_params = {
        'batch_size': config.VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model = model.to(device)

    optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

    wandb.watch(model, log="all")
    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    model.eval()
    torch.save(model.state_dict(), 't5_on_synthetic.pth')

    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        print('Output Files generated for review')

if __name__ == '__main__':
    main()

                            dense_frame_descriptions  \
0  summarize keyframe captions: ['a large lawn in...   
1  summarize keyframe captions: ['a rodeo arena w...   
2  summarize keyframe captions: ['a group of youn...   
3  summarize keyframe captions: ["a close-up of a...   
4  summarize keyframe captions: ["a person's hand...   

                                         description  
0  A backyard is shown as a man sitting on an ele...  
1  A bull comes out of the gate at a rodeo and  m...  
2  We see people doing a dance routine in a baske...  
3  A man is standing outside in his front lawn wi...  
4  A towel and glass are on a table next to a pie...  
FULL Dataset: (1027, 2)
TRAIN Dataset: (822, 2)
TEST Dataset: (205, 2)
Initiating Fine-Tuning for the model on our dataset


0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch: 0, Loss:  11.284489631652832


411it [00:56,  7.26it/s]
2it [00:00,  7.72it/s]

Epoch: 1, Loss:  1.495384931564331


411it [00:53,  7.64it/s]


Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100
Output Files generated for review


In [24]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")


inputs = tokenizer("summarize keyframe captions: a man in a kitchen, standing in front of a white countertop with a computer monitor on it. He is holding a chainsaw and appears to be in the process of cutting through a piece of wood. The kitchen has white cabinets and a tiled floor. There is a sink and a window in the background, and a blue curtain on the right side of the image. The man is wearing a gray shirt and is focused on the task at hand.", return_tensors="pt")
inputs.to(device)

model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

model.load_state_dict(torch.load("/home1/kmayya/Pipeline/t5_on_synthetic.pth"))
model.eval()

outputs = model.generate(
    **inputs, 
    max_length=1500,  
    num_beams=5,    
    early_stopping=True
)

decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Output:", decoded_output)

  model.load_state_dict(torch.load("/home1/kmayya/Pipeline/t5_on_synthetic.pth"))


Generated Output: keyframe captions: "a man in a kitchen cutting through a piece of wood. He is wearing a gray shirt and appears to be in the process of cutting through a piece of wood. He is holding a chainsaw and appears to be in the process of cutting through a piece of wood. The kitchen has white cabinets and a tiled floor.
