In [1]:
import pandas as pd
import typing
import kaggle
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW

import logging
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Retrieve the Hewlett essay dataset from Kaggle using the Kaggle API
def download_kaggle_dataset(kaggle_dataset: str ="asap-aes") -> None:
    api = kaggle.api
    print(api.get_config_value('username'))
    kaggle.api.dataset_download_files(kaggle_dataset, path="./data", unzip=True)
    
# load and preprocess the Hewlett essay dataset from the data folder    
def load_and_process(local_dataset_path: str = "./data/training_set_rel3.tsv") -> list: 
    # Load the dataset in to a pandas dataframe
    data = pd.read_csv(local_dataset_path, sep='\t', encoding='ISO-8859-1')

    # Create a new training dataset based on this data
    #train_dataset = []
    input_ids = []
    attention_masks = []
    target_ids = []

    # Retrieve the pre-trained BART tokenizer|
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

    for _, row in data.iterrows():
        essay = row['essay']
        score = row['domain1_score']
        summary = f"This essay scored {score} points."  # Create a summary using the score

        # Preprocess the essay and convert to summary format
        # You may need to modify this based on the specifics of the dataset
        encoded = tokenizer.encode_plus(
            essay,
            summary,
            max_length=1024,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids.append(encoded['input_ids'].squeeze())
        attention_masks.append(encoded['attention_mask'].squeeze())
        target_ids.append(encoded['input_ids'].squeeze())  # use the same input as the target (auto-regressive)
        
    tokenized_training_tensors = {
        'input_ids': torch.cat(input_ids, dim=0),
        'attention_masks': torch.cat(attention_masks, dim=0),
        'target_ids': torch.cat(target_ids, dim=0)
    }
        #train_dataset.append((input_ids, attention_mask, summary))
        
    return tokenized_training_tensors

In [14]:
def create_dataloader(tokenized_training_tensors: dict = None) -> typing.Any:
    batch_size = 4  # Adjust this based on your available resources
    train_data = torch.utils.data.TensorDataset(
        tokenized_training_tensors['input_ids'],
        tokenized_training_tensors['attention_masks'],
        tokenized_training_tensors['target_ids']
    )
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    return train_loader

def fine_tune_bart(model: typing.Any, train_loader: typing.Any, epochs: int = 5) -> None:

    # Step 6: Fine-tune the BART model
    # Note: In this example, we only train for a few epochs, but you can adjust the number of epochs and other hyperparameters based on your needs

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=1e-5)

    num_epochs = epochs
    
    for epoch in range(num_epochs):
        total_loss = 0

        for batch in train_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_masks, target_ids = batch

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_masks,
                decoder_input_ids=target_ids[:-1],  # Exclude the last token from the target
                labels=target_ids[1:]  # Shift the target to the right (auto-regressive)
            )

            loss = outputs.loss
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [15]:
# If data present, read it in, otherwise, download it 
file_path = './data/training_set_rel3.tsv'
if os.path.exists(file_path):
    logging.info('Dataset found.')
else:
    logging.info('Dataset not found, downloading ...')
    download_kaggle_dataset()  

INFO:root:Dataset found.


In [16]:
logging.info('Reading dataset into pandas dataframe.')
tokenized_training_tensors = load_and_process(file_path)

INFO:root:Reading dataset into pandas dataframe.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned f

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [17]:
data_loader = create_dataloader(tokenized_training_tensors)

In [18]:
# Step 1: Load the pre-trained BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

fine_tune_bart(model, data_loader)



ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
tokenized_training_tensors['input_ids'].size(0)

In [None]:
tokenized_training_tensors['attention_masks'].size(0)

In [None]:
tokenized_training_tensors['target_ids'].size(0)

In [33]:
tokenized_training_tensors['target_ids'].squeeze()[:,1:]

IndexError: too many indices for tensor of dimension 1

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for batch in data_loader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_masks, target_ids = batch

KeyboardInterrupt: 