In [1]:
import pandas as pd
import typing
import kaggle
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BartTokenizer, 
    BartForConditionalGeneration, 
    AdamW, 
    BertForSequenceClassification, 
    BertTokenizer,
    AutoModelForSequenceClassification
)

import logging
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = pd.read_csv("./data/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')

In [23]:
data['label'] = np.digitize(data['domain1_score'].sort_values(ascending=False), bins=[20,40,61])

In [24]:
# Create lists to store the tokenized input and target tensors
input_ids = []
attention_masks = []
labels = []

# Retrieve the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for _, row in data.iterrows():
    essay = row['essay']
    label = row['label']

    # Tokenize the essay
    encoded = tokenizer.encode_plus(
        essay,
        add_special_tokens=True,
        max_length=512,  # Adjust the max length as needed
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Append the tokenized input, attention mask, and label tensors
    input_ids.append(encoded['input_ids'].squeeze())
    attention_masks.append(encoded['attention_mask'].squeeze())
    #labels.append(torch.tensor(score).unsqueeze(0))
    labels.append(torch.tensor(label))

    #labels.append(torch.tensor(label))

# Convert the lists to tensors
tokenized_training_tensors = {
    'input_ids': torch.stack(input_ids),
    'attention_masks': torch.stack(attention_masks),
    'labels': torch.stack(labels)
}

In [30]:
print(f'''
{tokenized_training_tensors['input_ids'].shape}
{tokenized_training_tensors['attention_masks'].shape}
{tokenized_training_tensors['labels'].shape}
''')


torch.Size([12976, 512])
torch.Size([12976, 512])
torch.Size([12976])



In [31]:
train_data = torch.utils.data.TensorDataset(
    tokenized_training_tensors['input_ids'],
    tokenized_training_tensors['attention_masks'],
    tokenized_training_tensors['labels']
)
train_loader = DataLoader(train_data)

In [36]:
data['label'].value_counts()

0    11877
1      792
2      307
Name: label, dtype: int64

In [34]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Freeze all the parameters of the pre-trained BERT model
for param in model.parameters():
    param.requires_grad = False

# Modify the classifier layer for the new task
logging.info(f"Model (before new layer): {model}")
model.classifier = nn.Linear(model.config.hidden_size, 3)
model.train()
logging.info(f"Model: {model}")

optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0

    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, labels = batch
        logging.info(f"inputs_id size: {input_ids.shape}") 
        logging.info(f"labels size: {labels.shape}")            


        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_masks,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    logging.info(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

RuntimeError: shape '[-1, 2]' is invalid for input of size 3

In [2]:
BATCH_SIZE = 16

# Retrieve the Hewlett essay dataset from Kaggle using the Kaggle API
def download_kaggle_dataset(kaggle_dataset: str ="asap-aes") -> None:
    api = kaggle.api
    print(api.get_config_value('username'))
    kaggle.api.dataset_download_files(kaggle_dataset, path="./data", unzip=True)
    
# load and preprocess the Hewlett essay dataset from the data folder    
def load_and_process(local_dataset_path: str = "./data/training_set_rel3.tsv") -> list: 
    # Load the dataset into a pandas dataframe
    data = pd.read_csv(local_dataset_path, sep='\t', encoding='ISO-8859-1')

    # Create lists to store the tokenized input and target tensors
    input_ids = []
    attention_masks = []
    labels = []

    # Retrieve the pre-trained BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    for _, row in data.iterrows():
        essay = row['essay']
        score = row['rater1_domain1']#row['domain1_score']
        label = int(score)  # Convert the score to an integer label

        # Tokenize the essay
        encoded = tokenizer.encode_plus(
            essay,
            add_special_tokens=True,
            max_length=512,  # Adjust the max length as needed
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Append the tokenized input, attention mask, and label tensors
        input_ids.append(encoded['input_ids'].squeeze())
        attention_masks.append(encoded['attention_mask'].squeeze())
        #labels.append(torch.tensor(score).unsqueeze(0))
        labels.append(torch.tensor(label))

        #labels.append(torch.tensor(label))

    # Convert the lists to tensors
    tokenized_training_tensors = {
        'input_ids': torch.stack(input_ids),
        'attention_masks': torch.stack(attention_masks),
        'labels': torch.stack(labels)
    }
    return tokenized_training_tensors


def create_dataloader(tokenized_training_tensors: dict = None, batch_size: int = BATCH_SIZE, shuffle: bool = True) -> DataLoader:
    train_data = torch.utils.data.TensorDataset(
        tokenized_training_tensors['input_ids'],
        tokenized_training_tensors['attention_masks'],
        tokenized_training_tensors['labels']
    )
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    return train_loader


def transfer_learning_bert(
    model: typing.Any, 
    train_loader: typing.Any, 
    num_labels: int, 
    epochs: int = 1
) -> None:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # Freeze all the parameters of the pre-trained BERT model
    for param in model.parameters():
        param.requires_grad = False
    
    # Modify the classifier layer for the new task
    logging.info(f"Model (before new layer): {model}")
    model.classifier = nn.Linear(model.config.hidden_size, num_labels)
    model.train()
    logging.info(f"Model: {model}")
    
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    num_epochs = epochs
    
    for epoch in range(num_epochs):
        total_loss = 0
        
        for batch in train_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_masks, labels = batch
            logging.info(f"inputs_id size: {input_ids.shape}") 
            logging.info(f"labels size: {labels.shape}")            

            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_masks,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
        logging.info(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    return model
    


In [3]:
#     # Load the dataset in to a pandas dataframe
#     data = pd.read_csv(local_dataset_path, sep='\t', encoding='ISO-8859-1')

#     # Create a new training dataset based on this data
#     #train_dataset = []
#     input_ids = []
#     attention_masks = []
#     target_ids = []

#     # Retrieve the pre-trained BART tokenizer|
#     tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

#     for _, row in data.iterrows():
#         essay = row['essay']
#         score = row['domain1_score']
#         summary = f"This essay scored {score} points."  # Create a summary using the score

#         # Preprocess the essay and convert to summary format
#         # You may need to modify this based on the specifics of the dataset
#         encoded = tokenizer.encode_plus(
#             essay,
#             summary,
#             max_length=1024,
#             padding='max_length',
#             truncation=True,
#             return_tensors='pt'
#         )

#         input_ids.append(encoded['input_ids'].squeeze())
#         attention_masks.append(encoded['attention_mask'].squeeze())
#         target_ids.append(encoded['input_ids'].squeeze())  # use the same input as the target (auto-regressive)
        
#     tokenized_training_tensors = {
#         'input_ids': torch.cat(input_ids, dim=0),
#         'attention_masks': torch.cat(attention_masks, dim=0),
#         'target_ids': torch.cat(target_ids, dim=0)
#     }
#         #train_dataset.append((input_ids, attention_mask, summary))
        
    #return tokenized_training_tensors
    
    
    #----
    
# def create_dataloader(tokenized_training_tensors: dict = None) -> typing.Any:
#     batch_size = 4  # Adjust this based on your available resources
#     train_data = torch.utils.data.TensorDataset(
#         tokenized_training_tensors['input_ids'],
#         tokenized_training_tensors['attention_masks'],
#         tokenized_training_tensors['target_ids']
#     )
#     train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
#     return train_loader



# def fine_tune_bart(model: typing.Any, train_loader: typing.Any, epochs: int = 5) -> None:

#     # Step 6: Fine-tune the BART model
#     # Note: In this example, we only train for a few epochs, but you can adjust the number of epochs and other hyperparameters based on your needs

#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     model.train()

#     optimizer = AdamW(model.parameters(), lr=1e-5)

#     num_epochs = epochs
    
#     for epoch in range(num_epochs):
#         total_loss = 0

#         for batch in train_loader:
#             batch = tuple(t.to(device) for t in batch)
#             input_ids, attention_masks, target_ids = batch

#             outputs = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_masks,
#                 decoder_input_ids=target_ids[:-1],  # Exclude the last token from the target
#                 labels=target_ids[1:]  # Shift the target to the right (auto-regressive)
#             )

#             loss = outputs.loss
#             total_loss += loss.item()

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#         avg_loss = total_loss / len(train_loader)
#         print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
        
        


In [4]:
# If data present, read it in, otherwise, download it 
file_path = './data/training_set_rel3.tsv'
if os.path.exists(file_path):
    logging.info('Dataset found.')
else:
    logging.info('Dataset not found, downloading ...')
    download_kaggle_dataset()  

INFO:root:Dataset found.


In [5]:
logging.info('Reading dataset into pandas dataframe.')
tokenized_training_tensors = load_and_process(file_path)

INFO:root:Reading dataset into pandas dataframe.


In [6]:
data_loader = create_dataloader(tokenized_training_tensors)

In [7]:
#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased"')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# Assume you have a train_loader that provides input_ids, attention_masks, and labels
num_labels = 30 # Replace with the actual number of classes in the target column
model_transfer = transfer_learning_bert(model=model, train_loader=data_loader, num_labels=num_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

ValueError: Expected input batch_size (15) to match target batch_size (1).

In [None]:
# # Step 1: Load the pre-trained BART model
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# fine_tune_bart(model, data_loader)

# Example usage:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
data = pd.read_csv("./data/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')

In [None]:
len(data['rater1_domain1'].value_counts())

In [None]:
n=0
for batch in data_loader:
    print(batch)
    n+=1

In [None]:
model.bert

In [None]:
input_ids