In [None]:
!pip install transformers[torch]
!pip install accelerate -U

In [2]:
import json
from transformers import (
    Trainer,
    TrainingArguments,
    GPT2Tokenizer,
    GPT2ForSequenceClassification,
)
import torch


def read_json(file_paths):
    data = []
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            recipe_data = json.load(file)
            data.extend(recipe_data)
    return data


def extract_data(data):
    dataset = []
    for entry in data:
        image_data = entry['objects']
        step = entry['step']
        formatted_entry = {'image_data': image_data, 'step': step}
        dataset.append(formatted_entry)
    return dataset


def prepare_inputs(dataset, tokenizer):
    texts = [f"Image Data: {entry['image_data']}" for entry in dataset]
    labels = [entry['step'] - 1 for entry in dataset]

    # Adding a padding token to the tokenizer
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # tokenizer.pad_token = '[PAD]'

    # Tokenize and ensure uniform sequence length
    max_length = 128  # Set an appropriate maximum length
    tokenized_inputs = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenized_inputs, labels


def prepare_dataset(tokenized_inputs, labels):
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, tokenized_inputs, labels):
            self.tokenized_inputs = tokenized_inputs
            self.labels = labels

        def __getitem__(self, idx):
            item = {
                'input_ids': torch.tensor(self.tokenized_inputs['input_ids'][idx]),
                'attention_mask': torch.tensor(
                    self.tokenized_inputs['attention_mask'][idx]
                ),
                'labels': torch.tensor(self.labels[idx]),
            }
            return item

        def __len__(self):
            return len(self.labels)

    return CustomDataset(tokenized_inputs, labels)


def train_model(model, train_dataset, training_args, tokenizer):
    # Create a custom collator to handle variable-length sequences
    def custom_collator(data):
        input_ids = [item['input_ids'] for item in data]
        attention_masks = [item['attention_mask'] for item in data]
        labels = [item['labels'] for item in data]

        inputs = tokenizer.pad(
            {"input_ids": input_ids, "attention_mask": attention_masks},
            padding='longest',
            return_tensors="pt",
        )

        return {
            'input_ids': inputs.input_ids,
            'attention_mask': inputs.attention_mask,
            'labels': torch.tensor(labels),
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=custom_collator,  # Use the custom collator
    )
    trainer.train()


def main():
    # file_paths = [
    #     'scripts/evaluations/resource/lang_train/data_quesadilla_2023.06.16-18.53.43.json',
    #     'scripts/evaluations/resource/lang_train/data_quesadilla_2023.06.16-18.57.48.json',
    # ]  # Add all file paths
    file_paths = ["/content/data_quesadilla_2023.06.16-18.53.43.json",
                  "/content/data_quesadilla_2023.06.16-18.57.48.json"]
    data = read_json(file_paths)
    dataset = extract_data(data)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = '[PAD]'
    tokenized_inputs, labels = prepare_inputs(dataset, tokenizer)
    train_dataset = prepare_dataset(tokenized_inputs, labels)

    # Load model
    model = GPT2ForSequenceClassification.from_pretrained(
        'gpt2', pad_token_id=tokenizer.pad_token_id, num_labels=len(set(labels))
    )
    model.resize_token_embeddings(len(tokenizer))

    # Training arguments
    training_args = TrainingArguments(
        output_dir='results/',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=100,
        save_total_limit=2,
    )

    # Start training
    train_model(model, train_dataset, training_args, tokenizer)


if __name__ == "__main__":
    main()


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  'input_ids': torch.tensor(self.tokenized_inputs['input_ids'][idx]),
  'attention_mask': torch.tensor(


Step,Training Loss


  'input_ids': torch.tensor(self.tokenized_inputs['input_ids'][idx]),
  'attention_mask': torch.tensor(


In [8]:
file_paths = ["/content/data_quesadilla_2023.06.16-18.53.43.json",
                  "/content/data_quesadilla_2023.06.16-18.57.48.json"]
data = read_json(file_paths)
dataset = extract_data(data)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
tokenized_inputs, labels = prepare_inputs(dataset, tokenizer)

train_dataset = prepare_dataset(tokenized_inputs, labels)

model = GPT2ForSequenceClassification.from_pretrained(
        '/content/results/checkpoint-100', pad_token_id=tokenizer.pad_token_id, num_labels=len(set(labels))
    )

In [12]:
import numpy as np

def compute_accuracy(predictions, labels):
    return np.sum(predictions == labels) / len(labels)

def get_predictions(model, data_loader):
    predictions = []
    true_labels = []
    for batch in data_loader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions.extend(np.argmax(outputs.logits.cpu().numpy(), axis=1))
        true_labels.extend(batch['labels'].cpu().numpy())
    return predictions, true_labels

# Use DataLoader to iterate through the train_dataset in batches
data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8)

# Get predictions and true labels
predictions, true_labels = get_predictions(model, data_loader)

# Calculate accuracy
accuracy = compute_accuracy(predictions, true_labels)
print(f"Training Accuracy: {accuracy * 100:.2f}%")

  'input_ids': torch.tensor(self.tokenized_inputs['input_ids'][idx]),
  'attention_mask': torch.tensor(


Training Accuracy: 0.00%


In [None]:
# idea: to give reward to increasing step number predictions with time
# cons: we don't know when the recipe has started


In [14]:
predictions

[6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 2,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 2,
 6,
 2,
 6,
 6,
 3,
 2,
 3,
 3,
 6,
 2,
 6,
 6,
 1,
 6,
 2,
 6,
 3,
 6,
 6,
 6,
 3,
 6,
 2,
 3,
 6,
 6,
 2,
 6,
 3,
 6,
 6,
 2,
 6,
 2,
 6,
 3,
 3,
 2,
 6,
 6,
 6,
 6,
 2,
 3,
 6,
 2,
 2,
 6,
 3,
 3,
 3,
 3,
 6,
 6,
 3,
 6,
 2,
 6,
 3,
 6,
 6,
 6,
 6,
 2,
 2,
 3,
 3,
 2,
 6,
 6,
 2,
 2,
 6,
 6,
 6,
 6,
 6,
 3,
 6,
 6,
 3,
 6,
 6,
 3,
 2,
 6,
 6,
 6,
 6,
 6,
 3,
 6,
 6,
 6,
 3,
 6,
 6,
 3,
 6,
 6,
 6,
 2,
 3,
 6,
 6,
 6,
 3,
 6,
 6,
 3,
 3,
 3,
 2,
 6,
 6,
 6,
 2,
 3,
 6,
 2,
 2,
 6,
 6,
 2,
 6,
 6,
 6,
 6,
 3,
 3,
 2,
 6,
 3,
 2,
 2,
 6,
 2,
 6,
 3,
 6,
 6,
 6,
 6,
 6,
 3,
 6,
 6,
 6,
 3,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 3,
 2,
 3,
 4,
 6,
 6,
 4,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 3,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,


In [15]:
true_labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
