<a href="https://colab.research.google.com/github/PavlosPo/nlp-optimizers/blob/pavlos-playground/pytorch-experiments-0-fosi-adam/playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FOSI Classes

In [1]:
# !unzip fosi/fosi.zip -d ./fosi/

## Working Example

In [27]:
!pip install torchopt
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [28]:
import torch
import torchvision.transforms as transforms
from torch.utils.data.dataloader import default_collate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from torch.optim import Adam
import torch.nn as nn
import torchopt
import functorch
import evaluate

from datasets import load_dataset
from fosi import fosi_adam_torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained DistilBERT model and tokenizer
base_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:

# Define a function to preprocess the dataset
def prepare_dataset(example):
    return tokenizer(example['sentence'], truncation=True, padding="max_length", return_tensors='pt')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Define a sample dataset (replace this with your custom dataset)
# Example: IMDB movie review dataset
dataset = load_dataset('glue', 'cola').map(prepare_dataset, batched=True)

# Split dataset into train and test sets
train_dataset = dataset['train'].select(range(32)).remove_columns(['sentence', 'idx']).rename_column('label', 'labels')

test_dataset = dataset['test'].select(range(32)).remove_columns(['sentence', 'idx']).rename_column('label', 'labels')


In [30]:
test_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 32
})

In [31]:
# Define data loaders
batch_size = 8
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)

# Define loss function
loss_fn = torch.nn.CrossEntropyLoss()

In [32]:
# def loss_fn(params, batch):
#     preds = model(params, batch['input_ids'], batch['attention_mask']).logits
#     loss = nn.CrossEntropyLoss()(preds, batch)
#     return loss

def loss_fn(params, buffers, input_ids, attention_mask, labels):
    logits = model(params, buffers=buffers, input_ids=input_ids, attention_mask=attention_mask).logits
    loss = nn.CrossEntropyLoss()(logits, labels)
    return loss

In [33]:
def accuracy(params, buffers, input_ids, attention_mask, labels):
    preds = model(params, buffers=buffers, input_ids=input_ids, attention_mask=attention_mask).logits
    acc = evaluate.load('accuracy').compute(predictions=torch.argmax(preds, dim=1).cpu().numpy(), references=labels.cpu().numpy())
    return acc

In [35]:


# Train the model
for epoch in range(2):

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):

        if i == 0: # Initialize optimizer and model parameters
            print(f"input_ids: {data['input_ids']}")
            print(f"attention_mask: {data['attention_mask']}")
            print(f"labels: {data['labels']}")
            # Define optimizer
            base_optimizer = torchopt.adam(lr=0.001)
            optimizer = fosi_adam_torch(base_optimizer, loss_fn, next(iter(trainloader)), num_iters_to_approx_eigs=500, alpha=0.01)
            model, params, buffers = functorch.make_functional_with_buffers(model=base_model)
            opt_state = optimizer.init(params)
            model.train()

        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        label = data['labels']
        # optimizer.zero_grad()
        # outputs = model(input_ids, attention_mask=attention_mask, labels=label)

        loss = loss_fn(params=params, buffers=buffers, input_ids=input_ids, attention_mask=attention_mask, labels=label)

        print(f"Calculating Gradients\n")
        grads = torch.autograd.grad(loss, params)
        print(f"Grads: \n{grads}\n")
        print("\n")
        print("*"*100)
        print("\n")
        print(f"Calculating updates in the model...\n")
        updates, opt_state = optimizer.update(grads, opt_state, params)
        print("Finding the updates of the model finished...\n")
        print("Applying updating\n")
        params = torchopt.apply_updates(params, updates, inplace=True)

        # print statistics
        running_loss += loss.item()
        if (i + 1) % 100 == 0:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

    acc = 0.0
    num_samples = 0
    for i, batch in enumerate(testloader, 0):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Convert labels to PyTorch tensor if not already

        acc += accuracy(params, batch, input_ids, attention_mask, labels)
        num_samples += batch[0].shape[0]

    acc /= num_samples
    print(f'Test accuracy: {acc}')

print('Finished Training')


input_ids: tensor([[  101,  4302, 19055,  ...,     0,     0,     0],
        [  101,  1996,  2934,  ...,     0,     0,     0],
        [  101,  3021,  2628,  ...,     0,     0,     0],
        ...,
        [  101,  2057,  7581,  ...,     0,     0,     0],
        [  101, 11458, 25756,  ...,     0,     0,     0],
        [  101,  3021, 19055,  ...,     0,     0,     0]])
attention_mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
labels: tensor([1, 1, 1, 1, 1, 0, 1, 1])
Returned ESE function. Lanczos order (m) is 20 .


  warn_deprecated('make_functional_with_buffers', 'torch.func.functional_call')


[1;30;43mΗ έξοδος ροής περικόπηκε στις τελευταίες 5000 γραμμές.[0m
        -2.6962e-05, -3.2949e-04]), tensor([[ 1.9776e-04, -7.6905e-04, -1.5629e-04,  ...,  1.2161e-04,
          1.9258e-05,  6.5751e-05],
        [-3.4516e-05,  9.3220e-05,  5.3450e-05,  ...,  3.2381e-05,
         -2.5515e-06, -2.5489e-05],
        [ 2.8595e-04, -5.8081e-04, -7.5039e-05,  ..., -4.4617e-04,
          1.8227e-07,  1.8730e-06],
        ...,
        [ 3.1853e-04, -6.2148e-04, -9.2663e-05,  ..., -3.0807e-04,
          2.4380e-05,  4.3514e-05],
        [ 5.4429e-05, -9.6927e-04, -1.1966e-04,  ..., -7.3853e-04,
          8.0089e-06,  2.3560e-05],
        [-7.4608e-05,  1.4134e-04,  1.0619e-04,  ...,  1.2843e-03,
         -5.8149e-06, -3.8163e-05]]), tensor([-6.0072e-04,  1.0217e-05, -3.9388e-04,  4.8017e-04, -3.8519e-04,
         1.7608e-03, -2.6975e-04,  1.5352e-03, -1.1545e-03, -1.0285e-03,
         5.6630e-04, -4.9453e-04,  5.2809e-04,  2.4990e-04,  5.7690e-04,
         7.9925e-04,  5.8597e-04,  2.4451e-

TypeError: labels is not an instance of torch.Tensor

In [None]:
# # Define training loop
# for epoch in range(3):  # Adjust number of epochs as needed
#     model.train()
#     for i, batch in enumerate(train_dataset):
#         input_ids = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         labels = batch['label']

#         #This is taking care automatically
#         # optimizer.zero_grad() This is taking care automatically

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#         # if (i + 1) % 100 == 0:
#         #     print(f'[{epoch + 1}, {i + 1:5d}] loss: {loss.item():.3f}')

#     # Evaluate on test set
#     model.eval()
#     total = 0
#     correct = 0
#     with torch.no_grad():
#         for batch in test_dataset:
#             input_ids = batch['input_ids']
#             attention_mask = batch['attention_mask']
#             labels = batch['label']
#             print("Labels: \n", labels)

#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             logits = outputs.logits
#             _, predicted = torch.max(logits, 1)
#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()

#     accuracy = correct / total
#     print(f'Test accuracy: {accuracy:.4f}')

# print('Finished Training')
