In [47]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
import lightning as L
import numpy as np
from torch.utils.data import DataLoader
import torch.utils.data as data
from torchvision import datasets
from transformers import BertModel, BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AdamW

from datasets import load_dataset, load_metric, concatenate_datasets
from collections import OrderedDict


In [14]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        # self.example_input_array = torch.Tensor(32, 1, 28, 28)
        # self.l1 = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
        self.l1 = model

    def forward(self, input_ids, attention_mask, token_type_ids):
        h, _, attn = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        h_cls = h[:, 0]
        logits = self.W(h_cls)
        return logits, attn


class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        # self.example_input_array = torch.Tensor(32, 1, 28, 28)
        # self.l1 = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))
        # create me a MLP that takes the logits from bert and train a simple layer
        self.l1 = MLP(3, 64, 28 * 28)

    def forward(self, x):
        return self.l1(x)


class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
        

    

class LitAutoEncoder(L.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        # self.example_input_array = torch.Tensor(32, 1, 28, 28)
        self.encoder = encoder
        self.decoder = decoder

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        loss, _ = self.encoder(
                input_ids = input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                # label=labels
                )

        tqdm_dict = {"train_loss": loss}
        output = OrderedDict({
            "loss": loss,
            "progress_bar": tqdm_dict,
            "log": tqdm_dict
            })

        return output
    
    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        loss, logits = self.encoder(
                input_ids = input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                # label=labels
                )
        labels_hat = torch.argmax(logits, dim=1)

        correct_count = torch.sum(labels == labels_hat)

        if self.on_gpu:
            correct_count = correct_count.cuda(loss.device.index)

        output = OrderedDict({
            "val_loss": loss,
            "correct_count": correct_count,
            "batch_size": len(labels)
            })
        return output

    def configure_optimizers(self):
        param_optimizer = list(self.encoder.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                    "weight_decay_rate": 0.01
                    },
                {
                    "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                    "weight_decay_rate": 0.0
                    },
                ]
        optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=2e-5,
                )
        return optimizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


In [9]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [10]:
dataset = load_dataset("glue", "cola")
metric = load_metric('glue', "cola")

dataset1 = concatenate_datasets([dataset["train"], dataset["validation"]])

# SPLIT DATA (seed = 1)
s = 1
dataset2 = dataset1.train_test_split(test_size=0.1666666666666, stratify_by_column='label')

train = dataset2["train"]
valid = dataset2["test"].train_test_split(test_size=0.5, stratify_by_column='label')["train"]
test = dataset2["test"].train_test_split(test_size=0.5, stratify_by_column='label')["test"]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [11]:
print(train)

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 7995
})


In [27]:

encoded_train = train.map(preprocess_function, batched=True)
encoded_valid = valid.map(preprocess_function, batched=True)
encoded_test = test.map(preprocess_function, batched=True)

In [28]:
print(encoded_train)

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7995
})


In [61]:
# model
model = LitAutoEncoder(Encoder(), Decoder())

# train model
# train with both splits
trainer = L.Trainer()

trainer.fit(model, encoded_train, encoded_valid)


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs



  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 109 M 
1 | decoder | Decoder | 51.2 K
------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
436.528   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

TypeError: _forward_unimplemented() got an unexpected keyword argument 'input_ids'