In [1]:
import os
import pretty_errors
import pandas as pd
from typing import Optional
import torch
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import RobertaTokenizerFast
from transformers.models.auto.tokenization_auto import AutoTokenizer
from sklearn.model_selection import train_test_split
from utils import TYPES, TYPES_DICT
import math

import torch
from pytorch_lightning import LightningModule
from torch import Tensor
from torch.nn import Dropout, Linear, Module
from torch.nn.modules.loss import BCEWithLogitsLoss
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.transformer import TransformerEncoder, TransformerEncoderLayer
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification

# from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.functional.classification.f_beta import f1_score

In [24]:

class BaseDataset(Dataset):
    """Dataset of post texts paired with one encoded label per entry of ``TYPES``.

    Each item is tokenized on access and padded/truncated to ``max_length``
    tokens, so every returned tensor has a fixed length.

    Args:
        X_y: Pair ``(X, y)``. ``X`` is indexed positionally (``X[idx]``) to
            get the text; ``y`` is read with ``y.iloc[idx, i]``, so its i-th
            column must correspond to ``TYPES[i]``.
        tokenizer_path: Name or directory handed to
            ``RobertaTokenizerFast.from_pretrained``. Required, despite the
            ``None`` default kept for signature compatibility.
        max_length: Token length every sample is padded/truncated to.

    Raises:
        ValueError: If ``tokenizer_path`` is ``None``.
    """

    def __init__(
        self,
        X_y,
        tokenizer_path: Optional[str] = None,
        max_length: int = 512,
    ):
        super().__init__()
        if tokenizer_path is None:
            # Fail early with a readable message instead of an obscure error
            # from inside ``from_pretrained``.
            raise ValueError("tokenizer_path is required to load the tokenizer")
        self.X, self.y = X_y
        self.max_length = max_length
        self.tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

    def get_label_array(self, idx):
        """Return the ``TYPES_DICT``-encoded label of row ``idx`` for each type."""
        return [
            TYPES_DICT[col][self.y.iloc[idx, pos]]
            for pos, col in enumerate(TYPES)
        ]

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its ids, attention mask and labels."""
        text = self.X[idx]
        label = self.get_label_array(idx)
        text_encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
            return_attention_mask=True,
            truncation=True,
        )
        # ``return_tensors="pt"`` adds a leading batch axis of size 1. Drop it
        # from BOTH tensors so the DataLoader collates ids and mask to the
        # same (batch, max_length) shape.
        return {
            "ids": text_encoded["input_ids"].squeeze(0),
            "mask": text_encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.float),
        }

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.y)

In [25]:
class DataModule(LightningDataModule):
    """Lightning data module that splits a CSV of posts into train/validation sets.

    Args:
        data_path: Path of the CSV file; it is read eagerly in ``__init__`` and
            must contain a ``"posts"`` column plus every column in ``TYPES``.
        tokenizer_path: Tokenizer name or directory forwarded to ``BaseDataset``.
        num_workers: Worker processes used by both data loaders.
        batch_size: Batch size used by both data loaders.
        model_name: Stored on the instance; not used by this class itself.
        seed: Random state of the train/test split, so repeated ``setup``
            calls always yield the same partition.
    """

    def __init__(
        self,
        data_path,
        tokenizer_path: Optional[str] = None,
        num_workers: int = 1,
        batch_size: int = 4,
        model_name: Optional[str] = None,
        seed: int = 42,
    ):
        super().__init__()

        self.num_workers = num_workers
        self.batch_size = batch_size
        self.model_name = model_name
        self.tokenizer_path = tokenizer_path
        self.seed = seed
        self.data = pd.read_csv(data_path)

    def setup(self, stage: Optional[str] = None) -> None:
        """Split the data (stratified on the ``TYPES`` columns) and build both datasets."""
        # ``setup`` can run more than once (e.g. once per stage). Pinning the
        # random state keeps the split identical across calls, so rows never
        # migrate between the train and validation sets.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data["posts"],
            self.data[TYPES],
            stratify=self.data[TYPES],
            random_state=self.seed,
        )
        # Labels are re-indexed from 0 because BaseDataset reads them
        # positionally alongside the plain ``.values`` array of texts.
        self.train_dataset = BaseDataset(
            (self.X_train.values, self.y_train.reset_index(drop=True)),
            self.tokenizer_path,
        )
        self.test_dataset = BaseDataset(
            (self.X_test.values, self.y_test.reset_index(drop=True)),
            self.tokenizer_path,
        )

    def train_dataloader(self) -> DataLoader:
        """Loader over the training split, reshuffled every epoch."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            shuffle=True,
        )

    def val_dataloader(self) -> DataLoader:
        """Loader over the held-out split, in fixed order."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
        )

In [26]:
# Build the data module from the processed CSV and the local tokenizer
# directory, create the splits, and keep the validation loader for the
# cells further down.
dm = DataModule(
    data_path="mbti_processed.csv",
    tokenizer_path="models/tokenizer",
)
dm.setup()
dl = dm.val_dataloader()

file models/tokenizer/config.json not found
file models/tokenizer/config.json not found
file models/tokenizer/config.json not found
file models/tokenizer/config.json not found


In [27]:
class AlbertTransformer(Module):
    """Wrap a pretrained sequence-classification checkpoint and expose its encoder.

    Args:
        model: Checkpoint name or path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        d_head: Currently unused; kept so existing call sites keep working.
    """

    def __init__(self, model, d_head) -> None:
        super().__init__()
        self.albert = AutoModelForSequenceClassification.from_pretrained(model)

    def forward(self, ids, mask):
        """Run the bare encoder, bypassing the classification head.

        Args:
            ids: Token ids, forwarded as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            Whatever the underlying encoder returns (a model-output object
            for Hugging Face models, not a plain tensor).
        """
        # ``base_model`` resolves to the architecture-specific encoder
        # (``.albert`` for ALBERT checkpoints), so other checkpoints work too.
        encoder = self.albert.base_model
        return encoder(input_ids=ids, attention_mask=mask)


In [12]:
# The checkpoint argument was missing (a syntax error as written); the captured
# log below shows this cell was run against "albert-base-v2".
model = AlbertTransformer("albert-base-v2", 10)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

In [16]:
# Load the ALBERT base checkpoint with a sequence-classification head. Per the
# log below, the classifier weights are newly initialized, i.e. untrained.
albert = AutoModelForSequenceClassification.from_pretrained("albert-base-v2")

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

In [20]:

# Load the RoBERTa base checkpoint with a sequence-classification head; this is
# the model exercised by the single-batch smoke test further down.
roberta = AutoModelForSequenceClassification.from_pretrained("roberta-base")

Downloading: 100%|██████████| 481/481 [00:00<00:00, 239kB/s]
Downloading: 100%|██████████| 478M/478M [00:13<00:00, 38.2MB/s] 
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification we

In [28]:
# Smoke test: push a single validation batch through the model.
# ``next(iter(...))`` fails loudly (StopIteration) on an empty loader instead
# of silently leaving ``ids``/``mask``/``output`` undefined for later cells.
batch = next(iter(dl))
ids, mask, labels = batch["ids"], batch["mask"], batch["labels"]
print(ids.shape)
# Inspection only: skip autograd bookkeeping so no computation graph is kept
# alive through ``output``.
with torch.no_grad():
    output = roberta(input_ids=ids, attention_mask=mask)


torch.Size([4, 512])


In [None]:
# Display the attention mask of the batch fetched in the previous cell.
mask