In [21]:
import os
import pretty_errors
import pandas as pd
from typing import Optional
import torch
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import RobertaTokenizerFast
from transformers.models.auto.tokenization_auto import AutoTokenizer
from sklearn.model_selection import train_test_split
from utils import TYPES, TYPES_DICT

In [29]:
class BaseDataset(Dataset):
    """Map-style dataset that tokenizes raw texts on access.

    Holds a sequence of texts ``X`` and a label table ``y`` with one row per
    text, in the same order. Samples are encoded with a
    ``RobertaTokenizerFast`` and padded/truncated to ``max_length`` tokens.
    """

    def __init__(
        self,
        X_y,
        tokenizer_path: Optional[str] = None,
        max_length: int = 512,
    ):
        """Store the data and load the tokenizer.

        Args:
            X_y: ``(X, y)`` pair. ``X`` must support positional ``X[idx]``;
                ``y`` is expected to be a DataFrame containing the ``TYPES``
                columns, row-aligned with ``X``.
            tokenizer_path: Location handed to
                ``RobertaTokenizerFast.from_pretrained``.
            max_length: Fixed token length every sample is padded/truncated to.

        Raises:
            ValueError: If ``tokenizer_path`` is missing or ``X`` and ``y``
                have different lengths.
        """
        if tokenizer_path is None:
            raise ValueError("tokenizer_path is required to load the tokenizer")

        self.X, self.y = X_y
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

        self.max_length = max_length
        self.tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

    def get_label_array(self, idx):
        """Return the encoded labels of sample ``idx``, one entry per ``TYPES`` column.

        ``idx`` is a *position*, matching how ``X`` is indexed. The lookup is
        positional (``iloc``) because ``y`` usually arrives straight from a
        shuffled train/test split and still carries the original row labels;
        a label-based lookup would return another row or raise ``KeyError``.
        """
        row = self.y.iloc[idx]
        return [TYPES_DICT[col][row[col]] for col in TYPES]

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict with a single key ``"ids"``: a ``torch.long`` tensor of
            ``max_length`` token ids.
        """
        # Labels are not part of the sample yet; they are available through
        # get_label_array(idx) once the training code needs them.
        encoded = self.tokenizer.encode_plus(
            self.X[idx],
            max_length=self.max_length,
            padding="max_length",
            return_attention_mask=False,
            truncation=True,
        )
        return {"ids": torch.tensor(encoded["input_ids"], dtype=torch.long)}

    def __len__(self):
        """Number of samples (rows of ``y``)."""
        return len(self.y)




In [30]:
class DataModule(LightningDataModule):
    """Lightning data module that splits the posts CSV into train/test datasets.

    The CSV at ``data_path`` is read once in ``__init__``; ``setup`` performs a
    stratified split on the ``TYPES`` columns and wraps each side in a
    ``BaseDataset``.
    """

    def __init__(
        self,
        data_path,
        num_workers: int = 1,
        batch_size: int = 1,
        model_name: Optional[str] = None,
        tokenizer_path: Optional[str] = None,
        seed: int = 42,
    ):
        """
        Args:
            data_path: CSV file with a ``posts`` column and the ``TYPES`` columns.
            num_workers: Worker processes used by each DataLoader.
            batch_size: Batch size used by each DataLoader.
            model_name: Stored on the instance; not used by this class.
            tokenizer_path: Forwarded to ``BaseDataset`` to load the tokenizer.
            seed: ``random_state`` for the train/test split, so repeated
                ``setup`` calls produce the same partition.
        """
        super().__init__()

        self.num_workers = num_workers
        self.batch_size = batch_size
        self.model_name = model_name
        self.tokenizer_path = tokenizer_path
        self.seed = seed
        self.data = pd.read_csv(data_path)

    def setup(self, stage: Optional[str] = None) -> None:
        """Create the stratified split and build both datasets.

        NOTE(review): the CSV preview shows several rows per ``user_id``, so a
        row-level split can place posts of one user on both sides -- confirm
        that this is intended.
        """
        labels = self.data[TYPES]
        # A fixed random_state keeps the partition identical across setup()
        # calls; without it every call would reshuffle train and test rows.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data["posts"],
            labels,
            stratify=labels,
            random_state=self.seed,
        )
        # The texts are passed as positional arrays, so the label frames get a
        # fresh 0..n-1 index to stay row-aligned with them inside the dataset.
        self.train_dataset = BaseDataset(
            (self.X_train.values, self.y_train.reset_index(drop=True)),
            self.tokenizer_path,
        )
        self.test_dataset = BaseDataset(
            (self.X_test.values, self.y_test.reset_index(drop=True)),
            self.tokenizer_path,
        )

    def train_dataloader(self) -> DataLoader:
        """DataLoader over the training split, reshuffled every epoch."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            shuffle=True,
        )

    def val_dataloader(self) -> DataLoader:
        """DataLoader over the held-out split, in fixed order."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
        )

In [31]:
# Build the data module from the processed CSV; the tokenizer is loaded from models/tokenizer.
dm = DataModule(data_path="mbti_processed.csv", tokenizer_path="models/tokenizer")

In [32]:
# Run the train/test split and build both datasets (each one loads the tokenizer).
dm.setup()

file models/tokenizer/config.json not found
file models/tokenizer/config.json not found
file models/tokenizer/config.json not found
file models/tokenizer/config.json not found


In [33]:
# The validation loader is backed by the held-out test split.
dl = dm.val_dataloader()

In [34]:
# Smoke test: print only the first batch, then stop iterating.
# With padding="max_length" every sample is padded to full length, so the
# printed tensor is long.
for batch in dl:
    print(batch)
    break

{'ids': tensor([[   0,   59, 4111,    5,  225,   45,  225, 1098,  225, 1147,  225, 1043,
          225,  962,  225, 2831,   18,  225,   38,   73, 2219, 2094,  225, 2981,
          225,  959,  225, 1602,    5,  225,   30,   40,    2,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,

In [23]:
# Load the processed posts table directly for exploration and preview the first rows.
data_path = "mbti_processed.csv"
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,user_id,[I]ntro/[E]xtra-vert,I[n]tuition/[S]ensing,[T]hink/[F]eel,[J]udge/[P]erceive,type,posts
0,0,I,N,F,J,INFJ,<URL>
1,0,I,N,F,J,INFJ,<URL>
2,0,I,N,F,J,INFJ,enfp and intj moments <URL> sportscenter not t...
3,0,I,N,F,J,INFJ,What has been the most life-changing experienc...
4,0,I,N,F,J,INFJ,<URL> <URL> On repeat for most of today.


In [24]:
# Stratified split on the TYPES columns. The fixed random_state makes the
# partition (and every output derived from it) reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    data["posts"],
    data[TYPES],
    stratify=data[TYPES],
    random_state=42,
)

In [25]:
# Replace the shuffled post-split index with 0..n-1 so rows can be addressed by position.
# Only y_train is re-indexed here; X_train keeps its original index.
y_train=y_train.reset_index(drop=True)

In [26]:
# Display the re-indexed label frame.
y_train

Unnamed: 0,[I]ntro/[E]xtra-vert,I[n]tuition/[S]ensing,[T]hink/[F]eel,[J]udge/[P]erceive
0,E,N,T,P
1,I,N,T,P
2,I,N,T,P
3,I,N,F,J
4,I,N,T,J
...,...,...,...,...
305646,I,N,F,J
305647,I,N,F,J
305648,I,N,F,J
305649,I,N,F,P


In [None]:
# Print the TYPES_DICT encoding of each label column for row 0.
for col in y_train.columns:
    print(TYPES_DICT[col][y_train.at[ 0, col]])

In [None]:
# Encode the labels of every row in turn. The lookup must use the loop
# variable; y_train has a 0..n-1 index after reset_index, so .at[x, col]
# addresses row x. After the loop, `label` holds the last row's encoding.
for x in range(len(y_train)):
    label = [TYPES_DICT[col][y_train.at[x, col]] for col in y_train.columns]

In [None]:
# Encode the labels of every row in turn. The lookup must use the loop
# variable; y_train has a 0..n-1 index after reset_index, so .at[x, col]
# addresses row x. After the loop, `label` holds the last row's encoding.
for x in range(len(y_train)):
    label = [TYPES_DICT[col][y_train.at[x, col]] for col in y_train.columns]

In [27]:
# Label-based scalar lookup: index label 2, column "[I]ntro/[E]xtra-vert".
y_train.at[ 2, "[I]ntro/[E]xtra-vert"]

'I'

In [28]:
# Position-based lookup of the same cell: third row, first column.
y_train.iloc[ 2,0]

'I'