In [62]:
import pandas as pd
from collections import namedtuple

import torch
import transformers
from typing import List, Dict
from transformers import AutoTokenizer

from torch.utils.data import DataLoader
from torch.utils.data import Dataset


In [None]:
import pandas as pd

## PyTorch Datasets & DataLoaders

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes texts on access for classification.

    Each item is the tokenizer output for one text. When labels were
    supplied, the item additionally carries a ``'targets'`` entry holding
    the integer class id of that text's label.

    Attributes:
        texts: The raw input strings, indexed by ``__getitem__``.
        labels: Optional string labels aligned with ``texts``.
        mappings: Label -> integer class id. Always defined: the
            caller-supplied mapping if given, otherwise one derived from
            the sorted unique labels, otherwise an empty dict.
        max_seq_len: Length every sequence is padded/truncated to.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        tokenizer: The tokenizer loaded for ``model_name``.
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[str] = None,
        mappings: Dict[str, int] = None,
        max_seq_len: int = 150,
        model_name: str = 'distilbert-base-uncased'
    ):
        """Store the data, resolve the label mapping and load the tokenizer.

        Args:
            texts: Input strings.
            labels: Optional labels, one per text. Omit for unlabeled data.
            mappings: Optional label -> class id mapping. When given (and
                non-empty) it is used as is, e.g. to reuse the mapping of a
                training set. When omitted, ids are assigned to the sorted
                unique ``labels`` starting from 0.
            max_seq_len: Maximum (and padded) token length per text.
            model_name: Name of the pretrained tokenizer to load.
        """
        self.texts = texts
        self.labels = labels
        self.max_seq_len = max_seq_len
        self.model_name = model_name

        # Always define ``self.mappings``: previously a caller-supplied
        # mapping was dropped, leaving the attribute unset and making
        # ``__getitem__`` fail with AttributeError.
        if mappings:
            # Copy so later mutation of the caller's dict cannot change ids.
            self.mappings = dict(mappings)
        elif labels:
            unique_sorted_labels = sorted(set(labels))
            self.mappings = {
                label: class_id
                for class_id, label in enumerate(unique_sorted_labels)
            }
        else:
            self.mappings = {}

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def __len__(self) -> int:
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and attach its target if labeled.

        Returns:
            The tokenizer output for the text (a dict-like object of
            PyTorch tensors, including the attention mask). When labels
            are present it also contains ``'targets'``: a 0-dim ``long``
            tensor with the class id, or ``-1`` if the label is not in
            ``self.mappings``.
        """
        text = self.texts[index]
        # Tokenizer output, padded/truncated to ``max_seq_len``.
        # NOTE(review): with return_tensors='pt' each tensor presumably has
        # a leading dimension of 1, so DataLoader batches would be
        # (batch, 1, seq_len) - confirm the consumer accounts for that.
        tokenized_dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_seq_len,
            return_tensors='pt',
            truncation=True,
            return_attention_mask=True)

        if self.labels:
            label = self.labels[index]
            # Unknown labels map to -1 rather than raising.
            tokenized_dict['targets'] = torch.tensor(
                self.mappings.get(label, -1), dtype=torch.long)
        return tokenized_dict


In [5]:
def read_data(train, test, max_rows=1000, train_batch_size=4, test_batch_size=8):
    """Wrap the train/test frames in datasets and return their DataLoaders.

    Args:
        train: DataFrame with ``'text'`` and ``'label'`` columns.
        test: DataFrame with a ``'text'`` column (no labels are read).
        max_rows: Use only the first ``max_rows`` rows of each frame.
            Pass ``None`` to use every row.
        train_batch_size: Batch size of the (shuffled) training loader.
        test_batch_size: Batch size of the (unshuffled) test loader.

    Returns:
        A ``DataLoaders`` namedtuple with fields ``train`` and ``test``.
    """
    # ``iloc[:None]`` selects all rows, so ``max_rows=None`` disables the cap.
    train_head = train.iloc[:max_rows]
    test_head = test.iloc[:max_rows]

    train_dataset = TextClassificationDataset(
        train_head['text'].values.tolist(),
        train_head['label'].values.tolist()
    )

    test_dataset = TextClassificationDataset(
        test_head['text'].values.tolist()
    )

    train_loader = DataLoader(
        train_dataset, batch_size=train_batch_size, shuffle=True)
    # Keep test order stable so predictions line up with the input rows.
    test_loader = DataLoader(
        test_dataset, batch_size=test_batch_size, shuffle=False)

    DataLoaders = namedtuple('DataLoaders', ['train', 'test'])
    return DataLoaders(train_loader, test_loader)


In [6]:
# CSV locations of the two data splits, relative to the working directory
# the notebook is run from.
TRAIN_PATH = './data/train.csv'
TEST_PATH = './data/test.csv'

In [9]:
# Load both splits into DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))