### Import statements

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

### Load dataset

In [2]:
# Load the IMDB dataset by name; the result holds every available split.
dataset = load_dataset("imdb")
# Show the (rows, columns) shape of each split as a quick sanity check.
dataset.shape

{'train': (25000, 2), 'test': (25000, 2), 'unsupervised': (50000, 2)}

### Explore data

In [3]:
# Inspect the column schema of the training split (column names and their types).
dataset["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [4]:
# Look at one raw training record (index 1) to see what a review looks like before tokenization.
dataset["train"][1]

{'text': '"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, 

### Tokenize the dataset

We'll use a pre-trained tokenizer (`bert-base-uncased`).

`bert-base-uncased` refers to the uncased variant of BERT: all text is converted to lowercase before tokenization, so the model does not distinguish between uppercase and lowercase letters (e.g., "Movie" and "movie" are treated as the same word).

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Length (in tokens) that every review is padded or truncated to.
MAX_LENGTH = 256


def tokenize_function(example, max_length=MAX_LENGTH):
    """Tokenize the "text" column of a batch of examples.

    Args:
        example: The batch handed over by ``dataset.map(..., batched=True)``.
            Despite the singular name it is a batch; only its "text" entry
            is read here.
        max_length: Number of tokens each sequence is padded or truncated
            to. Defaults to ``MAX_LENGTH``.

    Returns:
        The tokenizer's output for the batch, which ``map`` merges into the
        dataset as new columns.
    """
    return tokenizer(
        example["text"],
        padding="max_length",  # pad every sequence up to max_length
        truncation=True,       # cut sequences longer than max_length
        max_length=max_length,
    )


# batched=True passes many rows per call, so the tokenizer processes them together.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# Columns added by the tokenizer:
#   'input_ids'      : the token IDs
#   'attention_mask' : indicates which tokens are real vs. padding
#   'token_type_ids' : used in sequence-pair tasks like QA
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

### Format for Model Training

In [7]:
# Prepare the tokenized data for training. Each step is guarded so the cell can
# be re-run safely: without the guards a second run raises, because "text" has
# already been removed and "label" has already been renamed.
train_columns = tokenized_datasets["train"].column_names

# The raw review string is not a model input once it has been tokenized.
if "text" in train_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

# Rename the target column from "label" to "labels".
if "label" in train_columns:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Return torch tensors when rows are accessed (applied in place).
tokenized_datasets.set_format("torch")

In [9]:
# Re-inspect the same training record after formatting: values now come back as torch tensors.
tokenized_datasets["train"][1]

{'labels': tensor(0),
 'input_ids': tensor([  101,  1000,  1045,  2572,  8025,  1024,  3756,  1000,  2003,  1037,
         15544, 19307,  1998,  3653,  6528, 20771, 19986,  8632,  1012,  2009,
          2987,  1005,  1056,  3043,  2054,  2028,  1005,  1055,  2576,  5328,
          2024,  2138,  2023,  2143,  2064,  6684,  2022,  2579,  5667,  2006,
          2151,  2504,  1012,  2004,  2005,  1996,  4366,  2008, 19124,  3287,
         16371, 25469,  2003,  2019,  6882, 13316,  1011,  2459,  1010,  2008,
          3475,  1005,  1056,  2995,  1012,  1045,  1005,  2310,  2464,  1054,
          1011,  6758,  3152,  2007,  3287, 16371, 25469,  1012,  4379,  1010,
          2027,  2069,  3749,  2070, 25085,  5328,  1010,  2021,  2073,  2024,
          1996,  1054,  1011,  6758,  3152,  2007, 21226, 24728, 22144,  2015,
          1998, 20916,  4691,  6845,  2401,  1029,  7880,  1010,  2138,  2027,
          2123,  1005,  1056,  4839,  1012,  1996,  2168,  3632,  2005,  2216,
         10231,  