In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


model = AutoModelForSequenceClassification.from_pretrained("/scratch/general/vast/u1427155/cs6966/assignment1/models/microsoft/deberta-v3-base-finetuned-imdb/checkpoint-12500", num_labels=2)


In [4]:
model_path = "/scratch/general/vast/u1427155/cs6966/assignment1/models/microsoft/deberta-v3-base-finetuned-imdb/checkpoint-12500"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

In [10]:
dataset = load_dataset('imdb')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [14]:
sentence1_key, sentence2_key = ('text', None)
def preprocess_function(examples):
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 25000/25000 [00:11<00:00, 2166.75 examples/s]
Map: 100%|██████████| 25000/25000 [00:11<00:00, 2202.18 examples/s]
Map: 100%|██████████| 50000/50000 [00:23<00:00, 2134.77 examples/s]


In [15]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [24]:
for i, data in enumerate(encoded_dataset['test']):
    tokenized_text = tokenizer(data['text'],
                                truncation=True,
                                is_split_into_words=False,
                                return_tensors='pt')
    print(tokenized_text)
    print(data['input_ids'])
    outputs = model(tokenized_text["input_ids"])
    predicted_label = outputs.logits.argmax(-1)
    print(predicted_label, data['label'])
    if i == 10:
        break

{'input_ids': tensor([[     1,    273,    472,  14371,    271,   5863,    263,    481,   2608,
            264,    552,    322,    275,    266,    509,    260,  11373,    271,
           5863,   2883,    320,   6578,    281,   1048,  72193,    261,    494,
            271,  74895,    263,  22335,    260,    273,   1367,    264,    334,
            291,    261,    273,    431,    464,    261,    304,    278,    269,
            264,    397,   1341,  14371,    271,   5863,    283,  24034,    456,
            269,    264,   2445,  11430,    287,    724,   1020,    285,    260,
          44312,  71627,    261,   2315,  12925,   2428,    261,  90848,  37018,
            261,  19936,    272,    702,    280,    297,   1511,    262,   2008,
            261,    263,  25128,    311,    271,  10074,   1855,   1037,    282,
           5013,    275,    266,    382,  27086,    271,   5863,    280,   1680,
            260,    287,    476,    280,    358,    521,    343,    281,    421,
            26

KeyboardInterrupt: 

In [None]:
tokenized_text = tokenizer(data['text'],
                                truncation=True,
                                is_split_into_words=False,
                                return_tensors='pt')