In [35]:
import torch
import pathlib
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [36]:
base_model_name = 'distilbert-base-uncased'


def preprocess_imdb(dataset_dir):
    cur_dir = pathlib.Path(dataset_dir)
    texts = []
    sentiments = []
    stars = []
    for sentiment_dir in ['pos', 'neg']:
        for text_file in (cur_dir/sentiment_dir).iterdir():
            texts.append(text_file.read_text(encoding='utf-8'))
            sentiments.append(1 if sentiment_dir == 'pos' else 0)
            stars.append(int(text_file.name[:-4].split('_')[1]))
    
    return texts, sentiments, stars

texts_train, sentiments_train, stars_train = preprocess_imdb('aclImdb/train')
texts_test, sentiments_test, stars_test = preprocess_imdb('aclImdb/test')
    

In [37]:
class ImdbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
    
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

texts_train, texts_val, sentiments_train, sentiments_val = train_test_split(texts_train, sentiments_train, test_size=0.2)

train_encodings = tokenizer(texts_train, truncation=True, padding=True)
val_encodings = tokenizer(texts_val, truncation=True, padding=True)
test_encodings = tokenizer(texts_test, truncation=True, padding=True)

train_dataset = ImdbDataset(train_encodings, sentiments_train)
val_dataset = ImdbDataset(val_encodings, sentiments_val)
test_dataset = ImdbDataset(test_encodings, sentiments_test)

training_args = TrainingArguments(
    output_dir='./out',
    num_train_epochs=2,
    fp16=True,
    learning_rate = 1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=500
)

device = 'cuda'

model = AutoModelForSequenceClassification.from_pretrained(base_model_name).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()



loading configuration file config.json from cache at C:\Users\RedBeam/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\RedBeam/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\vocab.txt
loading file tokenizer.json from cache at C:\Users\RedBeam/.cache\huggingface\hub\model

RuntimeError: Found dtype Long but expected Float

In [8]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 4000
  Batch size = 4


{'eval_loss': 0.3215576708316803,
 'eval_runtime': 17.589,
 'eval_samples_per_second': 227.415,
 'eval_steps_per_second': 56.854,
 'epoch': 2.0}

In [32]:
text = "I'm disapointed"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [33]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4580, -1.9861]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [34]:
sigmoid = torch.nn.Sigmoid()
logits = outputs.logits
probs = sigmoid(logits.squeeze().cpu())
print(probs)

tensor([0.8112, 0.1207], grad_fn=<SigmoidBackward0>)
