In [1]:
import torch
import torch.nn as nn
import pathlib
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np


In [5]:
base_model_name = 'bert-base-uncased'


def preprocess_imdb(dataset_dir):
    cur_dir = pathlib.Path(dataset_dir)
    texts = []
    sentiments = []
    stars = []
    for sentiment_dir in ['pos', 'neg']:
        for text_file in (cur_dir/sentiment_dir).iterdir():
            texts.append(text_file.read_text(encoding='utf-8'))
            sentiments.append(1 if sentiment_dir == 'pos' else 0)
            star_count = int(text_file.name[:-4].split('_')[1])
            stars.append(star_count - 1)
    
    return texts, sentiments, stars

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

device = 'cuda'

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy':  accuracy.compute(predictions=predictions, references=labels)}
    
tokenizer = AutoTokenizer.from_pretrained(base_model_name, fast=True)

loading configuration file config.json from cache at C:\Users\RedBeam/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb64c8a40076\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\RedBeam/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb64c8a40076\voc

In [4]:
texts_train, sentiments_train, stars_train = preprocess_imdb('aclImdb/train')
texts_test, sentiments_test, stars_test = preprocess_imdb('aclImdb/test')

texts_train, texts_val, sentiments_train, sentiments_val = train_test_split(texts_train, sentiments_train, test_size=0.1)

train_encodings = tokenizer(texts_train, truncation=True, padding="max_length", return_tensors="pt", max_length=256)
val_encodings = tokenizer(texts_val, truncation=True, padding="max_length", return_tensors="pt", max_length=256)
test_encodings = tokenizer(texts_test, truncation=True, padding="max_length", return_tensors="pt", max_length=256)


train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"],
                                   "attention_mask": train_encodings["attention_mask"],
                                   "labels": torch.tensor(sentiments_train)})
train_dataset.set_format("pt")
val_dataset = Dataset.from_dict({"input_ids": val_encodings["input_ids"],
                                 "attention_mask": val_encodings["attention_mask"],
                                 "labels": torch.tensor(sentiments_val)})
val_dataset.set_format("pt")
test_dataset = Dataset.from_dict({"input_ids": test_encodings["input_ids"],
                                  "attention_mask": test_encodings["attention_mask"],
                                  "labels": torch.tensor(sentiments_test)})
test_dataset.set_format("pt")

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(base_model_name, 
                                                           num_labels=2, 
                                                           id2label=id2label,
                                                           label2id=label2id).to("cuda")
model.train()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
training_args = TrainingArguments(
    output_dir="sentiment_model",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_steps=100,
    fp16=True,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)




Using cuda_amp half precision backend


In [7]:
trainer.train()

***** Running training *****
  Num examples = 22500
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1407
  Number of trainable parameters = 109483778
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
100,0.4473,0.281827,0.896
200,0.3167,0.265454,0.9028
300,0.3171,0.256723,0.8988
400,0.3012,0.282536,0.8932
500,0.29,0.233485,0.9084
600,0.2406,0.289502,0.9072
700,0.2432,0.25237,0.9048
800,0.2634,0.227445,0.9116
900,0.265,0.211365,0.9216
1000,0.2449,0.207746,0.9188


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
Saving model checkpoint to sentiment_model\checkpoint-500
Configuration saved in sentiment_model\checkpoint-500\config.json
Model weights saved in sentiment_model\checkpoint-500\pytorch_model.bin
tokenizer config file saved in sentiment_model\checkpoint-500\tokenizer_config.json
Special tokens file saved in sentiment_model\checkpoint-500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 

TrainOutput(global_step=1407, training_loss=0.2756272058751283, metrics={'train_runtime': 394.3561, 'train_samples_per_second': 57.055, 'train_steps_per_second': 3.568, 'total_flos': 2959999372800000.0, 'train_loss': 0.2756272058751283, 'epoch': 1.0})

In [12]:
trainer.save_model("sentiment_model")

Saving model checkpoint to test_model
Configuration saved in test_model\config.json
Model weights saved in test_model\pytorch_model.bin
tokenizer config file saved in test_model\tokenizer_config.json
Special tokens file saved in test_model\special_tokens_map.json


In [3]:
texts_train, sentiments_train, stars_train = preprocess_imdb('aclImdb/train')
texts_test, sentiments_test, stars_test = preprocess_imdb('aclImdb/test')

texts_train, texts_val, stars_train, stars_val = train_test_split(texts_train, stars_train, test_size=0.1)

train_encodings = tokenizer(texts_train, truncation=True, padding="max_length", return_tensors="pt", max_length=256)
val_encodings = tokenizer(texts_val, truncation=True, padding="max_length", return_tensors="pt", max_length=256)
test_encodings = tokenizer(texts_test, truncation=True, padding="max_length", return_tensors="pt", max_length=256)


train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"],
                                   "attention_mask": train_encodings["attention_mask"],
                                   "labels": stars_train})
train_dataset.set_format("pt")
val_dataset = Dataset.from_dict({"input_ids": val_encodings["input_ids"],
                                 "attention_mask": val_encodings["attention_mask"],
                                 "labels": stars_val})
val_dataset.set_format("pt")
test_dataset = Dataset.from_dict({"input_ids": test_encodings["input_ids"],
                                  "attention_mask": test_encodings["attention_mask"]})
test_dataset.set_format("pt")


model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=10).to(device)
model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [6]:
training_args = TrainingArguments(
    output_dir="stars_model",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    logging_steps=100,
    fp16=True,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 22500
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1407
  Number of trainable parameters = 109489930
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
100,1.6057,1.616988,{'accuracy': 0.3728}
200,1.5431,1.507031,{'accuracy': 0.4132}
300,1.4939,1.455813,{'accuracy': 0.4324}
400,1.4841,1.419018,{'accuracy': 0.4388}
500,1.4137,1.385005,{'accuracy': 0.458}
600,1.4493,1.370575,{'accuracy': 0.4672}
700,1.3743,1.361992,{'accuracy': 0.4676}
800,1.4384,1.345174,{'accuracy': 0.4796}
900,1.4254,1.383178,{'accuracy': 0.454}
1000,1.3833,1.347479,{'accuracy': 0.4752}


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64
Trainer is attempting to log a value of "{'accuracy': 0.3728}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64
Trainer is attempting to log a value of "{'accuracy': 0.4132}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64
Trainer is attempting to log a value of "{'accuracy': 0.4324}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64
Trainer is attempting to log a value of "{'accuracy': 0.438

TrainOutput(global_step=1407, training_loss=1.4373802342309796, metrics={'train_runtime': 384.1637, 'train_samples_per_second': 58.569, 'train_steps_per_second': 3.663, 'total_flos': 2960211985920000.0, 'train_loss': 1.4373802342309796, 'epoch': 1.0})

In [8]:
trainer.save_model("stars_model")

Saving model checkpoint to stars_model
Configuration saved in stars_model\config.json
Model weights saved in stars_model\pytorch_model.bin
tokenizer config file saved in stars_model\tokenizer_config.json
Special tokens file saved in stars_model\special_tokens_map.json


In [23]:
random_id = 834

text = texts_test[random_id]
print(text)
encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits

print(f"predicted: {torch.argmax(logits).item()}")
print(f"true: {stars_test[random_id]}")

I really enjoyed this old black and white talkie. At first I didn't recognize Harold Lloyd as Mr. Cobb, a missionary to China coming home to find a wife. There were many twists and turns in Mr. Cobb's attempts to clean up city hall. His methods of making the punishment fit the crime would likely be illegal, but this is not a movie based on reality. This would be a perfect movie for children except that there is female near nudity (pasties only on Grace Bradley)! The old telephones are enchanting. The only fault is a problem typical of the day - Caucasians are used to represent Chinese men. This is offset by the positive way the Chinese are portrayed. They are the wise, good and friendly guys. Trivia - a Bekins truck appears in the movie when the police run out of Black Marias.
predicted: 7
true: 9
