In [1]:
!pip install transformers datasets torch tensorflow numpy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
from tensorflow.keras.datasets import imdb
import numpy as np

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)


def get_balanced_subset(X, y, num_samples_per_class):
    pos_idx = np.where(y == 1)[0][:num_samples_per_class]
    neg_idx = np.where(y == 0)[0][:num_samples_per_class]
    selected_idx = np.concatenate([pos_idx, neg_idx])
    np.random.shuffle(selected_idx)
    return X[selected_idx], y[selected_idx]


X_train_sub, y_train_sub = get_balanced_subset(X_train, y_train, 2500)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [3]:
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

def decode_review(encoded_review):

    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_review])


X_train_sub_text = [decode_review(review) for review in X_train_sub]


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenized_inputs = tokenizer(
    X_train_sub_text,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)


In [5]:
import torch
from torch.utils.data import Dataset

class IMDbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(tokenized_inputs, y_train_sub)

In [6]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [8]:
trainer.train()

Step,Training Loss
100,0.4736
200,0.3245
300,0.266
400,0.2064
500,0.1787
600,0.1853
700,0.1383
800,0.091
900,0.1087


TrainOutput(global_step=939, training_loss=0.21408770818172043, metrics={'train_runtime': 689.4144, 'train_samples_per_second': 21.758, 'train_steps_per_second': 1.362, 'total_flos': 1987010979840000.0, 'train_loss': 0.21408770818172043, 'epoch': 3.0})

In [9]:
trainer.save_model("my_imdb_model")
tokenizer.save_pretrained("my_imdb_model")

('my_imdb_model/tokenizer_config.json',
 'my_imdb_model/special_tokens_map.json',
 'my_imdb_model/vocab.txt',
 'my_imdb_model/added_tokens.json',
 'my_imdb_model/tokenizer.json')

In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained("my_imdb_model")
tokenizer = AutoTokenizer.from_pretrained("my_imdb_model")

In [11]:
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [12]:
reviews = [
    "Absolutely loved this movie!",
    "This film was boring and disappointing."
]
outputs = classifier(reviews)
print(outputs)

[{'label': 'LABEL_1', 'score': 0.9903002977371216}, {'label': 'LABEL_0', 'score': 0.9837744235992432}]


In [13]:
X_test_sub, y_test_sub = get_balanced_subset(X_test, y_test, 500)

X_test_sub_text = [decode_review(review) for review in X_test_sub]

tokenized_test_inputs = tokenizer(
    X_test_sub_text,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

eval_dataset = IMDbDataset(tokenized_test_inputs, y_test_sub)

In [14]:
results = trainer.evaluate(eval_dataset=eval_dataset)

print(results)

{'eval_loss': 0.32911548018455505, 'eval_runtime': 13.3601, 'eval_samples_per_second': 74.85, 'eval_steps_per_second': 9.356, 'epoch': 3.0}


In [19]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [20]:
import numpy as np
import evaluate

def compute_metrics(eval_pred):

    metric = evaluate.load("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

In [22]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [23]:
results = trainer.evaluate()
print(results)

Downloading builder script: 0.00B [00:00, ?B/s]

{'eval_loss': 0.32911548018455505, 'eval_model_preparation_time': 0.0094, 'eval_accuracy': 0.907, 'eval_runtime': 15.2158, 'eval_samples_per_second': 65.721, 'eval_steps_per_second': 8.215}


In [24]:
accuracy = results['eval_accuracy']

print(f"The model accuracy is: {accuracy * 100:.2f}%")

The model accuracy is: 90.70%
