In [None]:
!pip install transformers datasets torch tensorflow numpy

In [None]:
from tensorflow.keras.datasets import imdb
import numpy as np

# Load the Keras IMDB sentiment dataset as train/test splits of integer-encoded
# reviews with 0/1 labels.
# NOTE(review): num_words=10000 presumably keeps only the 10,000 most frequent
# words — confirm against the Keras `imdb.load_data` documentation.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)


def get_balanced_subset(X, y, num_samples_per_class, seed=None):
    """Return a shuffled subset with equally many positive and negative samples.

    The first matching rows of each class are taken (in their original order)
    and the combined selection is then shuffled.

    Args:
        X: NumPy array of samples, indexable with an integer index array.
        y: NumPy array of binary labels (1 = positive, 0 = negative) aligned
            with ``X``.
        num_samples_per_class: Maximum number of samples drawn from each class.
            If either class has fewer rows than requested, both classes are
            capped at the size of the smaller one so the result stays balanced.
        seed: Optional seed for the shuffle. When ``None`` (the default) the
            global NumPy random state is used, so existing callers behave as
            before.

    Returns:
        Tuple ``(X_subset, y_subset)`` containing the selected rows in
        shuffled order.

    Raises:
        ValueError: If ``num_samples_per_class`` is negative.
    """
    if num_samples_per_class < 0:
        # A negative value would silently slice "all but the last k" rows.
        raise ValueError("num_samples_per_class must be non-negative")

    pos_idx = np.where(y == 1)[0]
    neg_idx = np.where(y == 0)[0]

    # Cap at the smaller class so the subset is balanced even when one class
    # cannot supply the requested number of samples.
    per_class = min(num_samples_per_class, len(pos_idx), len(neg_idx))
    selected_idx = np.concatenate([pos_idx[:per_class], neg_idx[:per_class]])

    if seed is None:
        np.random.shuffle(selected_idx)
    else:
        # A dedicated generator keeps the result reproducible without touching
        # the global random state.
        np.random.default_rng(seed).shuffle(selected_idx)

    return X[selected_idx], y[selected_idx]


# Seed NumPy's global RNG so the shuffle inside get_balanced_subset produces the
# same sample order on every fresh run (same seed value as used for training).
np.random.seed(42)
X_train_sub, y_train_sub = get_balanced_subset(X_train, y_train, 2500)


In [None]:
# Fetch the word -> index vocabulary and invert it so that integer indices can
# be mapped back to words.
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_review(encoded_review, index_to_word=None):
    """Turn an integer-encoded IMDB review back into a space-separated string.

    Every real word index is offset by 3 in the encoded data; the low indices
    are reserved markers (per the Keras ``imdb.load_data`` defaults:
    0 = padding, 1 = start of sequence, 2 = out-of-vocabulary). Padding and
    start markers carry no text, so they are skipped instead of being rendered
    as a spurious "?" at the front of every review. Out-of-vocabulary words
    are still rendered as "?".

    Args:
        encoded_review: Iterable of integer token ids.
        index_to_word: Optional mapping from vocabulary index to word. Defaults
            to the module-level ``reverse_word_index``.

    Returns:
        The decoded review text; an empty string for an empty review.
    """
    if index_to_word is None:
        index_to_word = reverse_word_index

    return ' '.join(
        index_to_word.get(token - 3, '?')
        for token in encoded_review
        if token > 1  # drop padding (0) and start-of-sequence (1) markers
    )


# Decode every integer-encoded training review into plain text.
X_train_sub_text = list(map(decode_review, X_train_sub))


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the same "distilbert-base-uncased" checkpoint that the
# classification model is created from later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize all decoded training reviews in a single call.
# NOTE(review): padding=True presumably pads to the longest review in the batch,
# and truncation=True with max_length=512 caps each review at 512 tokens —
# confirm against the Hugging Face tokenizer documentation.
tokenized_inputs = tokenizer(
    X_train_sub_text,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"  # "pt" requests PyTorch tensors as output
)


In [None]:
import torch
from torch.utils.data import Dataset

class IMDbDataset(Dataset):
    """Map-style PyTorch dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name,
            e.g. ``input_ids``, to a per-sample sequence. Values may be tensors
            or plain nested lists.
        labels: Sequence of integer class labels, one per sample.

    Raises:
        ValueError: If an encoding field does not hold exactly one entry per
            label.
    """

    def __init__(self, encodings, labels):
        expected = len(labels)
        # Fail early on misaligned inputs instead of silently ignoring extra
        # encodings or hitting an IndexError in the middle of training.
        for key, val in encodings.items():
            if len(val) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(val)} entries "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``labels`` entry."""
        # as_tensor passes existing tensors through unchanged and converts
        # list-based encodings, so both input forms yield tensors.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Wrap the tokenized training reviews and their labels for use by the Trainer.
train_dataset = IMDbDataset(encodings=tokenized_inputs, labels=y_train_sub)

In [None]:
from transformers import AutoModelForSequenceClassification

# Create a DistilBERT sequence classifier with two output classes.
# id2label/label2id give the classes readable names so that inference output
# reports NEGATIVE/POSITIVE instead of the generic LABEL_0/LABEL_1. The mapping
# follows the label convention used for the dataset in this notebook
# (0 = negative, 1 = positive).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)


In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration shared by every Trainer built in this notebook.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",  # evaluation is triggered explicitly in later cells instead
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    save_strategy="epoch",  # NOTE(review): presumably writes a checkpoint after each epoch — confirm
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
    report_to="none",  # NOTE(review): presumably disables external experiment trackers — confirm
)

# Wire the model, the training configuration and the training data into a
# Trainer. No eval_dataset or compute_metrics is supplied at this point.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning on train_dataset using the settings in training_args.
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in "my_imdb_model"
# so both can be reloaded together from that one directory.
trainer.save_model("my_imdb_model")
tokenizer.save_pretrained("my_imdb_model")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Reload the model and tokenizer from the "my_imdb_model" directory written by
# the save step. This rebinds the names `model` and `tokenizer`, so cells that
# run after this one use the reloaded objects; the existing `trainer` object
# still holds the model it was constructed with.
model = AutoModelForSequenceClassification.from_pretrained("my_imdb_model")
tokenizer = AutoTokenizer.from_pretrained("my_imdb_model")

In [None]:
from transformers import pipeline
# Bundle the model and tokenizer into a text-classification pipeline that
# accepts raw strings.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
# Smoke-test the classifier on one clearly positive and one clearly negative
# hand-written review, then print the raw pipeline output.
reviews = [
    "Absolutely loved this movie!",
    "This film was boring and disappointing."
]
outputs = classifier(reviews)
print(outputs)

In [None]:
# Build the evaluation set: up to 500 reviews per class from the test split,
# decoded and tokenized with the same settings as the training data.
X_test_sub, y_test_sub = get_balanced_subset(X_test, y_test, 500)
X_test_sub_text = list(map(decode_review, X_test_sub))

tokenized_test_inputs = tokenizer(
    X_test_sub_text,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)

eval_dataset = IMDbDataset(encodings=tokenized_test_inputs, labels=y_test_sub)

In [None]:
# Evaluate on the held-out subset with the Trainer built earlier.
# NOTE(review): that Trainer was created without compute_metrics, so the result
# presumably contains loss/runtime statistics only, not accuracy — confirm.
results = trainer.evaluate(eval_dataset=eval_dataset)

print(results)

In [None]:
!pip install evaluate

In [None]:
import numpy as np
import evaluate

# Load the accuracy metric once, when this cell runs, so that every evaluation
# pass reuses it instead of loading the metric again on each call.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each
            sample is the argmax of ``logits`` over the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` call for the predicted
        classes against ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:

# Rebuild the Trainer, this time with an evaluation set and a metric function
# so that evaluate() can report accuracy.
# NOTE(review): `model` is whatever the name is bound to when this cell runs;
# when cells execute top to bottom, that is the checkpoint reloaded from
# "my_imdb_model".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate on the eval_dataset registered with the Trainer. The resulting dict
# is expected to contain the 'eval_accuracy' key that the next cell reads.
results = trainer.evaluate()
print(results)

In [None]:
# Pull the accuracy out of the evaluation results.
accuracy = results['eval_accuracy']

# Scale by 100 and show two decimals to report the value as a percentage.
print(f"The model accuracy is: {accuracy * 100:.2f}%")