In [1]:
import os

import torch
from datasets import load_dataset, load_from_disk
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Make sure the output folders for datasets and model artefacts exist
# (exist_ok=True keeps this cell safe to re-run).
for folder in ("./data", "./model"):
    os.makedirs(folder, exist_ok=True)

In [4]:
# Fetch the IMDB dataset and keep a copy of every split on local disk
RAW_IMDB_DIR = "./data/imdb"

dataset = load_dataset("imdb")
dataset.save_to_disk(RAW_IMDB_DIR)

Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 530449.17 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 880815.82 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 915319.25 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25000/25000 [00:00<00:00, 2756943.79 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25000/25000 [00:00<00:00, 2832305.12 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 50000/50000 [00:00<00:00, 2594329.26 examples/s]


In [5]:
# Pre-trained, lower-casing BERT tokenizer shared by preprocessing below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(example):
    """Tokenize the ``text`` field of ``example`` with the BERT tokenizer.

    Every sequence is truncated to, and padded up to, 512 tokens so that all
    encoded examples have the same length.

    Returns whatever the tokenizer call returns for ``example['text']``.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(example['text'], **encoding_options)

In [6]:
# Tokenize and preprocess dataset, reusing the on-disk copy when one exists.
# Tokenizing all splits takes several minutes, so a restarted kernel should
# not have to repeat it. Delete the directory to force re-tokenization
# (e.g. after changing tokenize_data or the tokenizer).
TOKENIZED_IMDB_DIR = "./data/tokenized_imdb"

if os.path.isdir(TOKENIZED_IMDB_DIR):
    # Cache hit: read the previously tokenized splits back from disk.
    encoded_dataset = load_from_disk(TOKENIZED_IMDB_DIR)
else:
    # Cache miss: tokenize every split in batches, then persist the result.
    encoded_dataset = dataset.map(tokenize_data, batched=True)
    encoded_dataset.save_to_disk(TOKENIZED_IMDB_DIR)

Map: 100%|██████████| 25000/25000 [00:43<00:00, 581.14 examples/s]
Map: 100%|██████████| 25000/25000 [00:42<00:00, 592.75 examples/s]
Map: 100%|██████████| 50000/50000 [01:26<00:00, 575.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25000/25000 [00:00<00:00, 98538.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25000/25000 [00:00<00:00, 98093.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 50000/50000 [00:00<00:00, 102574.00 examples/s]


In [7]:
# Pick out the two labelled splits used for fine-tuning and evaluation
train_dataset, test_dataset = encoded_dataset['train'], encoded_dataset['test']


In [2]:
# BERT encoder with a two-class sequence-classification head on top.
# The classifier weights are not part of the checkpoint and start out
# freshly initialized, so the model must be fine-tuned before use.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Training arguments, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./model/results",
    logging_dir="./model/logs",
    # Training length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; the best checkpoint is
    # restored into the model when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Emit a training log entry every 10 steps
    logging_steps=10,
)



In [4]:
# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Score evaluation predictions with accuracy, F1, precision and recall.

    ``pred`` is unpacked into ``(logits, labels)``. The predicted class for
    each example is the argmax over the last axis of the logits.
    Precision, recall and F1 are computed with ``average='binary'``.

    Returns a dict with the keys ``accuracy``, ``f1``, ``precision`` and
    ``recall``.
    """
    scores, references = pred
    predicted = torch.tensor(scores).argmax(dim=-1)

    accuracy = accuracy_score(references, predicted)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        references, predicted, average='binary'
    )

    return {"accuracy": accuracy, "f1": f_score, "precision": prec, "recall": rec}


In [5]:
# Wire the model, arguments, data splits and metric callback into a Trainer.
# NOTE(review): the test split is used as eval_dataset, and
# load_best_model_at_end picks a checkpoint based on it — consider holding
# out a separate validation split so the final test score stays unbiased.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

# Fine-tune first, then score the resulting model on the evaluation split
trainer.train()
results = trainer.evaluate()
print(results)


NameError: name 'train_dataset' is not defined

In [None]:
# Save the trained model locally, together with its tokenizer, so the
# directory is self-contained: when only a model path is given, pipeline()
# also looks for the tokenizer files in that same directory.
SENTIMENT_MODEL_DIR = "./model/sentiment_model"

# Give the two classes readable names; otherwise the pipeline reports the
# generic LABEL_0 / LABEL_1. IMDB encodes 0 as negative and 1 as positive
# (verify against dataset['train'].features if the dataset changes).
model.config.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
model.config.label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model.save_pretrained(SENTIMENT_MODEL_DIR)
tokenizer.save_pretrained(SENTIMENT_MODEL_DIR)

# Load model for inference
sentiment_pipeline = pipeline("text-classification", model=SENTIMENT_MODEL_DIR)
result = sentiment_pipeline("I love this movie!")
print(result)  # e.g. [{'label': 'POSITIVE', 'score': 0.99}]
