In [None]:
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)


# Identifiers of the dataset and tokenizer checkpoint used below.
DATASET_ID = "imdb"
TOKENIZER_CHECKPOINT = "bert-base-uncased"

# Load the dataset registered under DATASET_ID.
imdb_dataset = load_dataset(DATASET_ID)

# Load the BERT tokenizer that matches the checkpoint name above.
bert_tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)


def encode_text(batch):
    """Tokenize the ``text`` entry of a batch with ``bert_tokenizer``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings to
            encode (this is what ``map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch, with every sequence truncated and
        padded to the tokenizer's maximum length.
    """
    # padding=True would pad only to the longest sequence of *this* map batch,
    # so rows produced by different map batches could end up with different
    # lengths and fail to stack once they are shuffled into the same training
    # batch. Padding to a fixed length keeps every row the same shape.
    return bert_tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True
    )


# Checkpoint and label count for the sequence-classification model.
MODEL_CHECKPOINT = "bert-base-uncased"
NUM_LABELS = 2

# Run encode_text over the dataset, handing it examples in batches.
map_options = {"batched": True}
encoded_dataset = imdb_dataset.map(encode_text, **map_options)

# Load the pretrained checkpoint with a classification head of NUM_LABELS outputs.
bert_model = BertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=NUM_LABELS
)


# Training settings that do not depend on the installed transformers release.
_shared_training_kwargs = dict(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs",
)

# Newer transformers releases name the evaluation schedule `eval_strategy` and
# reject the older `evaluation_strategy` keyword with a TypeError, while older
# releases only know the old spelling. Try the current name first and fall
# back to the legacy one so the cell runs on either.
try:
    train_args = TrainingArguments(
        eval_strategy="epoch", **_shared_training_kwargs
    )
except TypeError:
    train_args = TrainingArguments(
        evaluation_strategy="epoch", **_shared_training_kwargs
    )


# Sizes of the subsets used for this quick run, and the seed used to draw them.
TRAIN_SUBSET_SIZE = 2000
EVAL_SUBSET_SIZE = 1000
SUBSET_SEED = 42

train_subset = (
    encoded_dataset["train"]
    .shuffle(seed=SUBSET_SEED)
    .select(range(TRAIN_SUBSET_SIZE))
)

# Shuffle before slicing: the IMDB test split is stored grouped by label, so
# taking the first rows unshuffled would produce a single-class evaluation
# set and a misleading evaluation loss.
eval_subset = (
    encoded_dataset["test"]
    .shuffle(seed=SUBSET_SEED)
    .select(range(EVAL_SUBSET_SIZE))
)

trainer = Trainer(
    model=bert_model,
    args=train_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)


# Fine-tune the model on the training subset with the arguments above.
trainer.train()


In [None]:
import sentencepiece as spm

# Options for the SentencePiece trainer: the corpus file to read, the prefix
# for the generated model files, the vocabulary size, and the model type.
spm_options = {
    "input": "data.txt",
    "model_prefix": "wids_tokenizer",
    "vocab_size": 2000,
    "model_type": "bpe",
}

spm.SentencePieceTrainer.train(**spm_options)

# Only reached when training returned without raising.
print("SentencePiece tokenizer trained successfully.")


In [None]:
from transformers import pipeline

# Name the checkpoint explicitly. Without `model=`, the pipeline falls back to
# a library-chosen default and warns about it; pinning the checkpoint keeps the
# output reproducible. Note that this is a ready-made sentiment checkpoint,
# not the `bert_model` fine-tuned earlier in the notebook.
SENTIMENT_CHECKPOINT = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

classifier = pipeline("sentiment-analysis", model=SENTIMENT_CHECKPOINT)

# Classify one example sentence and show the predicted label and score.
result = classifier("Transformers are amazing!")
print(result)