In [2]:
import pandas as pd

# Read the IMDB review CSV into a DataFrame, then preview its first rows.
csv_path = "./IMDB Dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
from datasets import Dataset

# Convert the pandas DataFrame to a Hugging Face Dataset so the later cells can
# call `.map(...)` and `.train_test_split(...)` on it.
dataset = Dataset.from_pandas(df)

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# The tokenizer and the model are both loaded from the same pre-trained checkpoint.
checkpoint = "distilbert-base-uncased"

# Tokenizer for the checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# Pre-trained DistilBERT model for sequence classification.
# num_labels=2 because this is binary classification (negative / positive).
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Tokenization step
def tokenize_function(examples):
    """Tokenize the "review" texts of a batch of rows.

    Args:
        examples: A batch from the dataset; only its "review" entry is read.

    Returns:
        The tokenizer's output for those texts, produced with
        padding="max_length" and truncation=True.
    """
    reviews = examples["review"]
    return tokenizer(reviews, padding="max_length", truncation=True)


# batched=True hands the function groups of rows instead of one row at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 50000/50000 [02:40<00:00, 310.68 examples/s]


In [6]:
# Integer class id for each sentiment string
label_map = {"negative": 0, "positive": 1}


def add_numeric_label(example):
    """Return the "labels" field for one row, looked up from its "sentiment" string."""
    return {"labels": label_map[example["sentiment"]]}


# Attach the numeric "labels" column to every row of the tokenized dataset
tokenized_datasets = tokenized_datasets.map(add_numeric_label)

Map: 100%|██████████| 50000/50000 [00:01<00:00, 25304.40 examples/s]


In [7]:
# Inspect the tokenized dataset by printing its first row.
# NOTE(review): the row includes the full "review" text, so this output is long;
# consider clearing it before sharing the notebook.
print(tokenized_datasets[0])

{'review': "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is du

In [8]:
import torch
from sklearn.model_selection import train_test_split

# Split the dataset (80% training, 20% validation).
# The result is bound to `split_datasets` rather than `train_test_split` so it
# no longer overwrites the scikit-learn function imported above, and a fixed
# seed makes the split reproducible across kernel restarts.
split_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

# Set the dataset format to PyTorch tensors
train_dataset.set_format("torch")
val_dataset.set_format("torch")

In [None]:
# Install tf-keras with the %pip magic (the notes further down list it as a
# prerequisite for using the Trainer).
%pip install tf-keras


In [None]:
%pip install transformers[torch] accelerate>=0.26.0


Note: you may need to restart the kernel to use updated packages.


Install the libraries required by the Hugging Face `Trainer`:
- pip install tf-keras
- pip install transformers[torch]

In [None]:
import torch
from transformers import TrainingArguments
from transformers import Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",              # Where checkpoints are written
    eval_strategy="epoch",               # Evaluate at the end of every epoch
    learning_rate=2e-5,                  # Learning rate for AdamW
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,                   # Weight decay for AdamW
    save_strategy="epoch",               # Save a checkpoint at the end of every epoch
    logging_dir="./logs",
    # Mixed precision needs a CUDA GPU; enabling it unconditionally makes this
    # cell fail on CPU-only machines, so only turn it on when CUDA is present.
    fp16=torch.cuda.is_available(),
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The pre-trained model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Validation dataset
)




In [None]:
# Fine-tune the model with the Trainer configured in the previous cell
trainer.train()

# Evaluate the model on the Trainer's eval dataset and print the returned metrics
results = trainer.evaluate()
print(results)

Epoch,Training Loss,Validation Loss
1,0.205,0.180477
2,0.1384,0.215181
3,0.0779,0.262677


{'eval_loss': 0.26267731189727783, 'eval_runtime': 28.1907, 'eval_samples_per_second': 354.727, 'eval_steps_per_second': 22.17, 'epoch': 3.0}


In [None]:
# Write the fine-tuned model and the tokenizer into the same directory.
save_dir = "./fine-tuned-distilbert"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine-tuned-distilbert\\tokenizer_config.json',
 './fine-tuned-distilbert\\special_tokens_map.json',
 './fine-tuned-distilbert\\vocab.txt',
 './fine-tuned-distilbert\\added_tokens.json')

Alternative: if the Hugging Face `Trainer` cannot be used, fine-tune with a manual PyTorch training loop instead.

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import accuracy_score
import torch
from tqdm import tqdm

# Hyperparameters for the manual loop. They are defined once here so the epoch
# count used by the scheduler, the loop and the progress message cannot drift
# apart (it was previously hard-coded in three places).
NUM_EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

# NOTE(review): this cell trains whatever `model` currently holds. If the
# Trainer cells above were already run in this kernel, training continues from
# those fine-tuned weights; reload the model first for an independent run.

# Set the dataset format to PyTorch tensors, keeping only the columns that are
# passed to the model (each batch is unpacked straight into `model(**batch)`).
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Create DataLoader for training and validation
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Define learning rate scheduler: linear schedule with no warmup, spanning
# every optimizer step of the whole run (batches per epoch * epochs).
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    progress_bar = tqdm(train_dataloader, desc="Training")

    for batch in progress_bar:
        # Move batch to GPU (if available)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass; the batch includes "labels", and the loss is read from the output
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update weights, advance the LR schedule, then clear gradients for the next step
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Update progress bar with the loss of the current batch
        progress_bar.set_postfix({"loss": loss.item()})

# Evaluation loop
model.eval()
val_loss = 0
val_preds, val_labels = [], []

for batch in val_dataloader:
    # Move batch to GPU (if available)
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass (no gradient calculation)
    with torch.no_grad():
        outputs = model(**batch)

    # Accumulate the per-batch loss; averaged over batches below
    val_loss += outputs.loss.item()

    # Get predictions: index of the largest logit along the last dimension
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    val_preds.extend(preds.cpu().numpy())
    val_labels.extend(batch["labels"].cpu().numpy())

# Compute validation accuracy and the mean per-batch validation loss
val_accuracy = accuracy_score(val_labels, val_preds)
val_loss /= len(val_dataloader)

print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'train_dataset' is not defined

In [None]:
# Persist the model and the tokenizer side by side in a single output directory.
output_dir = "./fine-tuned-distilbert"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./fine-tuned-distilbert\\tokenizer_config.json',
 './fine-tuned-distilbert\\special_tokens_map.json',
 './fine-tuned-distilbert\\vocab.txt',
 './fine-tuned-distilbert\\added_tokens.json')

In [None]:
import tensorflow as tf

# Show the physical devices TensorFlow reports.
# NOTE(review): the training cells pick their device via PyTorch
# (`torch.cuda.is_available()`), so this shows TensorFlow's view only;
# confirm this is the intended GPU check.
tf.config.experimental.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]