In [32]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import DataCollatorWithPadding

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Binary classification

# Load SST-2 dataset from GLUE benchmark
dataset = load_dataset("glue", "sst2")

# Slice the dataset to select only the first 100 entries
dataset['train'] = dataset['train'].select(range(500))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:

# Iterate over each example in the dataset and print its contents
for example in dataset['train']:
    print("Sentence:", example['sentence'])
    print("Label:", example['label'])
    print("Index:", example['idx'])
    print()


Sentence: hide new secretions from the parental units 
Label: 0
Index: 0

Sentence: contains no wit , only labored gags 
Label: 0
Index: 1

Sentence: that loves its characters and communicates something rather beautiful about human nature 
Label: 1
Index: 2

Sentence: remains utterly satisfied to remain the same throughout 
Label: 0
Index: 3

Sentence: on the worst revenge-of-the-nerds clichés the filmmakers could dredge up 
Label: 0
Index: 4

Sentence: that 's far too tragic to merit such superficial treatment 
Label: 0
Index: 5

Sentence: demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . 
Label: 1
Index: 6

Sentence: of saucy 
Label: 1
Index: 7

Sentence: a depressed fifteen-year-old 's suicidal poetry 
Label: 0
Index: 8

Sentence: are more deeply thought through than in most ` right-thinking ' films 
Label: 1
Index: 9

Sentence: goes to absurd lengths 
Label: 0
Index: 10

Sentence: for

In [34]:


# Tokenize input texts
def tokenize(batch):
    return tokenizer(batch["sentence"], padding=True, truncation=True)

dataset = dataset.map(tokenize, batched=True)


# Convert the dataset to PyTorch tensors
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader for training
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(dataset['train'], batch_size=8, shuffle=True, collate_fn=data_collator)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)  # Using AdamW optimizer
criterion = nn.CrossEntropyLoss()  # Assuming cross-entropy loss for classification




Map:   0%|          | 0/500 [00:00<?, ? examples/s]



In [35]:
# Fine-tune BERT
model.train()
epochs = 3  # Adjust as needed
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# Save the fine-tuned model
torch.save(model.state_dict(), 'bert_classification_model.pth')

Epoch 1/3, Average Loss: 0.5392
Epoch 2/3, Average Loss: 0.3380
Epoch 3/3, Average Loss: 0.1705


In [36]:
# Example new input texts
new_texts = [
    "This movie is fantastic!",
    "I didn't like this film at all.",
    "This is a great movie!",
    "I couldn't stop laughing during this film.",
    "I found the movie to be disappointing.",
    "The acting was superb!",
    "The plot was predictable.",
    "I highly recommend this movie to everyone.",
    "I regret watching this film.",
    "The cinematography was breathtaking."
]

# Tokenize new input texts
tokenized_inputs = tokenizer(new_texts, padding=True, truncation=True, return_tensors="pt")

# Prepare input tensors
input_ids = tokenized_inputs["input_ids"]
attention_mask = tokenized_inputs["attention_mask"]

# Perform inference
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Interpret model outputs
logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=1)

# Display predictions with human-readable labels
for text, label in zip(new_texts, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted Label: {label_map[label.item()]}")
    print()


Text: This movie is fantastic!
Predicted Label: Positive

Text: I didn't like this film at all.
Predicted Label: Negative

Text: This is a great movie!
Predicted Label: Positive

Text: I couldn't stop laughing during this film.
Predicted Label: Negative

Text: I found the movie to be disappointing.
Predicted Label: Negative

Text: The acting was superb!
Predicted Label: Positive

Text: The plot was predictable.
Predicted Label: Negative

Text: I highly recommend this movie to everyone.
Predicted Label: Positive

Text: I regret watching this film.
Predicted Label: Negative

Text: The cinematography was breathtaking.
Predicted Label: Positive



In [38]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Binary classification

# Load the saved state dictionary into the model
model.load_state_dict(torch.load('bert_classification_model.pth'))

# Now the 'model' object contains the loaded weights

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [44]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load the saved state dictionary into the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load('bert_classification_model.pth'))

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example new input texts
new_texts = [
    "This movie is fantastic!",
    "I didn't like this film at all.",
    "This is a great movie!",
    "I couldn't stop laughing during this film.",
    "I found the movie to be disappointing.",
    "The acting was superb!",
    "The plot was predictable.",
    "I highly recommend this movie to everyone.",
    "I regret watching this film.",
    "The cinematography was breathtaking."
]

# Tokenize new input texts
tokenized_inputs = tokenizer(new_texts, padding=True, truncation=True, return_tensors="pt")

# Prepare input tensors
input_ids = tokenized_inputs["input_ids"]
attention_mask = tokenized_inputs["attention_mask"]

# Perform inference
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Interpret model outputs
logits = outputs.logits
predicted_labels = torch.argmax(logits, dim=1)

# Display predictions with human-readable labels
label_map = {0: "Negative", 1: "Positive"}
for text, label in zip(new_texts, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted Label: {label_map[label.item()]}")
    print()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text: This movie is fantastic!
Predicted Label: Positive

Text: I didn't like this film at all.
Predicted Label: Negative

Text: This is a great movie!
Predicted Label: Positive

Text: I couldn't stop laughing during this film.
Predicted Label: Positive

Text: I found the movie to be disappointing.
Predicted Label: Negative

Text: The acting was superb!
Predicted Label: Positive

Text: The plot was predictable.
Predicted Label: Negative

Text: I highly recommend this movie to everyone.
Predicted Label: Positive

Text: I regret watching this film.
Predicted Label: Negative

Text: The cinematography was breathtaking.
Predicted Label: Positive

