In [1]:
# Install the Hugging Face libraries imported by the cells below (transformers, datasets).
# accelerate is not imported directly in this notebook; presumably the Trainer API needs it -- TODO confirm.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install transformers datasets accelerate


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub (only needed once; a prompt asks for your token).
notebook_login()

# Fetch the SST-2 configuration of the GLUE benchmark.
dataset = load_dataset("glue", "sst2")

# Overview of the splits and their columns.
print(dataset)

# Split names, then the number of rows in train / validation / test.
print(dataset.keys())
print(*(len(dataset[split]) for split in ("train", "validation", "test")))


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
dict_keys(['train', 'validation', 'test'])
67349 872 1821


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, set_seed

model_name = "bert-base-uncased"

# The classification head (classifier.weight / classifier.bias) is not part of the
# pretrained checkpoint and gets randomly initialised when the model is created.
# Seed Python, NumPy and PyTorch first so that this initialisation -- and therefore the
# whole fine-tuning run -- is reproducible after "Restart Kernel -> Run All".
SEED = 42
set_seed(SEED)

tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=2: SST-2 is a binary sentiment classification task.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize(batch):
    """Tokenize the ``sentence`` column of a batch with the notebook-level ``tokenizer``.

    Each sequence is truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: Mapping with a ``"sentence"`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for the given sentences.
    """
    sentences = batch["sentence"]
    return tokenizer(sentences, truncation=True, max_length=128, padding="max_length")

# Apply `tokenize` to every split, processing the examples in batches.
encoded_dataset = dataset.map(tokenize, batched=True)

# Return PyTorch tensors and expose only the columns listed here.
encoded_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)


In [5]:
from transformers import TrainingArguments, Trainer


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object passed in by the Trainer; ``predictions`` is expected to hold the
            logits as an array of shape (num_examples, num_labels) and ``label_ids`` the
            reference labels.

    Returns:
        ``{"accuracy": float}`` -- the fraction of examples whose arg-max logit equals the label.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


training_args = TrainingArguments(
    output_dir="./bert-sst2",
    eval_strategy="epoch",  # named `evaluation_strategy` in older transformers releases
    save_strategy="epoch",  # one checkpoint-<step> directory per epoch under output_dir
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning;
    # `processing_class=` is its replacement (needs a transformers release that has it).
    processing_class=tokenizer,
    # Without this the per-epoch evaluation reports only the loss.
    compute_metrics=compute_metrics,
)





  trainer = Trainer(


In [7]:
# Run fine-tuning with the Trainer configured earlier: 2 epochs, with an evaluation on the
# validation split and a checkpoint written under ./bert-sst2 after each epoch.
# NOTE: the saved output of this cell shows a KeyboardInterrupt, i.e. the recorded run did
# not finish; later cells that load a checkpoint need this cell to complete at least one epoch.
trainer.train()


KeyboardInterrupt: 

In [None]:
import os

from transformers import BertModel
from transformers.trainer_utils import get_last_checkpoint
import torch

# Load the fine-tuned encoder (without the classification head) for embeddings.
# The Trainer writes its checkpoints to ./bert-sst2/checkpoint-<step>. Rather than
# hard-coding a step number that has to be edited by hand, pick the most recent one.
output_dir = "./bert-sst2"
checkpoint_dir = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if checkpoint_dir is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found in {output_dir!r}; "
        "let trainer.train() finish at least one epoch first."
    )

bert = BertModel.from_pretrained(checkpoint_dir)

def get_sentence_embedding(text):
    """Embed ``text`` with the notebook-level ``bert`` encoder.

    The text is tokenized (truncated to at most 128 tokens) with the notebook-level
    ``tokenizer`` and passed through ``bert`` with gradient tracking disabled.

    Args:
        text: The input to encode, e.g. a single sentence string.

    Returns:
        The slice of the last hidden state at sequence position 0 (the CLS token),
        i.e. one vector per input sequence.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        model_output = bert(**encoded)
    # Position 0 along the sequence axis is the CLS token.
    return model_output.last_hidden_state[:, 0, :]

# Example: embed one sentence and show the shape of the result.
example_embedding = get_sentence_embedding("This movie is fantastic!")
print(example_embedding.shape)  # torch.Size([1,768])


In [None]:
from transformers import pipeline

# Classify with the fine-tuned in-memory `model`. Passing the hub id "bert-base-uncased"
# instead would load the base checkpoint with a freshly initialised (untrained)
# classification head, so its predictions would be meaningless.
# `device=model.device` keeps the pipeline inputs on the same device as the model.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)

print(classifier("I love this movie!"))
print(classifier("This film is terrible."))
