In [1]:
from transformers import BertTokenizer, BertForNextSentencePrediction, Trainer, TrainingArguments
import torch
import pandas as pd

In [2]:
import pandas as pd
from datasets import Dataset
import random

df = pd.read_csv("/content/sst2_train.csv")

pairs = []
for i in range(len(df)-1):
    # Positive pair: next sentence
    pairs.append({
        "sentence1": df.loc[i, "sentence"],
        "sentence2": df.loc[i+1, "sentence"],
        "label": 1
    })

    # Negative pair: random sentence
    rand_idx = random.randint(0, len(df)-1)
    if rand_idx == i+1:
        rand_idx = (rand_idx + 1) % len(df)
    pairs.append({
        "sentence1": df.loc[i, "sentence"],
        "sentence2": df.loc[rand_idx, "sentence"],
        "label": 0
    })

df_pairs = pd.DataFrame(pairs)
train_dataset = Dataset.from_pandas(df_pairs)


In [3]:
df_pairs.head()

Unnamed: 0,sentence1,sentence2,label
0,hide new secretions from the parental units,"contains no wit , only labored gags",1
1,hide new secretions from the parental units,the bai brothers have taken an small slice of ...,0
2,"contains no wit , only labored gags",that loves its characters and communicates som...,1
3,"contains no wit , only labored gags",more and more frustrating,0
4,that loves its characters and communicates som...,remains utterly satisfied to remain the same t...,1


In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode(batch):
    return tokenizer(batch['sentence1'], batch['sentence2'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(encode, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/134696 [00:00<?, ? examples/s]

In [5]:
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [12]:
training_args = TrainingArguments(
    output_dir='./bert_nsp_glue',
    num_train_epochs=3,
    per_device_train_batch_size=64,   # higher batch size = faster
    learning_rate=2e-5,

    fp16=True,                        # mixed precision = MAJOR speed boost
    optim="adamw_torch_fused",        # fastest AdamW variant
    dataloader_num_workers=4,         # faster data loading
    logging_steps=50,
    save_steps=1000,
    eval_strategy="no",
    save_total_limit=2,
    remove_unused_columns=False,
    gradient_accumulation_steps=1,    # increase if you want even bigger "virtual batch"
)


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(


Step,Training Loss
50,0.6911
100,0.6934
150,0.6931
200,0.6942
250,0.6934
300,0.6935
350,0.6951
400,0.6937
450,0.6944
500,0.6929




TrainOutput(global_step=6315, training_loss=0.6935659789132959, metrics={'train_runtime': 2246.1211, 'train_samples_per_second': 179.905, 'train_steps_per_second': 2.812, 'total_flos': 2.658000503457792e+16, 'train_loss': 0.6935659789132959, 'epoch': 3.0})

In [23]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def predict_next_sentences(context, candidates, top_k=5):
    model.eval()
    results = []

    for candidate in candidates:
        text = (context, candidate)  # NSP needs sentence pairs

        inputs = tokenizer(
            text[0],
            text[1],
            return_tensors="pt",
            truncation=True,
            max_length=512
        )

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits  # shape [1, 2]
        probs = torch.softmax(logits, dim=-1)[0]

        is_next_prob = probs[0].item()  # class 0 = "IsNext"

        results.append((candidate, is_next_prob))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]


In [24]:
context = "The cat sat on the mat."
candidates = [
"It looked very comfortable.",
"I love ice cream.",
"Then it jumped onto the couch.",
"She went shopping yesterday.",
"Birds are flying in the sky.",
"It started to rain heavily."
]

top5 = predict_next_sentences(context, candidates)
print("Top 5 probable next sentences:")
for sent, prob in top5:
  print(f"{sent} -> {prob:.3f}")

Top 5 probable next sentences:
Then it jumped onto the couch. -> 0.515
It looked very comfortable. -> 0.515
She went shopping yesterday. -> 0.512
It started to rain heavily. -> 0.507
Birds are flying in the sky. -> 0.505
