In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB

In [3]:
from datasets import load_dataset  # Importing the `load_dataset` function from the Hugging Face Datasets library
from transformers import AutoTokenizer, DataCollatorWithPadding  # Importing tools for tokenization and data collation

  from .autonotebook import tqdm as notebook_tqdm
E0000 00:00:1735309732.601995      10 common_lib.cc:798] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:479
E1227 14:28:52.633075135     448 oauth2_credentials.cc:238]            oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. is_balancer=0: Domain name not found {created_time:"2024-12-27T14:28:52.6330418+00:00", grpc_status:2}


In [4]:
# Load the GLUE MRPC dataset
raw_datasets = load_dataset("glue", "mrpc")  # The GLUE MRPC dataset contains sentence pairs for paraphrase detection.
raw_datasets

Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 343999.89 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 136074.75 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 354734.97 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [5]:
raw_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [6]:
raw_datasets['train'][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [7]:
raw_datasets['train'][0]['idx']

0

In [8]:
# Define the checkpoint for the tokenizer
checkpoint = "bert-base-uncased"  # Use the BERT base model with uncased tokenization
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # Load the tokenizer associated with the specified checkpoint

In [9]:
# Define a function to tokenize input examples
def tokenize_function(example):
    # Tokenize the sentence pairs with truncation to fit the model's input size
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Apply the tokenize function to the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Use the map method with `batched=True` to tokenize all examples efficiently in batches

Map: 100%|██████████| 3668/3668 [00:00<00:00, 13844.02 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 12836.53 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 16279.96 examples/s]


In [10]:
tokenized_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [11]:
# Create a data collator for dynamically padding input data
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# This ensures input tensors are padded to the longest sequence in a batch, making them ready for model training

In [12]:
# Remove unnecessary columns from the tokenized dataset
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# The columns "sentence1", "sentence2", and "idx" are no longer needed after tokenization,
# so we remove them to keep only the relevant data for model training.

# Rename the "label" column to "labels"
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# The model expects the column containing target values to be named "labels",
# so we rename the "label" column accordingly.

# Set the dataset format to PyTorch tensors
tokenized_datasets.set_format("torch")
# This ensures the dataset outputs are in a format compatible with PyTorch during training.

tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [13]:
tokenized_datasets["train"][0]

{'labels': tensor(1),
 'input_ids': tensor([  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
          2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
          3436,  2010,  3350,  1012,   102,  7727,  2000,  2032,  2004,  2069,
          1000,  1996,  7409,  1000,  1010,  2572,  3217,  5831,  5496,  2010,
          2567,  1997,  9969,  4487, 23809,  3436,  2010,  3350,  1012,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1])}

In [14]:
from torch.utils.data import DataLoader  # Import DataLoader to handle batching and shuffling of data

# Create a DataLoader for the training dataset
train_dataloader = DataLoader(
    tokenized_datasets["train"],  # Use the training split of the tokenized dataset
    shuffle=True,  # Shuffle the data during training to improve model generalization
    batch_size=16,  # Process data in batches of 8 samples
    collate_fn=data_collator  # Use the data collator for dynamic padding of input sequences
)

# Create a DataLoader for the validation dataset
test_dataloader = DataLoader(
    tokenized_datasets["validation"],  # Use the validation split of the tokenized dataset
    batch_size=16,  # Process data in batches of 8 samples
    collate_fn=data_collator  # Use the same data collator for consistency
)

# The DataLoaders handle the tokenized datasets and ensure the data is prepared for the model
# in the correct format with batching, padding, and shuffling (for training).

In [15]:
for batch in train_dataloader:
    print(batch['labels'].shape)
    print(batch['input_ids'].shape)
    print(batch['attention_mask'].shape)
    print(batch['token_type_ids'].shape)
    break  # Break after retrieving the first batch

torch.Size([16])
torch.Size([16, 65])
torch.Size([16, 65])
torch.Size([16, 65])


In [16]:
from transformers import AutoModelForSequenceClassification  # Import the model class for sequence classification

# Load a pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,  # Use the same checkpoint as the tokenizer (e.g., "bert-base-uncased")
    num_labels=2  # Specify the number of labels for the classification task (binary classification in this case)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Pass the batch through the model to get outputs
outputs = model(**batch)
# The double-asterisk (`**`) unpacks the batch dictionary, passing its items as keyword arguments to the model.
# Typical inputs include 'input_ids', 'attention_mask', and 'labels' for training.
# Print the loss and logits from the model's outputs
outputs

SequenceClassifierOutput(loss=tensor(0.6253, grad_fn=<NllLossBackward0>), logits=tensor([[0.0942, 0.5491],
        [0.0561, 0.5852],
        [0.0603, 0.5912],
        [0.0642, 0.5767],
        [0.0598, 0.5903],
        [0.0680, 0.5779],
        [0.0596, 0.5671],
        [0.0794, 0.5937],
        [0.0392, 0.6320],
        [0.0623, 0.5751],
        [0.0525, 0.5794],
        [0.0660, 0.5965],
        [0.0781, 0.5817],
        [0.1130, 0.5657],
        [0.0777, 0.6085],
        [0.0866, 0.5870]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [18]:
from transformers import AdamW  # Import the AdamW optimizer from the Hugging Face transformers library

# Initialize the AdamW optimizer for training
optimizer = AdamW(
    model.parameters(),  # Pass the model's parameters to the optimizer, so it knows which parameters to update
    lr=5e-5  # Set the learning rate to 5e-5 (a commonly used learning rate for fine-tuning transformers)
)



In [19]:
len(train_dataloader)

230

In [20]:
from transformers import get_scheduler  # Import the function to get a learning rate scheduler

# Set the number of training epochs
num_epochs = 3  # The number of times the model will iterate over the entire training dataset

# Calculate the total number of training steps based on the number of epochs and the number of batches per epoch
num_training_steps = num_epochs * len(train_dataloader)

# Initialize the learning rate scheduler to adjust the learning rate during training
lr_scheduler = get_scheduler(
    "linear",  # Use a linear learning rate scheduler, where the learning rate decreases linearly from the initial value
    optimizer=optimizer,  # The optimizer to which the scheduler is applied
    num_warmup_steps=0,  # No warm-up steps, meaning the learning rate starts at the initial value immediately
    num_training_steps=num_training_steps,  # The total number of training steps
)

# Print the total number of training steps to verify the calculation
print(num_training_steps)

690


In [21]:
import torch  
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cpu')

In [22]:
len(train_dataloader.dataset)

3668

In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score

def train(train_dataloader, model, optimizer, lr_scheduler, device):
    model.train()
    total_loss = 0
    total_correct = 0  # Variable to track the number of correct predictions
    total_samples = 0  # Variable to track the total number of samples
    all_predictions = []
    all_labels = []
    
    for i, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Collect predictions and labels
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        labels = batch["labels"].cpu().numpy()
        all_predictions.extend(predictions)
        all_labels.extend(labels)

        # Calculate accuracy for this batch
        correct = (predictions == labels).sum()
        total_correct += correct
        total_samples += len(labels)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        if i%20 == 0:
            print(f"Batch {i+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

    # Calculate overall metrics for training
    avg_loss = total_loss / len(train_dataloader)
    accuracy = total_correct / total_samples
    precision = precision_score(all_labels, all_predictions, average="weighted")
    recall = recall_score(all_labels, all_predictions, average="weighted")
    f1 = f1_score(all_labels, all_predictions, average="weighted")
    print(f"Training - Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


def test(test_dataloader, model, device):
    model.eval()
    test_loss = 0
    total_correct = 0  # Variable to track the number of correct predictions
    total_samples = 0  # Variable to track the total number of samples
    all_predictions = []
    all_labels = []
    num_batches = len(test_dataloader)

    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            test_loss += outputs.loss.item()

            # Collect predictions and labels
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()
            all_predictions.extend(predictions)
            all_labels.extend(labels)

            # Calculate accuracy for this batch
            correct = (predictions == labels).sum()
            total_correct += correct
            total_samples += len(labels)

    # Calculate overall metrics for testing
    avg_loss = test_loss / num_batches
    accuracy = total_correct / total_samples
    precision = precision_score(all_labels, all_predictions, average="weighted")
    recall = recall_score(all_labels, all_predictions, average="weighted")
    f1 = f1_score(all_labels, all_predictions, average="weighted")
    print(f"Testing - Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


# Main training/testing loop
epochs = 2
for t in range(epochs):
    print(f"Epoch {t+1}/{epochs}")
    train(train_dataloader, model, optimizer, lr_scheduler, device)
    test(test_dataloader, model, device)

Epoch 1/2
Batch 1/230, Loss: 0.5984
Batch 21/230, Loss: 0.5772
Batch 41/230, Loss: 0.4018
Batch 61/230, Loss: 0.6699
Batch 81/230, Loss: 0.4037
Batch 101/230, Loss: 0.4855
Batch 121/230, Loss: 0.4708
Batch 141/230, Loss: 0.4643
Batch 161/230, Loss: 0.4732
Batch 181/230, Loss: 0.5547
Batch 201/230, Loss: 0.4719
Batch 221/230, Loss: 0.6496
Training - Avg Loss: 0.5092, Accuracy: 0.7563, Precision: 0.7466, Recall: 0.7563, F1 Score: 0.7451
Testing - Avg Loss: 0.4015, Accuracy: 0.8309, Precision: 0.8378, Recall: 0.8309, F1 Score: 0.8332
Epoch 2/2
Batch 1/230, Loss: 0.2039
Batch 21/230, Loss: 0.1024
Batch 41/230, Loss: 0.1668
Batch 61/230, Loss: 0.1761
Batch 81/230, Loss: 0.0955
Batch 101/230, Loss: 0.5019
Batch 121/230, Loss: 0.3485
Batch 141/230, Loss: 0.3453
Batch 161/230, Loss: 0.0677
Batch 181/230, Loss: 0.1383
Batch 201/230, Loss: 0.1538
Batch 221/230, Loss: 0.1935
Training - Avg Loss: 0.2463, Accuracy: 0.9081, Precision: 0.9078, Recall: 0.9081, F1 Score: 0.9080
Testing - Avg Loss: 0.35

In [24]:
import os

save_path = "/kaggle/working/model"
os.makedirs(save_path, exist_ok=True)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('/kaggle/working/model/tokenizer_config.json',
 '/kaggle/working/model/special_tokens_map.json',
 '/kaggle/working/model/vocab.txt',
 '/kaggle/working/model/added_tokens.json',
 '/kaggle/working/model/tokenizer.json')

In [25]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/model")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/model")

In [28]:
def make_prediction(text1, text2):
    inputs = tokenizer(text1, text2, truncation=True, padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()
    return prediction

# Example usage Check  2 sentences are equal
text1 = "The quick brown fox jumps over the lazy dog."
text2 = "A fast brown fox leaps over a lazy dog."
prediction = make_prediction(text1, text2)

print(f"Prediction: {prediction}")

Prediction: 1
