In [66]:
!pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [67]:
from datasets import load_dataset  # Importing the `load_dataset` function from the Hugging Face Datasets library
from transformers import AutoTokenizer, DataCollatorWithPadding  # Importing tools for tokenization and data collation

In [68]:
# Load the GLUE MRPC dataset
raw_datasets = load_dataset("glue", "mrpc")  # The GLUE MRPC dataset contains sentence pairs for paraphrase detection.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [69]:
raw_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [70]:
raw_datasets['train'][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [71]:
raw_datasets['train'][0]['idx']

0

In [72]:
# Define the checkpoint for the tokenizer
checkpoint = "bert-base-uncased"  # Use the BERT base model with uncased tokenization
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # Load the tokenizer associated with the specified checkpoint

In [73]:
# Define a function to tokenize input examples
def tokenize_function(example):
    # Tokenize the sentence pairs with truncation to fit the model's input size
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Apply the tokenize function to the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Use the map method with `batched=True` to tokenize all examples efficiently in batches

Map: 100%|██████████| 1725/1725 [00:00<00:00, 15963.30 examples/s]


In [74]:
tokenized_datasets['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [75]:
# Create a data collator for dynamically padding input data
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# This ensures input tensors are padded to the longest sequence in a batch, making them ready for model training

In [76]:
# Remove unnecessary columns from the tokenized dataset
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# The columns "sentence1", "sentence2", and "idx" are no longer needed after tokenization,
# so we remove them to keep only the relevant data for model training.

# Rename the "label" column to "labels"
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# The model expects the column containing target values to be named "labels",
# so we rename the "label" column accordingly.

# Set the dataset format to PyTorch tensors
tokenized_datasets.set_format("torch")
# This ensures the dataset outputs are in a format compatible with PyTorch during training.

tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [77]:
tokenized_datasets["train"][0]

{'labels': tensor(1),
 'input_ids': tensor([  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
          2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
          3436,  2010,  3350,  1012,   102,  7727,  2000,  2032,  2004,  2069,
          1000,  1996,  7409,  1000,  1010,  2572,  3217,  5831,  5496,  2010,
          2567,  1997,  9969,  4487, 23809,  3436,  2010,  3350,  1012,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1])}

In [78]:
from torch.utils.data import DataLoader  # Import DataLoader to handle batching and shuffling of data

# Create a DataLoader for the training dataset
train_dataloader = DataLoader(
    tokenized_datasets["train"],  # Use the training split of the tokenized dataset
    shuffle=True,  # Shuffle the data during training to improve model generalization
    batch_size=16,  # Process data in batches of 8 samples
    collate_fn=data_collator  # Use the data collator for dynamic padding of input sequences
)

# Create a DataLoader for the validation dataset
test_dataloader = DataLoader(
    tokenized_datasets["validation"],  # Use the validation split of the tokenized dataset
    batch_size=16,  # Process data in batches of 8 samples
    collate_fn=data_collator  # Use the same data collator for consistency
)

# The DataLoaders handle the tokenized datasets and ensure the data is prepared for the model
# in the correct format with batching, padding, and shuffling (for training).

In [79]:
for batch in train_dataloader:
    print(batch['labels'].shape)
    print(batch['input_ids'].shape)
    print(batch['attention_mask'].shape)
    print(batch['token_type_ids'].shape)
    break  # Break after retrieving the first batch

torch.Size([16])
torch.Size([16, 66])
torch.Size([16, 66])
torch.Size([16, 66])


In [80]:
from transformers import AutoModelForSequenceClassification  # Import the model class for sequence classification

# Load a pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,  # Use the same checkpoint as the tokenizer (e.g., "bert-base-uncased")
    num_labels=2  # Specify the number of labels for the classification task (binary classification in this case)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
# Pass the batch through the model to get outputs
outputs = model(**batch)
# The double-asterisk (`**`) unpacks the batch dictionary, passing its items as keyword arguments to the model.
# Typical inputs include 'input_ids', 'attention_mask', and 'labels' for training.
# Print the loss and logits from the model's outputs
outputs

SequenceClassifierOutput(loss=tensor(0.4172, grad_fn=<NllLossBackward0>), logits=tensor([[-0.7412,  0.4300],
        [-0.7542,  0.4315],
        [-0.7060,  0.4024],
        [-0.7343,  0.4190],
        [-0.7322,  0.4252],
        [-0.7553,  0.4364],
        [-0.7518,  0.4171],
        [-0.7043,  0.4212],
        [-0.7555,  0.4346],
        [-0.7553,  0.4268],
        [-0.7575,  0.4353],
        [-0.7523,  0.4288],
        [-0.7334,  0.4182],
        [-0.7591,  0.4291],
        [-0.7574,  0.4151],
        [-0.7537,  0.4197]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [82]:
from transformers import AdamW  # Import the AdamW optimizer from the Hugging Face transformers library

# Initialize the AdamW optimizer for training
optimizer = AdamW(
    model.parameters(),  # Pass the model's parameters to the optimizer, so it knows which parameters to update
    lr=5e-5  # Set the learning rate to 5e-5 (a commonly used learning rate for fine-tuning transformers)
)



In [83]:
len(train_dataloader)

230

In [84]:
from transformers import get_scheduler  # Import the function to get a learning rate scheduler

# Set the number of training epochs
num_epochs = 3  # The number of times the model will iterate over the entire training dataset

# Calculate the total number of training steps based on the number of epochs and the number of batches per epoch
num_training_steps = num_epochs * len(train_dataloader)

# Initialize the learning rate scheduler to adjust the learning rate during training
lr_scheduler = get_scheduler(
    "linear",  # Use a linear learning rate scheduler, where the learning rate decreases linearly from the initial value
    optimizer=optimizer,  # The optimizer to which the scheduler is applied
    num_warmup_steps=0,  # No warm-up steps, meaning the learning rate starts at the initial value immediately
    num_training_steps=num_training_steps,  # The total number of training steps
)

# Print the total number of training steps to verify the calculation
print(num_training_steps)

690


In [85]:
import torch  
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cpu')

In [86]:
len(train_dataloader.dataset)

3668

In [87]:
from sklearn.metrics import precision_score, recall_score, f1_score

def train(train_dataloader, model, optimizer, lr_scheduler, device):
    model.train()
    total_loss = 0
    total_correct = 0  # Variable to track the number of correct predictions
    total_samples = 0  # Variable to track the total number of samples
    all_predictions = []
    all_labels = []
    
    for i, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Collect predictions and labels
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        labels = batch["labels"].cpu().numpy()
        all_predictions.extend(predictions)
        all_labels.extend(labels)

        # Calculate accuracy for this batch
        correct = (predictions == labels).sum()
        total_correct += correct
        total_samples += len(labels)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        if i%20 == 0:
            print(f"Batch {i+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

    # Calculate overall metrics for training
    avg_loss = total_loss / len(train_dataloader)
    accuracy = total_correct / total_samples
    precision = precision_score(all_labels, all_predictions, average="weighted")
    recall = recall_score(all_labels, all_predictions, average="weighted")
    f1 = f1_score(all_labels, all_predictions, average="weighted")
    print(f"Training - Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


def test(test_dataloader, model, device):
    model.eval()
    test_loss = 0
    total_correct = 0  # Variable to track the number of correct predictions
    total_samples = 0  # Variable to track the total number of samples
    all_predictions = []
    all_labels = []
    num_batches = len(test_dataloader)

    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            test_loss += outputs.loss.item()

            # Collect predictions and labels
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()
            all_predictions.extend(predictions)
            all_labels.extend(labels)

            # Calculate accuracy for this batch
            correct = (predictions == labels).sum()
            total_correct += correct
            total_samples += len(labels)

    # Calculate overall metrics for testing
    avg_loss = test_loss / num_batches
    accuracy = total_correct / total_samples
    precision = precision_score(all_labels, all_predictions, average="weighted")
    recall = recall_score(all_labels, all_predictions, average="weighted")
    f1 = f1_score(all_labels, all_predictions, average="weighted")
    print(f"Testing - Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


# Main training/testing loop
epochs = 2
for t in range(epochs):
    print(f"Epoch {t+1}/{epochs}")
    train(train_dataloader, model, optimizer, lr_scheduler, device)
    test(test_dataloader, model, device)

Epoch 1/2
Batch 1/230, Loss: 0.4690
Batch 21/230, Loss: 0.5483
Batch 41/230, Loss: 0.6943
Batch 61/230, Loss: 0.5322
Batch 81/230, Loss: 0.4847
Batch 101/230, Loss: 0.6771
Batch 121/230, Loss: 0.4983
Batch 141/230, Loss: 0.5697
Batch 161/230, Loss: 0.4624
Batch 181/230, Loss: 0.4988
Batch 201/230, Loss: 0.3310
Batch 221/230, Loss: 0.6791
Training - Avg Loss: 0.5443, Accuracy: 0.7345, Precision: 0.7228, Recall: 0.7345, F1 Score: 0.7083
Testing - Avg Loss: 0.3976, Accuracy: 0.8260, Precision: 0.8226, Recall: 0.8260, F1 Score: 0.8193
Epoch 2/2
Batch 1/230, Loss: 0.2865
Batch 21/230, Loss: 0.2091
Batch 41/230, Loss: 0.2962
Batch 61/230, Loss: 0.2174
Batch 81/230, Loss: 0.2649
Batch 101/230, Loss: 0.2068
Batch 121/230, Loss: 0.1195
Batch 141/230, Loss: 0.2267
Batch 161/230, Loss: 0.1033
Batch 181/230, Loss: 0.1102
Batch 201/230, Loss: 0.1705
Batch 221/230, Loss: 0.3569
Training - Avg Loss: 0.2756, Accuracy: 0.8934, Precision: 0.8925, Recall: 0.8934, F1 Score: 0.8927
Testing - Avg Loss: 0.35

In [88]:
import os

save_path = "/kaggle/working/model"
os.makedirs(save_path, exist_ok=True)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('/kaggle/working/model/tokenizer_config.json',
 '/kaggle/working/model/special_tokens_map.json',
 '/kaggle/working/model/vocab.txt',
 '/kaggle/working/model/added_tokens.json',
 '/kaggle/working/model/tokenizer.json')

In [89]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/model")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/model")

In [90]:
def make_prediction(text1, text2):
    inputs = tokenizer(text1, text2, truncation=True, padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()
    return prediction

# Example usage Check  2 sentences are equal
text1 = "The quick brown fox jumps over the lazy dog."
text2 = "A fast brown fox leaps over a lazy dog."
prediction = make_prediction(text1, text2)

print(f"Prediction: {prediction}")

Prediction: 1


In [91]:
!pip install ipywidgets

from huggingface_hub import notebook_login

notebook_login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [92]:
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="/kaggle/working/model/config.json",  # Path to the file
    path_in_repo="config.json",  # Path where the file will be stored in the repo
    repo_id="Parit1/dummy",  # Replace with your repository ID
)
upload_file(
    path_or_fileobj="/kaggle/working/model/model.safetensors",  # Path to the file
    path_in_repo="model.safetensors",  # Path where the file will be stored in the repo
    repo_id="Parit1/dummy",  # Replace with your repository ID
)
upload_file(
    path_or_fileobj="/kaggle/working/model/special_tokens_map.json",  # Path to the file
    path_in_repo="special_tokens_map.json",  # Path where the file will be stored in the repo
    repo_id="Parit1/dummy",  # Replace with your repository ID
)
upload_file(
    path_or_fileobj="/kaggle/working/model/tokenizer.json",  # Path to the file
    path_in_repo="tokenizer.json",  # Path where the file will be stored in the repo
    repo_id="Parit1/dummy",  # Replace with your repository ID
)
upload_file(
    path_or_fileobj="/kaggle/working/model/tokenizer_config.json",  # Path to the file
    path_in_repo="tokenizer_config.json",  # Path where the file will be stored in the repo
    repo_id="Parit1/dummy",  # Replace with your repository ID
)
upload_file(
    path_or_fileobj="/kaggle/working/model/vocab.txt",  # Path to the file
    path_in_repo="vocab.txt",  # Path where the file will be stored in the repo
    repo_id="Parit1/dummy",  # Replace with your repository ID
)

No files have been modified since last commit. Skipping to prevent empty commit.
model.safetensors: 100%|██████████| 438M/438M [00:10<00:00, 42.0MB/s] 
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Parit1/dummy/commit/f0204ee538826d689d54d28ca84bdfa2a6b057b4', commit_message='Upload vocab.txt with huggingface_hub', commit_description='', oid='f0204ee538826d689d54d28ca84bdfa2a6b057b4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Parit1/dummy', endpoint='https://huggingface.co', repo_type='model', repo_id='Parit1/dummy'), pr_revision=None, pr_num=None)

In [95]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("Parit1/dummy")
tokenizer = AutoTokenizer.from_pretrained("Parit1/dummy")

def make_prediction(text1, text2):
    inputs = tokenizer(text1, text2, truncation=True, padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()
    return prediction

# Example usage Check  2 sentences are equal
text1 = "The quick brown fox jumps over the lazy dog."
text2 = "A fast brown fox leaps over a lazy dog."
prediction = make_prediction(text1, text2)

print(f"Prediction: {prediction}")

Prediction: 1
