# Setup: Importing Libraries
In this cell, we import the libraries needed for tokenization and model training, and report whether a GPU is available.
We use `torch` for the neural network and data loading, `transformers` for the BERT tokenizer, the classification model and the learning-rate scheduler, and `sklearn` for splitting the dataset.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
import time
# Tell the user up front which kind of device training will run on.
if not torch.cuda.is_available():
    print("No GPU available. Training will run on CPU.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")

# Installing Required Packages
Here we install the additional necessary packages, such as `datasets` for handling dataset loading, and `onnx` for exporting the trained PyTorch model into ONNX format for deployment.

In [None]:
!pip install datasets onnx

# Custom Dataset Class
This cell defines a `CustomDataset` class to handle the text and label processing.
The dataset will tokenize each text, ensuring it has a fixed maximum length, and return the corresponding tokenized input IDs, attention mask, and labels.

In [3]:
class CustomDataset(Dataset):
    """Dataset that tokenizes a single text every time an item is requested.

    An item is a dict with the flattened ``input_ids`` and ``attention_mask``
    returned by the tokenizer, plus the label as a ``torch.long`` tensor
    stored under ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Nothing is tokenized up front; only references are stored.
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One sample per text.
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text, target = str(self.texts[idx]), self.labels[idx]

        # Pad/truncate to max_length so every sample has the same size.
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the tokenizer tensors before handing them to the caller.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample



# Freezing BERT Layers
The following function freezes the embeddings and the first `num_layers_to_freeze` encoder layers of the BERT model during training. This is useful when fine-tuning, as it keeps the lower pretrained layers unchanged and focuses training on the remaining encoder layers and the classifier head.

In [4]:
def freeze_bert_layers(model, num_layers_to_freeze):
    """
    Turns off gradients for the embeddings and the first encoder layers.
    Args:
        model: Model exposing ``bert.embeddings`` and ``bert.encoder.layer``
        num_layers_to_freeze: How many encoder layers to freeze, starting
            with the first one
    """
    backbone = model.bert

    # The embeddings are always frozen, followed by the first N encoder layers
    frozen_modules = [backbone.embeddings, *backbone.encoder.layer[:num_layers_to_freeze]]
    for module in frozen_modules:
        for weight in module.parameters():
            weight.requires_grad = False

    # Count all parameters and the ones that still receive gradients
    total_params = 0
    trainable_params = 0
    for weight in model.parameters():
        size = weight.numel()
        total_params += size
        if weight.requires_grad:
            trainable_params += size

    trainable_share = 100 * trainable_params / total_params
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Percentage of trainable parameters: {trainable_share:.2f}%")


# Training Loop
This function handles training of the model for a specified number of epochs.
It uses the AdamW optimizer and a linear learning rate scheduler to adjust the learning rate over time. The model is trained on batches of data and validated on a separate validation dataset after each epoch.

In [5]:
def train_model(model, train_loader, val_loader, device, num_epochs=3):
    """Fine-tune ``model`` and print loss/accuracy figures after every epoch.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with
            ``loss`` and ``logits`` attributes.
        train_loader: Sized iterable of training batches; each batch is a
            dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        val_loader: Sized iterable of validation batches in the same format.
        device: Device every batch tensor is moved to before the forward pass.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. All metrics are printed.
    """
    # Initialize optimizer only with parameters that require gradients, so
    # frozen layers are left out of the update.
    # NOTE(review): `AdamW` is the name imported from `transformers` at the top
    # of the file; newer transformers releases appear to deprecate it in favour
    # of `torch.optim.AdamW` - confirm against the installed version.
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=2e-5
    )

    # The learning rate decays linearly to zero over all optimizer steps.
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    for epoch in range(num_epochs):
        start_time = time.time()

        # ---- Training ----
        model.train()
        total_train_loss = 0.0
        right_predictions = 0
        train_samples = 0  # actual number of samples seen, whatever the batch size
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            model.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # Running training accuracy
            preds = torch.argmax(outputs.logits, dim=1)
            right_predictions += torch.sum(preds == labels).item()
            train_samples += labels.size(0)

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            # Cap the gradient norm at 1.0 before the update
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # ---- Validation ----
        model.eval()
        total_val_loss = 0.0
        val_correct = 0
        val_samples = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_val_loss += outputs.loss.item()

                preds = torch.argmax(outputs.logits, dim=1)
                val_correct += torch.sum(preds == labels).item()
                val_samples += labels.size(0)

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = total_train_loss / max(len(train_loader), 1)
        avg_val_loss = total_val_loss / max(len(val_loader), 1)
        accuracy = val_correct / val_samples if val_samples else float('nan')
        end_time = time.time()

        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')
        print(f'Right predictions: {right_predictions} out of {train_samples}')
        print(f'Validation Accuracy: {accuracy:.4f}')
        print(f'Time taken for epoch: {end_time - start_time:.2f} seconds')
        print('-' * 60)


# Converting PyTorch Model to ONNX
The next function converts the trained PyTorch model into ONNX format, a format optimized for deployment and inference.

In [6]:
import tempfile, onnx


def convert_pytorch_to_onnx_with_tokenizer(model, tokenizer, max_length=128, onnx_file_path=None):
    """
    Converts a PyTorch model to ONNX format, using tokenizer output as input.

    Args:
    model (torch.nn.Module): The PyTorch model to be converted.
    tokenizer: The tokenizer used to preprocess the input.
    max_length (int): Maximum sequence length for the tokenizer.
    onnx_file_path (str | None): The file path where the ONNX model will be
        saved. When None, a new temporary ``.onnx`` file is created and used.

    Returns:
    tuple: ``(onnx_file_path, input_names)`` - the path the model was written
        to and the list of input names used for the export.
    """
    model.eval()

    # Prepare dummy input using the tokenizer
    dummy_input = "This is a sample input text for ONNX conversion."
    inputs = tokenizer(
        dummy_input,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Only these two tokenizer outputs are passed to the export; any other
    # keys the tokenizer returns are not used.
    input_names = ["input_ids", "attention_mask"]
    print(f"Input names: {input_names}")

    if onnx_file_path is None:
        # NamedTemporaryFile creates the file itself, unlike the deprecated
        # tempfile.mktemp, which only returns a name; delete=False keeps the
        # file on disk for the export below.
        with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as tmp_file:
            onnx_file_path = tmp_file.name

    # Only the batch dimension (axis 0) is declared dynamic.
    dynamic_axes = {name: {0: "batch_size"} for name in input_names}
    dynamic_axes["logits"] = {0: "batch_size"}
    print(f"dynamic_axes: {dynamic_axes}")

    # Export the model
    torch.onnx.export(
        model,  # model being run
        tuple(inputs[k] for k in input_names),  # model inputs
        onnx_file_path,  # where to save the model
        export_params=True,  # store the trained parameter weights inside the model file
        opset_version=20,  # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=input_names,  # the model's input names
        output_names=["logits"],  # the model's output names
        dynamic_axes=dynamic_axes,  # variable length axes
    )

    print(f"Model exported to {onnx_file_path}")

    # Verify the exported model
    onnx_model = onnx.load(onnx_file_path)
    onnx.checker.check_model(onnx_model)
    print("ONNX model is valid.")
    return onnx_file_path, input_names




# Model and Tokenizer Initialization
In this step, we initialize the `AutoTokenizer` and `AutoModelForSequenceClassification` using a pre-trained `bert-tiny` model. This model will be fine-tuned for binary sentiment classification.

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier
model_name = "prajjwal1/bert-tiny"

# Two output labels: binary classification for sentiment
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Model Training

This cell ties the previous steps together: it fine-tunes the BERT model for sentiment analysis on the IMDB dataset, converts the trained model to ONNX format, and evaluates it on a few example sentences.

In [None]:
def main(model, tokenizer, batch_size=32, num_epochs=3, max_length=128,
         onnx_file_path="./my_new_bert_model.onnx"):
    """Fine-tune on IMDB, export the model to ONNX and print sample predictions.

    Args:
        model: Sequence-classification model to fine-tune.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Batch size for the training and validation loaders.
        num_epochs: Number of training epochs.
        max_length: Maximum token length used for the datasets, the ONNX
            export and the sample predictions.
        onnx_file_path: Where the exported ONNX model is written.

    Returns:
        None. Progress and predictions are printed.
    """
    # Imported locally because no cell imports `load_dataset`; without this
    # the call below raises NameError.
    from datasets import load_dataset

    # Load IMDB dataset
    dataset = load_dataset("stanfordnlp/imdb")

    # Materialize the columns as plain lists before splitting
    train_split = dataset['train']
    all_texts = list(train_split['text'])
    all_labels = list(train_split['label'])

    # Hold out 20% of the training data as a stratified validation set
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        all_texts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
    )

    # Create datasets
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=max_length)
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=max_length)

    # Create dataloaders (only the training data is shuffled)
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=2
    )
    val_loader = DataLoader(
        val_dataset, batch_size=batch_size, num_workers=2
    )

    # Use the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Train the model
    train_model(model, train_loader, val_loader, device, num_epochs=num_epochs)

    # Move the model to CPU before exporting it
    model = model.cpu()

    # Save the fine-tuned model as an ONNX file
    onnx_file_path, input_names = convert_pytorch_to_onnx_with_tokenizer(
        model, tokenizer, max_length=max_length, onnx_file_path=onnx_file_path
    )
    print(f"ONNX file path: {onnx_file_path}")
    print(f"Input names: {input_names}")

    # Test the model on a few examples, back on the training device
    model.to(device)
    model.eval()
    test_texts = [
        "This movie was absolutely fantastic! I loved every minute of it.",
        "Terrible waste of time. The plot made no sense and the acting was awful."
    ]

    with torch.no_grad():
        inputs = tokenizer(
            test_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=1)

    # Index 1 is reported as "Positive", index 0 as "Negative"
    for text, pred in zip(test_texts, predictions):
        sentiment = "Positive" if pred[1] > pred[0] else "Negative"
        confidence = max(pred[0], pred[1]).item()
        print(f"\nText: {text}")
        print(f"Sentiment: {sentiment} (confidence: {confidence:.2%})")

# Run the full pipeline (training, ONNX export, sample predictions) with the
# model and tokenizer initialized earlier in the notebook
main(model, tokenizer)