# Setting Up Environment

In [None]:
!pip install torch torchvision transformers datasets huggingface_hub

In [None]:
!pip install accelerate -U

## Upgrading the Transformers library, since a newer version is required by the fine-tuning step.
You might need to restart the runtime after the upgrade.

In [None]:
!pip install --upgrade transformers

### Importing necessary libraries

In [None]:
# Standard library
import os
import time

# Third-party
import numpy as np
import psutil
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader
# NOTE(review): AdamW is not used in the visible cells, and newer transformers
# releases may no longer export it — confirm, and drop it if this import fails.
from transformers import Trainer, TrainingArguments, AdamW, BertTokenizer, BertForSequenceClassification

Getting access to the Hugging Face pre-trained BERT models.
https://huggingface.co/google-bert/bert-base-uncased

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

Saving model locally




In [None]:
# Save the model to a temporary directory
SAVED_MODEL_DIR = "/tmp/bert_model"
model.save_pretrained(SAVED_MODEL_DIR)

# Calculate the size of the model by summing the sizes of the saved files.
# The result gets its own name because `model_size` is defined as a function
# later in this notebook; binding an int to that name here would break
# `model_size(...)` calls if this cell were re-run afterwards.
base_model_size_bytes = sum(
    os.path.getsize(os.path.join(SAVED_MODEL_DIR, file_name))
    for file_name in os.listdir(SAVED_MODEL_DIR)
)
print(f"Pre-trained model size on disk: {base_model_size_bytes / (1024 * 1024):.2f} MB")

### Preparing data for Fine-tuning the base model on sentiment analysis
Using the IMDB reviews dataset: https://huggingface.co/datasets/stanfordnlp/imdb

In [None]:

# Load the IMDB reviews dataset through the Hugging Face `datasets` library.
dataset = load_dataset(path="imdb")

Preprocess the dataset using the BERT tokenizer to prepare input tokens

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize every split in batches, then rename the target column to "labels".
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)

# Return torch tensors and expose only the listed columns.
model_input_columns = ["input_ids", "attention_mask", "labels"]
tokenized_datasets.set_format("torch", columns=model_input_columns)

# Shuffle each split with a fixed seed, then keep only a small subset.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_test = tokenized_datasets["test"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(2000))  # Reduce dataset size if needed
test_dataset = shuffled_test.select(range(100))  # Reduce dataset size if needed

### Fine-Tuning BERT Model:

Implementing training and evaluation loops. This involves setting up the optimizer, defining the training steps, and iterating over epochs.

In [None]:
# Reload the pre-trained checkpoint with a two-label classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# The per-epoch evaluation setting is called `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. TrainingArguments
# is a dataclass, so pick whichever field the installed version defines.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# NOTE(review): with 2000 training examples, batch size 8 and 2 epochs on a
# single device there are about 500 optimizer steps in total, so
# warmup_steps=500 would make the whole run warmup — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    **{eval_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

## Quantization of Fine-Tuned Original BERT

Moving the model to CPU before quantization, since PyTorch does not support quantization on GPU: https://pytorch.org/docs/stable/quantization.html
Quantizing to 8-bit using dynamic quantization from PyTorch.



In [None]:
# Put the model in evaluation mode and move it to the CPU before quantizing.
# eval() returns the module itself, so the two calls can be chained.
model.eval().to('cpu')

# Optionally, clear the CUDA cache
torch.cuda.empty_cache()

# Dynamically quantize the nn.Linear modules using the qint8 dtype.
layers_to_quantize = {torch.nn.Linear}
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec=layers_to_quantize,
    dtype=torch.qint8,
)

Loading test data with a batch size of 8

In [None]:
# Batch the evaluation subset in groups of 8 examples.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=8)

Defining an evaluation function that reports the accuracy metrics of the inference, along with its time and memory usage.

In [None]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB."""
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

def combined_evaluation(model, dataloader):
    """Run CPU inference over `dataloader` and report quality, time and memory.

    `eval()` and `to(cpu)` are called on the passed-in model before inference.

    Args:
        model: Model called as `model(**batch)`; its output must expose `.logits`.
        dataloader: Sized iterable of dict batches of tensors. Every batch must
            contain a 'labels' entry; the whole batch, labels included, is
            passed to the model.

    Returns:
        Tuple `(precision, recall, f1, accuracy, total_time, avg_time_per_batch,
        peak_memory_usage, memory_after)` where
        - precision, recall and f1 use average='binary',
        - total_time is the wall-clock duration of the inference loop in seconds,
        - avg_time_per_batch is total_time / len(dataloader),
        - peak_memory_usage is the largest RSS increase (MB) measured across a
          single batch, not the absolute peak RSS of the process,
        - memory_after is the process RSS (MB) measured after the last batch.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    # Ensure model is in eval mode and move to CPU
    model.eval()
    device = torch.device('cpu')
    model.to(device)

    peak_memory_usage = 0
    # Bound before the loop so the name always exists, even with no batches.
    memory_after = get_memory_usage()
    predictions, true_labels = [], []

    # perf_counter is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    with torch.no_grad():
        for batch in dataloader:
            memory_before = get_memory_usage()
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=-1).tolist())
            true_labels.extend(inputs['labels'].tolist())
            memory_after = get_memory_usage()
            # Update peak memory usage if this batch's increase is higher
            peak_memory_usage = max(peak_memory_usage, memory_after - memory_before)
    total_time = time.perf_counter() - start_time

    if not true_labels:
        # Without samples the per-batch average and the metrics are undefined.
        raise ValueError("dataloader yielded no samples; nothing to evaluate")

    avg_time_per_batch = total_time / len(dataloader)

    # Calculate accuracy metrics
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    accuracy = accuracy_score(true_labels, predictions)

    # Print out the metrics
    print(f"Total inference time: {total_time:.4f} seconds")
    print(f"Average inference time per batch: {avg_time_per_batch:.4f} seconds")
    print(f"Peak memory usage: {peak_memory_usage:.2f} MB")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy, total_time, avg_time_per_batch, peak_memory_usage, memory_after

# Example usage (assuming 'quantized_model' and 'test_dataloader' are already defined)
#combined_evaluation(quantized_model, test_dataloader)

### Getting model sizes on disk: quantized vs. non-quantized

In [None]:
def model_size(model):
    """Return the size in MB of `model`'s state_dict when serialized to disk.

    The state dict is written with torch.save to a scratch file
    ("temp_model.pt" in the current working directory) and measured. The file
    is removed afterwards, including when saving or measuring fails.

    Args:
        model: Object exposing `state_dict()`.

    Returns:
        Size of the serialized state dict in MB, as a float.
    """
    temp_path = "temp_model.pt"
    state = model.state_dict()
    try:
        torch.save(state, temp_path)
        return os.path.getsize(temp_path) / (1024 * 1024)  # Convert to MB
    finally:
        # Clean up on failure too; the file may be missing if torch.save
        # failed before creating it.
        if os.path.exists(temp_path):
            os.remove(temp_path)

In [None]:
# Measure the serialized size of the quantized model, then report it.
quantized_size_mb = model_size(quantized_model)
print(f"Quantized Model Size: {quantized_size_mb} MB")

In [None]:
# `model` is the fine-tuned, non-quantized model, so label it accordingly.
print(f"Original Model Size: {model_size(model)} MB")

In [None]:
# Report how many examples the evaluation subset contains.
num_test_records = len(test_dataset)
print(f"Number of records in test data: {num_test_records}")

Comparing inference time, memory usage, and accuracy of each model

In [None]:
# Evaluate the quantized model on the test batches and keep its metrics
# under `q_`-prefixed names.
(
    q_precision,
    q_recall,
    q_f1,
    q_accuracy,
    q_total_time,
    q_avg_time_per_batch,
    q_peak_memory_usage,
    q_memory_after,
) = combined_evaluation(quantized_model, test_dataloader)

In [None]:
# Evaluate the non-quantized model on the same test batches for comparison.
(
    precision,
    recall,
    f1,
    accuracy,
    total_time,
    avg_time_per_batch,
    peak_memory_usage,
    memory_after,
) = combined_evaluation(model, test_dataloader)