# Quantization

In [17]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install pyJoules
!pip install codecarbon
!pip install zeus-ml
!pip install bitsandbytes
!pip install -Uqqq
!pip install --upgrade accelerate
!pip -qqq install bitsandbytes accelerate
!pip install optimum-quanto
!pip install quanto


[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m
Collecting quanto
  Downloading quanto-0.2.0-py3-none-any.whl.metadata (10 kB)
Downloading quanto-0.2.0-py3-none-any.whl (90 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: quanto
Successfully installed quanto-0.2.0


In [23]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, DistilBertForSequenceClassification, AutoModel, DistilBertModel, AutoModelForSequenceClassification, QuantoConfig
from datasets import load_dataset
from functools import partial
from zeus.monitor import ZeusMonitor
import numpy as np
from torch.utils.data import DataLoader
import evaluate
from torch.quantization import quantize_dynamic
import bitsandbytes as bnb
from optimum.quanto import QuantizedModelForCausalLM, qint4, quantize, qint8

In [1]:

class SimpleEnergyMonitor:
    def __init__(self, model):
        self.model = model
        self.zeus_monitor = ZeusMonitor(approx_instant_energy=True, gpu_indices=[torch.cuda.current_device()])

        self.component_energy = {
            'attention': [],
            'ffn': [],
            'embeddings': []
        }

        # Access the underlying DistilBERT model for embedding and transformer layers
        distilbert_model = self.model.distilbert

        # Register hooks for component-level energy tracking
        distilbert_model.embeddings.register_forward_hook(
            partial(self._energy_hook, 'embeddings')
        )

        for layer in distilbert_model.transformer.layer:
            layer.attention.register_forward_hook(
                partial(self._energy_hook, 'attention')
            )
            layer.ffn.register_forward_hook(
                partial(self._energy_hook, 'ffn')
            )

    def _energy_hook(self, component_name, module, input, output):
        torch.cuda.synchronize()
        self.zeus_monitor.begin_window(component_name)
        energy = self.zeus_monitor.end_window(component_name).total_energy
        self.component_energy[component_name].append(energy)

    def measure(self, input_ids, attention_mask=None):
        # Reset energy measurements
        for component in self.component_energy:
            self.component_energy[component].clear()

        self.zeus_monitor.begin_window('inference')

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)

        total_energy = self.zeus_monitor.end_window('inference').total_energy

        num_tokens = input_ids.shape[1]
        results = {
            'total_energy': total_energy,
            'energy_per_token': total_energy / num_tokens,
            'energy_per_inference': total_energy,
            'components': {
                'attention': np.sum(self.component_energy['attention']),
                'ffn': np.sum(self.component_energy['ffn']),
                'embeddings': np.sum(self.component_energy['embeddings'])
            }
        }

        return outputs.logits, results

def run_glue_energy_monitoring(model, tokenizer, dataset_name="sst2", batch_size=16):
    # Load the GLUE dataset and metric
    dataset = load_dataset("glue", dataset_name)
    metric = evaluate.load("glue", dataset_name)

    # Tokenize and prepare data for the model
    def preprocess_function(examples):
        return tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=128)

    encoded_dataset = dataset.map(preprocess_function, batched=True)
    encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # Create DataLoader for batch processing
    dataloader = DataLoader(encoded_dataset['validation'], batch_size=batch_size)

    monitor = SimpleEnergyMonitor(model)

    total_energy_consumption = []
    component_energy_sum = {'attention': 0, 'ffn': 0, 'embeddings': 0}
    all_preds = []
    all_labels = []

    for batch in dataloader:
        input_ids = batch['input_ids'].to('cuda')  # Ensure input is on the Cuda
        attention_mask = batch['attention_mask'].to('cuda')  # Ensure attention mask is on the Cuda
        labels = batch['label'].to('cuda')  # Ensure labels are on the Cuda

        logits, energy_metrics = monitor.measure(input_ids, attention_mask=attention_mask)
        total_energy_consumption.append(energy_metrics['total_energy'])

        # Convert logits to predicted class labels
        preds = torch.argmax(logits, dim=-1)
        #all_preds.extend(preds.numpy())
        #all_labels.extend(labels.numpy())
        all_preds.extend(preds)
        all_labels.extend(labels)


        # Accumulate component energy
        for component, energy in energy_metrics['components'].items():
            component_energy_sum[component] += energy

        # Print detailed metrics for each batch
        print("\nEnergy Metrics (Joules):")
        print(f"Total Energy: {energy_metrics['total_energy']:.4f}")
        print(f"Energy per Token: {energy_metrics['energy_per_token']:.4f}")
        print(f"Energy per Inference: {energy_metrics['energy_per_inference']:.4f}")
        print("\nComponent Breakdown:")
        for component, energy in energy_metrics['components'].items():
            print(f"{component.title()}: {energy:.4f}")

    # Calculate average energy per inference
    avg_energy_per_inference = np.mean(total_energy_consumption)
    print(f"\nAverage energy consumption for {dataset_name} task: {avg_energy_per_inference:.4f} Joules")

    print("\nTotal Component Breakdown for all batches:")
    for component, energy in component_energy_sum.items():
        print(f"{component.title()}: {energy:.4f}")

    # Calculate GLUE score
    metric_result = metric.compute(predictions=all_preds, references=all_labels)
    print("\nGLUE Score:")
    for key, value in metric_result.items():
        print(f"{key}: {value:.4f}")

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model name
    model_name = "distilbert-base-uncased"

    # Load model in 8-bit
    model = AutoModelForSequenceClassification.from_pretrained(
      model_name
    ).to(device)

    quantize(model, weights=qint8, activations=qint8)

    # Load tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # Run energy monitoring and evaluation on a GLUE task (e.g., SST-2)
    run_glue_energy_monitoring(model, tokenizer, dataset_name="sst2")


NameError: name 'torch' is not defined

# Code Carbon

In [3]:
!export CODECARBON_GPU_IDS="0"
!pip uninstall -y pynvml nvidia-ml-py3 nvidia-ml-py
!pip cache purge
!pip install nvidia-ml-py

[0mFound existing installation: nvidia-ml-py 12.560.30
Uninstalling nvidia-ml-py-12.560.30:
  Successfully uninstalled nvidia-ml-py-12.560.30
Files removed: 6
Collecting nvidia-ml-py
  Downloading nvidia_ml_py-12.560.30-py3-none-any.whl.metadata (8.6 kB)
Downloading nvidia_ml_py-12.560.30-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-ml-py
Successfully installed nvidia-ml-py-12.560.30


In [10]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer
from datasets import load_dataset
from functools import partial
from zeus.monitor import ZeusMonitor
import numpy as np
from torch.utils.data import DataLoader
from codecarbon import track_emissions, EmissionsTracker

class SimpleEmissionsMonitor:
    def __init__(self, model):
        self.model = model
        self.tracker = EmissionsTracker(
            output_dir="./code_carbon/",  # define the directory to which we'll write our emissions results
            output_file="emissions.csv",  # define the name of the file containing our emissions results
            log_level='error' # comment out this line to see regular output
        )

        self.component_emissions = {
            'attention': [],
            'ffn': [],
            'embeddings': []
        }
        self.component_energy = {
            'attention': [],
            'ffn': [],
            'embeddings': []
        }


        # Register hooks for component-level emission tracking
        self.model.embeddings.register_forward_hook(
            partial(self._emissions_hook, 'embeddings')
        )

        for layer in self.model.transformer.layer:
            layer.attention.register_forward_hook(
                partial(self._emissions_hook, 'attention')
            )
            layer.ffn.register_forward_hook(
                partial(self._emissions_hook, 'ffn')
            )

    def _emissions_hook(self, component_name, module, input, output):
        torch.cuda.synchronize()
        self.tracker.start_task(component_name)
        emissions = self.tracker.stop_task()
        #print(emissions)
        self.component_emissions[component_name].append(emissions.emissions*1000000)
        self.component_energy[component_name].append(emissions.energy_consumed*1000)

    def measure(self, input_ids, model):
        # Reset emission measurements
        for component in self.component_emissions:
            self.component_emissions[component].clear()
            self.component_energy[component].clear()

        with torch.no_grad():
            output = self.model(input_ids)

        self.tracker.start_task('inference')

        with torch.no_grad():
            output = model(input_ids)

        results = self.tracker.stop_task('inference')
        #print(results)
        #total_emissions = results.emissions * 1000000
        #total_energy = results.energy_consumed * 1000
        total_emissions = results.emissions*1000000+np.sum(self.component_emissions['attention']) + np.sum(self.component_emissions['ffn']) + np.sum(self.component_emissions['embeddings'])
        total_energy = results.energy_consumed*1000+np.sum(self.component_energy['attention']) + np.sum(self.component_energy['ffn']) + np.sum(self.component_energy['embeddings'])


        num_tokens = input_ids.shape[1]
        results = {
            'total_emissions': total_emissions,
            'emissions_per_token': total_emissions / num_tokens,
            'emissions_per_inference': total_emissions,
            'emission_components': {
                'attention': np.sum(self.component_emissions['attention']),
                'ffn': np.sum(self.component_emissions['ffn']),
                'embeddings': np.sum(self.component_emissions['embeddings'])
            },
            'total_energy': total_energy,
            'energy_per_token': total_energy / num_tokens,
            'energy_per_inference': total_energy,
            'energy_components': {
                'attention': np.sum(self.component_energy['attention']),
                'ffn': np.sum(self.component_energy['ffn']),
                'embeddings': np.sum(self.component_energy['embeddings'])
            }
        }

        return results

def run_glue_emissions_monitoring(model, tokenizer, dataset_name="sst2", batch_size=16):
    dataset = load_dataset("glue", dataset_name)

    # Tokenize and prepare data for the model
    def preprocess_function(examples):
        return tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=128)

    encoded_dataset = dataset.map(preprocess_function, batched=True)
    encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

    # Create DataLoader for batch processing
    dataloader = DataLoader(encoded_dataset['validation'], batch_size=batch_size)

    monitor = SimpleEmissionsMonitor(model)

    total_emissions_consumption = []
    component_emissions_sum = {'attention': 0, 'ffn': 0, 'embeddings': 0}
    total_energy_consumption = []
    component_energy_sum = {'attention': 0, 'ffn': 0, 'embeddings': 0}

    for batch in dataloader:
        input_ids = batch['input_ids'].to('cuda')
        emissions_metrics = monitor.measure(input_ids,model)
        total_emissions_consumption.append(emissions_metrics['total_emissions'])
        total_energy_consumption.append(emissions_metrics['total_energy'])

        # Accumulate component emissions
        for component, emissions in emissions_metrics['emission_components'].items():
            component_emissions_sum[component] += emissions
        for component, energy in emissions_metrics['energy_components'].items():
            component_energy_sum[component] += energy

        # Print detailed metrics for each batch
        print("\nEmissions Metrics (mg of Carbon):")
        print(f"Total Emissions: {emissions_metrics['total_emissions']:.4f}")
        print(f"Emissions per Token: {emissions_metrics['emissions_per_token']:.4f}")
        print(f"Emissions per Inference: {emissions_metrics['emissions_per_inference']:.4f}")
        print("\nEmissions Component Breakdown:")
        for component, emissions in emissions_metrics['emission_components'].items():
            print(f"{component.title()}: {emissions:.4f}")

        print("\nEnergy Metrics (Joules):")
        print(f"Total Energy: {emissions_metrics['total_energy']:.4f}")
        print(f"Energy per Token: {emissions_metrics['energy_per_token']:.4f}")
        print(f"Energy per Inference: {emissions_metrics['energy_per_inference']:.4f}")
        print("\nEnergy Component Breakdown:")
        for component, emissions in emissions_metrics['energy_components'].items():
            print(f"{component.title()}: {emissions:.4f}")
    monitor.tracker.stop()

    avg_emissions_per_inference = np.mean(total_emissions_consumption)
    tot_emissions_per_inference = np.sum(total_emissions_consumption)
    print(f"\nTotal emissions for {dataset_name} task: {tot_emissions_per_inference:.4f} mg of Carbon")
    print(f"\nAverage emissions for {dataset_name} task: {avg_emissions_per_inference:.4f} mg of Carbon")
    print("\nTotal Emissions Component Breakdown for all batches:")
    for component, emissions in component_emissions_sum.items():
        print(f"{component.title()}: {emissions:.4f}")

    avg_energy_per_inference = np.mean(total_energy_consumption)
    tot_energy_per_inference = np.sum(total_energy_consumption)
    print(f"\nTotal energy for {dataset_name} task: {tot_energy_per_inference:.4f} Joules")
    print(f"\nAverage energy for {dataset_name} task: {avg_energy_per_inference:.4f} Joules")
    print("\nTotal Energy Component Breakdown for all batches:")
    for component, emissions in component_energy_sum.items():
        print(f"{component.title()}: {emissions:.4f}")

if __name__ == "__main__":
    # Load pre-trained model and tokenizer
    model = DistilBertModel.from_pretrained("distilbert-base-uncased").to('cuda')
    model.eval()
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # Run emissions monitoring on a GLUE task (e.g., SST-2)
    run_glue_emissions_monitoring(model, tokenizer, dataset_name="sst2")





Emissions Metrics (mg of Carbon):
Total Emissions: 4.2192
Emissions per Token: 0.0330
Emissions per Inference: 4.2192

Emissions Component Breakdown:
Attention: 1.7833
Ffn: 1.2964
Embeddings: 0.6256

Energy Metrics (Joules):
Total Energy: 0.0098
Energy per Token: 0.0001
Energy per Inference: 0.0098

Energy Component Breakdown:
Attention: 0.0041
Ffn: 0.0030
Embeddings: 0.0015

Emissions Metrics (mg of Carbon):
Total Emissions: 3.2911
Emissions per Token: 0.0257
Emissions per Inference: 3.2911

Emissions Component Breakdown:
Attention: 1.8177
Ffn: 0.7795
Embeddings: 0.1706

Energy Metrics (Joules):
Total Energy: 0.0077
Energy per Token: 0.0001
Energy per Inference: 0.0077

Energy Component Breakdown:
Attention: 0.0042
Ffn: 0.0018
Embeddings: 0.0004

Emissions Metrics (mg of Carbon):
Total Emissions: 4.1088
Emissions per Token: 0.0321
Emissions per Inference: 4.1088

Emissions Component Breakdown:
Attention: 1.2491
Ffn: 1.6869
Embeddings: 0.6062

Energy Metrics (Joules):
Total Energy: 0.

  df = pd.concat(
