# Model Pruning Analysis

**L1 Unstructured Pruning with 50% Sparsity**

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torch import nn
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
import torch.nn.utils.prune as prune
import evaluate
import wandb
import time
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Process-wide switches, set via environment variables so they are visible to
# libraries that read them at initialisation time.
# NOTE(review): PYTORCH_ENABLE_MPS_FALLBACK only lets unsupported MPS ops fall back
# to CPU, and a HIGH_WATERMARK_RATIO of 0.0 appears to lift the MPS allocator's
# memory cap rather than turn MPS off — confirm against the PyTorch MPS docs.
os.environ.update({
    'PYTORCH_ENABLE_MPS_FALLBACK': '1',
    'PYTORCH_MPS_HIGH_WATERMARK_RATIO': '0.0',
    'TOKENIZERS_PARALLELISM': 'false',
})

for message in (
    "Environment configured for CPU-only execution",
    "Please restart the kernel and run all cells from the beginning",
):
    print(message)


Environment configured for CPU-only execution
Please restart the kernel and run all cells from the beginning


## Installs


In [3]:
# Install the notebook's runtime dependencies (-q keeps pip output quiet).
# NOTE(review): only emoji is version-pinned; the other packages resolve to whatever
# is current at install time — consider pinning them for reproducibility.
# NOTE(review): this cell sits after the import cell, so on a fresh environment the
# imports above would fail before these installs run.
%pip install -q evaluate
%pip install -q emoji==0.6.0
%pip install -q torch
%pip install -q transformers
%pip install -q accelerate


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Model Selection

**IMPORTANT:** Before running the notebook, modify the `MODEL_TYPE` flag in the next cell to select your desired model:
- Set `MODEL_TYPE = "roberta"` for RoBERTa model
- Set `MODEL_TYPE = "bert"` for BERTweet model

It is your responsibility to ensure that the correct model and paths are configured.


In [4]:
# Which fine-tuned model to analyse: "roberta" (RoBERTa) or "bert" (BERTweet).
MODEL_TYPE = "roberta"

# Per-model settings: (hub model name, directory of the saved model, saved file name).
# NOTE(review): the two directories use different roots ("./HuggingFace" vs
# "./Full model") — verify both exist before switching MODEL_TYPE.
_MODEL_PRESETS = {
    "roberta": (
        "cardiffnlp/twitter-roberta-base-sentiment",
        "./HuggingFace/roberta/best_roberta_model_so_far",
        "model_roberta.pt",
    ),
    "bert": (
        "finiteautomata/bertweet-base-sentiment-analysis",
        "./Full model/bert/best_bert_model_so_far",
        "model_bert.pt",
    ),
}

# Fail fast on an unknown flag instead of continuing with undefined paths.
if MODEL_TYPE not in _MODEL_PRESETS:
    raise ValueError(f"Unsupported model type: {MODEL_TYPE}")

model_name, best_model_path, model_file = _MODEL_PRESETS[MODEL_TYPE]

print(f"Using {MODEL_TYPE.upper()} model: {model_name}")
print(f"Model path: {best_model_path}")
print(f"Model file: {model_file}")


Using ROBERTA model: cardiffnlp/twitter-roberta-base-sentiment
Model path: ./HuggingFace/roberta/best_roberta_model_so_far
Model file: model_roberta.pt


In [5]:
# Re-apply the MPS-related environment switches for this kernel session.
# NOTE(review): these settings alone do not appear to disable MPS; CPU execution
# is actually enforced by the explicit `device` below — confirm intent.
os.environ.update({
    'PYTORCH_ENABLE_MPS_FALLBACK': '1',
    'PYTORCH_MPS_HIGH_WATERMARK_RATIO': '0.0',
})

# All tensors and models in this notebook are placed on the CPU.
device = torch.device("cpu")
print(f"Using device: {device}")

# On builds that expose an available MPS backend (Apple Silicon), override
# `is_built` so that callers checking it see MPS as not built.
mps_backend = getattr(torch.backends, 'mps', None)
if mps_backend is not None and mps_backend.is_available():
    mps_backend.is_built = lambda: False
    print("MPS backend disabled - using CPU only")


Using device: cpu
MPS backend disabled - using CPU only


In [6]:
print('logging to wandb.ai account')
# SECURITY: never commit an API key to a notebook. The key is taken from the
# WANDB_API_KEY environment variable; when it is unset, key=None lets wandb fall
# back to its own credential lookup (e.g. a previous `wandb login`).
# Any key that was previously hardcoded in this cell should be revoked and rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

logging to wandb.ai account


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/taltzafrir/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mttzafrir[0m ([33mat-bay-data-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Reading Raw Data


In [7]:
# The three splits are read with the same CSV options; latin-1 is used as the
# file encoding for all of them.
csv_options = {"encoding": 'latin-1'}

train = pd.read_csv("OOT_train.csv", **csv_options)
val = pd.read_csv("OOT_val.csv", **csv_options)
test = pd.read_csv("OOT_test.csv", **csv_options)


### Preprocessing


In [8]:
# Sentiment strings ordered from most negative (0) to most positive (4).
ordinal_mapping = dict(zip(
    ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive'],
    range(5),
))

# Add the numeric label column to every split.
# Series.map yields NaN for any Sentiment value that is not a key of the mapping.
for split_frame in (train, val, test):
    split_frame["ordinal_label_id"] = split_frame["Sentiment"].map(ordinal_mapping)


In [9]:
# Function to build the input string from multiple columns
def build_augmented_input(row):
    """Build the model input string for one record.

    Concatenates, in this order and separated by " | ", the fields that are
    present (non-null):
      * 'clean_tweet'
      * 'Location_standardized' — skipped when it equals 'unknown' (case-insensitive)
      * 'TweetAt'

    Args:
        row: mapping-like record supporting ``.get(key)`` (e.g. a DataFrame row).

    Returns:
        The joined string; empty when no field qualifies.
    """
    tweet = row.get('clean_tweet')
    location = row.get('Location_standardized')
    posted_at = row.get('TweetAt')

    segments = []
    if pd.notna(tweet):
        segments.append(f"{tweet}")
    # Short-circuit keeps .lower() from being called on a missing location.
    if pd.notna(location) and location.lower() != 'unknown':
        segments.append(f"{location}")
    if pd.notna(posted_at):
        segments.append(f"{posted_at}")

    return " | ".join(segments)

# Build the 'model_input' text column row by row on each split.
for split_frame in (train, val, test):
    split_frame['model_input'] = split_frame.apply(build_augmented_input, axis=1)

# Keep independent copies holding only the two columns used for modeling.
modeling_columns = ['model_input', 'ordinal_label_id']
formatted_train = train[modeling_columns].copy()
formatted_val = val[modeling_columns].copy()
formatted_test = test[modeling_columns].copy()


In [10]:
# Balance dataset by undersampling
def balance_dataset(df, target_samples_per_class=5000, num_classes=5, random_state=42):
    """Undersample over-represented classes so none exceeds a target size.

    Args:
        df: DataFrame with an 'ordinal_label_id' column.
        target_samples_per_class: maximum number of rows kept per class.
        num_classes: class ids 0 .. num_classes-1 are considered; rows with any
            other label (including NaN) are dropped from the result.
        random_state: seed used both for per-class sampling and the final shuffle.

    Returns:
        A shuffled DataFrame containing at most `target_samples_per_class` rows
        for each considered class. Classes at or below the target are kept whole.
    """
    print("Original class distribution:")
    print(df['ordinal_label_id'].value_counts().sort_index())

    balanced_dfs = []
    for class_id in range(num_classes):
        class_data = df[df['ordinal_label_id'] == class_id]
        if len(class_data) > target_samples_per_class:
            class_data = class_data.sample(n=target_samples_per_class, random_state=random_state)
            print(f"Class {class_id}: {len(class_data)} samples (undersampled)")
        else:
            print(f"Class {class_id}: {len(class_data)} samples (kept all)")
        balanced_dfs.append(class_data)

    # Concatenation leaves the rows grouped by class; shuffle so the order is mixed.
    balanced_df = pd.concat(balanced_dfs, ignore_index=True).sample(frac=1, random_state=random_state)
    print(f"Balanced dataset: {len(balanced_df)} total samples")
    return balanced_df

# Only the training split is passed through balance_dataset here; the result
# replaces formatted_train (capped at 5000 rows per class).
formatted_train = balance_dataset(formatted_train, target_samples_per_class=5000)


Original class distribution:
ordinal_label_id
0     5175
1     9230
2     6784
3    10140
4     5845
Name: count, dtype: int64
Class 0: 5000 samples (undersampled)
Class 1: 5000 samples (undersampled)
Class 2: 5000 samples (undersampled)
Class 3: 5000 samples (undersampled)
Class 4: 5000 samples (undersampled)
Balanced dataset: 25000 total samples


In [11]:
# Load the tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(data, max_length=128):
    """Tokenize the 'model_input' column of `data` with the notebook tokenizer.

    Args:
        data: DataFrame holding a 'model_input' text column.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        Whatever the tokenizer returns for the list of texts. No padding is
        applied here (padding=False), and token type ids are not requested.
    """
    texts = data['model_input'].tolist()
    tokenizer_options = {
        "truncation": True,
        "padding": False,
        "max_length": max_length,
        "add_special_tokens": True,
        "return_attention_mask": True,
        "return_token_type_ids": False,
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenize each split (train, then val, then test) with the default max_length.
train_encodings, val_encodings, test_encodings = [
    tokenize_data(split_frame)
    for split_frame in (formatted_train, formatted_val, formatted_test)
]


In [12]:
## define a PyTorch Dataset
class TweetDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping of field name -> per-example sequence
            (indexed by example position).
        labels: per-example labels, indexed the same way.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor,
        # then attach its label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Extract each split's label column as a plain Python list
# (.tolist() keeps the column's values; it performs no explicit integer cast).
train_labels, val_labels, test_labels = [
    split_frame['ordinal_label_id'].tolist()
    for split_frame in (formatted_train, formatted_val, formatted_test)
]

# Wrap encodings and labels of each split in a TweetDataset.
train_dataset = TweetDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TweetDataset(encodings=val_encodings, labels=val_labels)
test_dataset = TweetDataset(encodings=test_encodings, labels=test_labels)


In [13]:
# Metric objects loaded so far, keyed by metric name. evaluate.load is comparatively
# expensive and this function is called once per evaluation, so load each metric once.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the HuggingFace Evaluate metric `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_detailed_metrics(eval_pred):
    """Compute classification and ordinal metrics for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with argmax over axis 1; ``labels`` holds the
            reference class ids.

    Returns:
        dict containing the outputs of the accuracy, f1, precision and recall
        metrics (the last three macro-averaged), plus:
          * 'mae' — mean absolute difference between predicted and true class id
          * 'adjacent_accuracy' — share of predictions at most one class away
          * 'quadratic_weighted_kappa' — Cohen's kappa with quadratic weights,
            or 0.0 when it cannot be computed
    """
    from sklearn.metrics import cohen_kappa_score

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # Ensure array arithmetic below also works when labels arrive as a list.
    labels = np.asarray(labels)

    results = {}
    results.update(_get_metric("accuracy").compute(predictions=predictions, references=labels))
    for metric_name in ("f1", "precision", "recall"):
        results.update(
            _get_metric(metric_name).compute(predictions=predictions, references=labels, average='macro')
        )

    # Custom ordinal metrics: class ids are ordered, so distance between ids is meaningful.
    abs_error = np.abs(predictions - labels)
    results['mae'] = np.mean(abs_error)
    results['adjacent_accuracy'] = np.sum(abs_error <= 1) / len(labels)

    # Quadratic Weighted Kappa. Keep the best-effort fallback, but catch only
    # ordinary exceptions (not KeyboardInterrupt/SystemExit) and report the cause
    # instead of silently returning 0.0.
    try:
        results['quadratic_weighted_kappa'] = cohen_kappa_score(labels, predictions, weights='quadratic')
    except Exception as exc:
        print(f"Warning: quadratic weighted kappa could not be computed ({exc!r}); using 0.0")
        results['quadratic_weighted_kappa'] = 0.0

    return results


### Apply L1 Unstructured Pruning

In [14]:
# Fraction of the selected weights to zero out. This must be 0.5 to match the
# "50% sparsity" stated in the notebook title and recorded in the W&B config;
# it was previously passed as 0.3 while still being labelled 50%.
SPARSITY_LEVEL = 0.5

model_checkpoint = os.path.join(best_model_path, model_file)

# Load the original model
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects — only load
# checkpoint files from a trusted source.
print("Loading the best model from hyperparameter tuning...")
original_model = torch.load(model_checkpoint, map_location=device, weights_only=False)
original_model.eval()
print("Model loaded successfully!")

# Load a second, independent instance to prune so the original stays untouched
print(f"Applying L1 unstructured pruning ({SPARSITY_LEVEL:.0%} sparsity)...")
pruned_model = torch.load(model_checkpoint, map_location='cpu', weights_only=False)
pruned_model = pruned_model.to('cpu')
pruned_model.eval()

# Collect the weight tensors of all Linear and Embedding layers
parameters_to_prune = [
    (module, 'weight')
    for module in pruned_model.modules()
    if isinstance(module, (torch.nn.Linear, torch.nn.Embedding))
]

# Global pruning: the smallest-magnitude weights across ALL collected tensors are
# removed, so individual layers may end up more or less sparse than SPARSITY_LEVEL.
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=SPARSITY_LEVEL,
)

# Make pruning permanent by removing the pruning re-parametrization (masks)
for module, param_name in parameters_to_prune:
    prune.remove(module, param_name)

print("Model pruning complete!")


Loading the best model from hyperparameter tuning...
Model loaded successfully!
Applying L1 unstructured pruning (50% sparsity)...
Model pruning complete!


In [15]:
def get_model_size(model):
    """Return the in-memory size of `model` in MB (parameters plus buffers).

    Every element is counted at its storage width, whether or not it is zero.
    """
    total_bytes = 0
    for tensor in model.parameters():
        total_bytes += tensor.nelement() * tensor.element_size()
    for tensor in model.buffers():
        total_bytes += tensor.nelement() * tensor.element_size()
    return total_bytes / (1024 * 1024)

def get_effective_model_size(model):
    """Return the size of `model` in MB counting only non-zero parameter elements.

    Buffers are counted in full. This is a theoretical figure: the tensors are
    still stored densely, so no memory is actually saved.
    """
    nonzero_param_bytes = 0
    for tensor in model.parameters():
        nonzero_param_bytes += torch.count_nonzero(tensor).item() * tensor.element_size()

    buffer_bytes = 0
    for tensor in model.buffers():
        buffer_bytes += tensor.nelement() * tensor.element_size()

    return (nonzero_param_bytes + buffer_bytes) / (1024 * 1024)

# Compare model sizes
original_size = get_model_size(original_model)
pruned_size = get_model_size(pruned_model)
# "Effective" size ignores zeroed weights; the dense tensors themselves do not shrink.
effective_pruned_size = get_effective_model_size(pruned_model)
compression_ratio = original_size / effective_pruned_size

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print(f"\nModel Size Comparison:")
print(f"Original Model Size: {original_size:.2f} MB")
print(f"Pruned Model Size (with zeros): {pruned_size:.2f} MB")
print(f"Effective Pruned Model Size: {effective_pruned_size:.2f} MB")
print(f"Compression Ratio: {compression_ratio:.2f}x")
print(f"Size Reduction: {(1 - effective_pruned_size/original_size) * 100:.1f}%")


\nModel Size Comparison:
Original Model Size: 475.51 MB
Pruned Model Size (with zeros): 475.51 MB
Effective Pruned Model Size: 237.99 MB
Compression Ratio: 2.00x
Size Reduction: 50.0%


In [16]:
def measure_inference_time(model, tokenizer, sample_text, num_runs=50, num_warmup=5):
    """Measure the average single-input forward-pass latency of `model`.

    Args:
        model: module to benchmark; it is switched to eval mode.
        tokenizer: callable turning `sample_text` into keyword inputs for `model`.
        sample_text: the text fed to the model on every run.
        num_runs: number of timed forward passes (must be >= 1).
        num_warmup: number of untimed passes executed first.

    Returns:
        (mean, std) of the per-run wall-clock duration, in seconds.

    Raises:
        ValueError: if `num_runs` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    model.eval()

    # Tokenization is identical for every run and is not part of the timed region,
    # so do it once up front.
    inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    times = []
    with torch.no_grad():
        # Warm-up runs
        for _ in range(num_warmup):
            _ = model(**inputs)

        # Timed runs. perf_counter is monotonic and high-resolution, unlike
        # time.time(), which can be coarse and is affected by clock adjustments.
        for _ in range(num_runs):
            start_time = time.perf_counter()
            _ = model(**inputs)
            times.append(time.perf_counter() - start_time)

    return np.mean(times), np.std(times)

# Test inference speed on a single representative tweet
sample_tweet = "COVID-19 vaccines have been crucial in reducing hospitalizations and saving lives worldwide."

print("Measuring inference speed...")
original_time, original_std = measure_inference_time(original_model, tokenizer, sample_tweet)
pruned_time, pruned_std = measure_inference_time(pruned_model, tokenizer, sample_tweet)

# > 1.0 means the pruned model is faster. The pruned weights are still stored
# densely, so compare this figure against the reported standard deviations.
speedup = original_time / pruned_time

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print(f"\nInference Speed Comparison:")
print(f"Original Model: {original_time*1000:.2f} ± {original_std*1000:.2f} ms")
print(f"Pruned Model: {pruned_time*1000:.2f} ± {pruned_std*1000:.2f} ms")
print(f"Speedup: {speedup:.2f}x")


Measuring inference speed...
\nInference Speed Comparison:
Original Model: 45.58 ± 6.09 ms
Pruned Model: 44.68 ± 6.04 ms
Speedup: 1.02x


In [17]:
# Run configuration recorded with the W&B run.
# NOTE: "sparsity_level" is a literal here — keep it in sync with the `amount`
# passed to prune.global_unstructured.
pruning_run_config = {
    "model_type": f"{MODEL_TYPE}-base",
    "pruning_method": "l1_unstructured",
    "sparsity_level": 0.5,
    "device": "cpu",
    "original_model_size_mb": original_size,
    "pruned_model_size_mb": effective_pruned_size,
    "compression_ratio": compression_ratio,
    "inference_speedup": speedup,
}

# Start the W&B run that the evaluation results are logged to.
wandb.init(
    project="covid-tweet-sentiment-pruning",
    name=f"{MODEL_TYPE}-l1-unstructured-pruning",
    config=pruning_run_config,
)


In [18]:
def evaluate_model(model, dataset, model_name):
    """Evaluate `model` on `dataset` on the CPU using a HuggingFace Trainer.

    Args:
        model: the model to evaluate; it is moved to the CPU and put in eval mode.
        dataset: dataset passed to ``Trainer.evaluate``.
        model_name: prefix for the keys of the second returned dict. Note that
            inside this function it shadows the notebook-level ``model_name``.

    Returns:
        (results, formatted_results): the raw metrics dict returned by
        ``Trainer.evaluate`` and a copy whose keys are ``f"{model_name}_{key}"``.

    Side effects:
        Sets torch's global intra-op thread count and prints the chosen value.
    """
    model = model.to('cpu')
    model.eval()

    # Configure multi-core inference: use half of the cores, but at least one.
    # os.cpu_count() returns None when the count cannot be determined, which
    # would make the integer division below raise a TypeError — fall back to 1.
    num_cores = os.cpu_count() or 1
    num_threads = max(1, num_cores // 2)
    torch.set_num_threads(num_threads)
    print(f"Using {num_threads} threads for inference (out of {num_cores} available cores)")

    # Create trainer for evaluation only (no training arguments beyond eval settings)
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./temp",
            per_device_eval_batch_size=32,
            remove_unused_columns=False,
            use_cpu=True,
            dataloader_num_workers=0,
            dataloader_pin_memory=False,
        ),
        processing_class=tokenizer,
        compute_metrics=compute_detailed_metrics,
    )

    results = trainer.evaluate(dataset)

    # Same metrics, keyed with the caller-supplied prefix
    formatted_results = {f"{model_name}_{key}": value for key, value in results.items()}

    return results, formatted_results

# Evaluate both models on both splits.
# Each call uses a split-specific prefix. The four formatted dicts are merged into
# one wandb.log payload below; with the same prefix for validation and test (as
# before: "original"/"pruned"), the test metrics silently overwrote the validation ones.
print("Evaluating Original Model on Validation Set...")
val_results_original, val_formatted_original = evaluate_model(original_model, val_dataset, "original_val")

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\nEvaluating Pruned Model on Validation Set...")
val_results_pruned, val_formatted_pruned = evaluate_model(pruned_model, val_dataset, "pruned_val")

print("\nEvaluating Original Model on Test Set...")
test_results_original, test_formatted_original = evaluate_model(original_model, test_dataset, "original_test")

print("\nEvaluating Pruned Model on Test Set...")
test_results_pruned, test_formatted_pruned = evaluate_model(pruned_model, test_dataset, "pruned_test")

# Log to wandb (keys are now unique per model and split)
wandb.log({
    **val_formatted_original,
    **val_formatted_pruned,
    **test_formatted_original,
    **test_formatted_pruned
})


Evaluating Original Model on Validation Set...
Using 6 threads for inference (out of 12 available cores)


\nEvaluating Pruned Model on Validation Set...
Using 6 threads for inference (out of 12 available cores)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


\nEvaluating Original Model on Test Set...
Using 6 threads for inference (out of 12 available cores)


\nEvaluating Pruned Model on Test Set...
Using 6 threads for inference (out of 12 available cores)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
### Comprehensive Results Summary
# Prints the size, speed and accuracy comparison of the original vs. pruned model,
# logs the summary metrics to W&B and closes the run.
# All section headers use "\n" (not "\\n") so a blank line is printed instead of
# a literal backslash-n.

print("="*80)
print("PRUNING RESULTS SUMMARY")
print("="*80)

print(f"\n📊 MODEL COMPRESSION:")
print(f"Original Model Size: {original_size:.2f} MB")
print(f"Pruned Model Size (with zeros): {pruned_size:.2f} MB")
print(f"Effective Pruned Model Size: {effective_pruned_size:.2f} MB")
print(f"Compression Ratio: {compression_ratio:.2f}x")
print(f"Size Reduction: {(1 - effective_pruned_size/original_size) * 100:.1f}%")

print(f"\n⚡ INFERENCE SPEED:")
print(f"Original Model: {original_time*1000:.2f} ms")
print(f"Pruned Model: {pruned_time*1000:.2f} ms")
print(f"Speedup: {speedup:.2f}x")

print(f"\n🎯 VALIDATION SET PERFORMANCE:")
print(f"                      Original    Pruned      Difference")
print(f"Accuracy:             {val_results_original['eval_accuracy']:.4f}      {val_results_pruned['eval_accuracy']:.4f}      {val_results_pruned['eval_accuracy'] - val_results_original['eval_accuracy']:+.4f}")
print(f"F1-Score:             {val_results_original['eval_f1']:.4f}      {val_results_pruned['eval_f1']:.4f}      {val_results_pruned['eval_f1'] - val_results_original['eval_f1']:+.4f}")
print(f"QWK:                  {val_results_original['eval_quadratic_weighted_kappa']:.4f}      {val_results_pruned['eval_quadratic_weighted_kappa']:.4f}      {val_results_pruned['eval_quadratic_weighted_kappa'] - val_results_original['eval_quadratic_weighted_kappa']:+.4f}")

print(f"\n🧪 TEST SET PERFORMANCE:")
print(f"                      Original    Pruned      Difference")
print(f"Accuracy:             {test_results_original['eval_accuracy']:.4f}      {test_results_pruned['eval_accuracy']:.4f}      {test_results_pruned['eval_accuracy'] - test_results_original['eval_accuracy']:+.4f}")
print(f"F1-Score:             {test_results_original['eval_f1']:.4f}      {test_results_pruned['eval_f1']:.4f}      {test_results_pruned['eval_f1'] - test_results_original['eval_f1']:+.4f}")
print(f"QWK:                  {test_results_original['eval_quadratic_weighted_kappa']:.4f}      {test_results_pruned['eval_quadratic_weighted_kappa']:.4f}      {test_results_pruned['eval_quadratic_weighted_kappa'] - test_results_original['eval_quadratic_weighted_kappa']:+.4f}")

# Calculate performance retention: pruned metric as a percentage of the original (test set)
acc_retention = (test_results_pruned['eval_accuracy'] / test_results_original['eval_accuracy']) * 100
f1_retention = (test_results_pruned['eval_f1'] / test_results_original['eval_f1']) * 100
qwk_retention = (test_results_pruned['eval_quadratic_weighted_kappa'] / test_results_original['eval_quadratic_weighted_kappa']) * 100

print(f"\n🎯 PERFORMANCE RETENTION:")
print(f"Accuracy Retention: {acc_retention:.1f}%")
print(f"F1-Score Retention: {f1_retention:.1f}%")
print(f"QWK Retention: {qwk_retention:.1f}%")

print(f"\n💡 PRUNING SUMMARY:")
# Ad-hoc score: compression x speedup, divided by the absolute test-F1 change in
# percentage points (floored at 1 so tiny changes do not inflate the score).
efficiency_score = (compression_ratio * speedup) / max(1, abs(test_results_pruned['eval_f1'] - test_results_original['eval_f1']) * 100)
print(f"• Achieved {compression_ratio:.1f}x model compression with {speedup:.1f}x inference speedup")
print(f"• Performance degradation: {abs(test_results_pruned['eval_f1'] - test_results_original['eval_f1']):.4f} F1-score points")
print(f"• Efficiency Score: {efficiency_score:.1f} (higher is better)")

# Final wandb log with summary metrics
wandb.log({
    "final_compression_ratio": compression_ratio,
    "final_speedup": speedup,
    "final_f1_retention": f1_retention,
    "final_accuracy_retention": acc_retention,
    "final_qwk_retention": qwk_retention,
    "efficiency_score": efficiency_score
})

print(f"\n✅ Pruning analysis complete! Results logged to W&B.")
wandb.finish()


PRUNING RESULTS SUMMARY
\n📊 MODEL COMPRESSION:
Original Model Size: 475.51 MB
Pruned Model Size (with zeros): 475.51 MB
Effective Pruned Model Size: 237.99 MB
Compression Ratio: 2.00x
Size Reduction: 50.0%
\n⚡ INFERENCE SPEED:
Original Model: 45.58 ms
Pruned Model: 44.68 ms
Speedup: 1.02x
\n🎯 VALIDATION SET PERFORMANCE:
                      Original    Pruned      Difference
Accuracy:             0.8393      0.5761      -0.2633
F1-Score:             0.8412      0.4154      -0.4258
QWK:                  0.9297      0.7411      -0.1886
\n🧪 TEST SET PERFORMANCE:
                      Original    Pruned      Difference
Accuracy:             0.8338      0.5818      -0.2520
F1-Score:             0.8369      0.4200      -0.4169
QWK:                  0.9248      0.7469      -0.1779
\n🎯 PERFORMANCE RETENTION:
Accuracy Retention: 69.8%
F1-Score Retention: 50.2%
QWK Retention: 80.8%
\n💡 PRUNING SUMMARY:
• Achieved 2.0x model compression with 1.0x inference speedup
• Performance degradation: 0.41

0,1
efficiency_score,▁
eval/accuracy,█▁█▁
eval/adjacent_accuracy,█▂▇▁
eval/f1,█▁█▁
eval/loss,▁█▁█
eval/mae,▁█▁█
eval/model_preparation_time,██▁█
eval/precision,█▁█▁
eval/quadratic_weighted_kappa,█▁█▁
eval/recall,█▁█▁

0,1
efficiency_score,0.04889
eval/accuracy,0.58178
eval/adjacent_accuracy,0.93721
eval/f1,0.42
eval/loss,1.60302
eval/mae,0.4854
eval/model_preparation_time,0.0022
eval/precision,0.37226
eval/quadratic_weighted_kappa,0.74686
eval/recall,0.49473
