# MobileLLM Scaling Laws Study

Reproducible assessment of the MobileLLM family (125M to 1B parameters) to analyze scaling behavior in the sub-billion-parameter regime.

**Run on Google Colab with a GPU runtime (T4 recommended)**

## What this notebook does:
1. Loads MobileLLM models (125M, 350M, 600M, 1B)
2. Computes perplexity on WikiText-2
3. Runs downstream tasks via lm-evaluation-harness
4. Tracks system metrics (tokens/sec, memory, wall-clock)
5. Fits scaling curves and generates plots
6. Exports results to JSON/CSV

## 1. Setup

In [None]:
# Install dependencies (run once)
# The leading "!" hands each line to the shell (IPython/Colab syntax), so this
# cell only works inside a notebook, not as a plain Python script.
# NOTE(review): huggingface_hub and numpy are imported by later cells but are
# not listed here — presumably they arrive as transitive dependencies; confirm.
!pip install -q transformers datasets accelerate sentencepiece lm-eval torch
!pip install -q scipy scikit-learn matplotlib seaborn pandas tqdm psutil

In [None]:
# Report which GPU (if any) this runtime exposes
import torch

if not torch.cuda.is_available():
    print("WARNING: No GPU detected. Go to Runtime > Change runtime type > GPU")
else:
    gpu_index = 0
    gpu_props = torch.cuda.get_device_properties(gpu_index)
    print(f"GPU: {torch.cuda.get_device_name(gpu_index)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Authenticate with HuggingFace (required for gated MobileLLM models)
# login() is called without arguments, so the token is entered interactively
# instead of being written into the notebook.
from huggingface_hub import login
login()  # Enter your HF token when prompted

In [None]:
# Core imports
import os
import json
import time
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from tqdm.auto import tqdm
import psutil

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Plot appearance shared by every figure in this notebook
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

# Output folder for the JSON/CSV/PNG artifacts (created if absent)
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

## 2. Configuration

In [None]:
# MobileLLM model configurations
# Keys are the display names used for result folders, table rows and plot labels.
# Per-model fields:
#   hf_name - HuggingFace Hub repo id handed to from_pretrained() and to the
#             lm-harness model wrapper
#   params  - nominal parameter count. NOTE(review): not read by the code
#             visible in this notebook (assess_model counts parameters from
#             the loaded weights instead) — presumably kept for reference
#   layers / hidden - architecture metadata; likewise not read by visible code
MODELS = {
    "MobileLLM-125M": {
        "hf_name": "facebook/MobileLLM-125M",
        "params": 124.6e6,
        "layers": 30,
        "hidden": 576,
    },
    "MobileLLM-350M": {
        "hf_name": "facebook/MobileLLM-350M",
        "params": 345.3e6,
        "layers": 32,
        "hidden": 960,
    },
    "MobileLLM-600M": {
        "hf_name": "facebook/MobileLLM-600M",
        "params": 603.1e6,
        "layers": 40,
        "hidden": 1152,
    },
    "MobileLLM-1B": {
        "hf_name": "facebook/MobileLLM-1B",
        "params": 1.01e9,
        "layers": 54,
        "hidden": 1280,
    },
}

# Downstream tasks (matching MobileLLM paper)
# These names are passed unchanged to lm-harness and are also used later to
# pick the per-task accuracy columns when averaging.
DOWNSTREAM_TASKS = [
    "arc_easy",
    "arc_challenge",
    "boolq",
    "piqa",
    "hellaswag",
    "winogrande",
    "openbookqa",
]

In [None]:
@dataclass
class AssessmentResults:
    """Per-model record of quality and system measurements."""
    # Display name of the model (the key used in MODELS)
    model_name: str
    # Number of parameters in the assessed model
    num_params: float
    # Language-modelling perplexity
    perplexity: float
    # Mapping of downstream task name -> accuracy score
    downstream_scores: Dict[str, float]
    # Generation throughput
    tokens_per_second: float
    # Peak GPU memory, in MB
    peak_memory_mb: float
    # End-to-end duration of the assessment, in seconds
    wall_clock_seconds: float

    def to_dict(self):
        """Return the record as a plain nested dict (via dataclasses.asdict)."""
        as_plain = asdict(self)
        return as_plain

## 3. Perplexity (Sliding Window)

In [None]:
def compute_perplexity_sliding_window(
    model,
    tokenizer,
    dataset_name: str = "wikitext",
    dataset_config: str = "wikitext-2-raw-v1",
    split: str = "test",
    max_length: int = 1024,
    stride: int = 512,
    device: str = "cuda",
) -> Dict:
    """
    Compute perplexity using HuggingFace's recommended sliding window approach.

    The dataset text is concatenated and tokenized once, then scored in
    overlapping windows of ``max_length`` tokens that advance by ``stride``.
    In each window only the tokens not scored by a previous window contribute
    to the loss; the overlap serves purely as context.

    Reference: https://huggingface.co/docs/transformers/perplexity

    Args:
        model: Causal LM called as ``model(input_ids, labels=...)`` whose
            output exposes ``.loss`` (mean NLL over unmasked labels).
        tokenizer: Tokenizer called as ``tokenizer(text, return_tensors="pt")``.
        dataset_name, dataset_config, split: Passed to ``load_dataset``.
        max_length: Window size in tokens.
        stride: Step between window starts; must not exceed ``max_length``
            or tokens between windows would never be scored.
        device: Device the input ids are moved to before the forward pass.

    Returns:
        Dict with ``perplexity``, ``total_tokens`` (tokens in the dataset),
        ``scored_tokens`` (predictions that contributed to the loss),
        ``wall_clock_seconds`` and ``tokens_per_second``.

    Raises:
        ValueError: If ``max_length``/``stride`` are invalid or the dataset
            tokenizes to fewer than two tokens.
    """
    if max_length < 1 or stride < 1:
        raise ValueError("max_length and stride must be positive integers")
    if stride > max_length:
        raise ValueError("stride must not exceed max_length")

    # Load dataset
    dataset = load_dataset(dataset_name, dataset_config, split=split)

    # Concatenate all text
    text = "\n\n".join(dataset["text"])
    encodings = tokenizer(text, return_tensors="pt")

    seq_len = encodings.input_ids.size(1)
    print(f"Dataset tokens: {seq_len:,}")
    if seq_len < 2:
        raise ValueError("Need at least two tokens to compute perplexity")

    # Accumulate in Python floats (double precision) so that summing many
    # window losses cannot overflow when the model runs in float16.
    total_nll = 0.0
    scored_tokens = 0
    prev_end_loc = 0

    model.eval()
    start_time = time.time()

    with torch.no_grad():
        for begin_loc in tqdm(range(0, seq_len, stride), desc="Computing perplexity"):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # Only score non-overlapping portion

            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
            target_ids = input_ids.clone()

            # Mask tokens we've already scored
            target_ids[:, :-trg_len] = -100

            # outputs.loss is a mean over the unmasked labels that survive the
            # model's internal one-position label shift, i.e. labels[:, 1:].
            # Rescale by exactly that count rather than by trg_len.
            # NOTE(review): assumes the model shifts labels internally like
            # standard HF causal LMs — confirm for the remote-code model.
            num_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())
            if num_loss_tokens > 0:
                outputs = model(input_ids, labels=target_ids)
                total_nll += outputs.loss.item() * num_loss_tokens
                scored_tokens += num_loss_tokens

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

    wall_time = time.time() - start_time
    if scored_tokens == 0:
        raise ValueError("No tokens were scored; cannot compute perplexity")

    avg_nll = total_nll / scored_tokens
    # torch.exp saturates to inf instead of raising on very large values
    perplexity = torch.exp(torch.tensor(avg_nll, dtype=torch.float64)).item()

    return {
        "perplexity": perplexity,
        "total_tokens": seq_len,
        "scored_tokens": scored_tokens,
        "wall_clock_seconds": wall_time,
        "tokens_per_second": seq_len / wall_time if wall_time > 0 else float("inf"),
    }

## 4. System Metrics

In [None]:
def get_gpu_memory_mb() -> float:
    """Return the peak GPU memory allocated through PyTorch, in MB (bytes / 1e6).

    This reads the allocator's high-water mark, not the instantaneous usage.
    Returns 0.0 when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return 0.0
    peak_bytes = torch.cuda.max_memory_allocated()
    return peak_bytes / 1e6


def reset_memory_stats():
    """Start a fresh peak-memory measurement and release cached GPU blocks.

    Does nothing when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return
    cuda = torch.cuda
    cuda.reset_peak_memory_stats()
    cuda.empty_cache()


def measure_inference_speed(
    model,
    tokenizer,
    prompt: str = "The quick brown fox",
    num_tokens: int = 50,
    num_runs: int = 5,
    device: str = "cuda",
) -> Dict:
    """Measure greedy-generation speed in tokens/second.

    Runs one untimed warmup generation, then ``num_runs`` timed generations
    of up to ``num_tokens`` new tokens each. Throughput is computed from the
    number of tokens actually produced, so a run that stops early (e.g. at
    EOS) does not inflate the figure.

    Args:
        model: Model exposing ``eval()`` and ``generate(**inputs, ...)``.
        tokenizer: Tokenizer called as ``tokenizer(prompt, return_tensors="pt")``;
            its ``eos_token_id`` is used as the pad token id.
        prompt: Text used as the generation prefix.
        num_tokens: Maximum number of new tokens per timed run.
        num_runs: Number of timed runs (must be >= 1).
        device: Device the tokenized prompt is moved to.

    Returns:
        Dict with ``tokens_per_second``, ``avg_time_seconds``,
        ``std_time_seconds`` and ``avg_generated_tokens``.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = int(inputs["input_ids"].shape[-1])
    cuda_available = torch.cuda.is_available()

    # Warmup
    # NOTE(review): use_cache=False is kept from the original; it presumably
    # disables KV caching on purpose — confirm that is the intended benchmark.
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=10, use_cache=False)

    times = []
    generated_counts = []
    for _ in range(num_runs):
        # Synchronize so queued GPU work is not attributed to the wrong run
        if cuda_available:
            torch.cuda.synchronize()
        start = time.perf_counter()

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=num_tokens,
                do_sample=False,
                use_cache=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        if cuda_available:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        times.append(elapsed)
        # The returned sequence includes the prompt; count only new tokens
        generated_counts.append(max(int(outputs.shape[-1]) - prompt_len, 0))

    total_time = float(np.sum(times))
    total_generated = sum(generated_counts)
    avg_time = float(np.mean(times))
    if total_time > 0:
        tokens_per_second = total_generated / total_time
    else:
        tokens_per_second = float("inf")

    return {
        "tokens_per_second": tokens_per_second,
        "avg_time_seconds": avg_time,
        "std_time_seconds": float(np.std(times)),
        "avg_generated_tokens": total_generated / num_runs,
    }

## 5. Downstream Tasks (lm-harness)

In [None]:
def run_lm_harness_tasks(
    model_name: str,
    tasks: List[str],
    batch_size: int = 8,
) -> Dict[str, float]:
    """
    Run lm-harness using Python API directly.

    Loads ``model_name`` through lm-eval's HuggingFace wrapper, runs
    ``tasks`` and collects one accuracy per task.

    Args:
        model_name: HuggingFace model id passed as ``pretrained=``.
        tasks: lm-eval task names.
        batch_size: Batch size given to both the wrapper and the evaluator.

    Returns:
        Mapping of task name to its ``acc,none`` metric, or ``acc_norm,none``
        when ``acc,none`` is absent. Tasks reporting neither are omitted.
    """
    import gc

    from lm_eval import evaluator
    from lm_eval.models.huggingface import HFLM

    # Create model wrapper
    lm = HFLM(
        pretrained=model_name,
        trust_remote_code=True,
        batch_size=batch_size,
    )

    try:
        # Run assessment
        results = evaluator.simple_evaluate(
            model=lm,
            tasks=tasks,
            batch_size=batch_size,
        )
    finally:
        # Drop the harness's model copy so it does not stay on the GPU while
        # the next model is loaded.
        del lm
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Extract scores. `results` may be None, hence the `or {}` guard.
    scores = {}
    for task, metrics in (results or {}).get("results", {}).items():
        # Explicit None checks: a legitimate accuracy of 0.0 is falsy and
        # must not be replaced by acc_norm or dropped.
        acc = metrics.get("acc,none")
        if acc is None:
            acc = metrics.get("acc_norm,none")
        if acc is not None:
            scores[task] = acc

    return scores

## 6. Run Full Assessment

In [None]:
def assess_model(model_name: str, model_config: Dict, device: str = "cuda") -> AssessmentResults:
    """
    Run full assessment suite on a single model.

    Steps: load tokenizer and fp16 model, compute sliding-window perplexity,
    measure generation speed, release the model, run the downstream tasks
    through lm-harness, then write ``results.json`` under a per-model folder
    in ``RESULTS_DIR``.

    Args:
        model_name: Display name; also used (lower-cased, "-" -> "_") as the
            output folder name.
        model_config: Dict containing at least ``"hf_name"``.
        device: Device passed to the perplexity and speed measurements.

    Returns:
        The populated ``AssessmentResults`` for this model.
    """
    import gc

    print(f"\n{'='*60}")
    print(f"Assessing: {model_name}")
    print(f"{'='*60}")

    hf_name = model_config["hf_name"]
    start_time = time.time()
    reset_memory_stats()

    # Load model and tokenizer
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(hf_name, use_fast=False)
    tokenizer.add_special_tokens({"eos_token": "</s>", "bos_token": "<s>", "unk_token": "<unk>"})

    model = AutoModelForCausalLM.from_pretrained(
        hf_name,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    try:
        actual_params = sum(p.numel() for p in model.parameters())
        print(f"Parameters: {actual_params:,}")

        # 1. Perplexity
        print("\nComputing perplexity...")
        ppl_results = compute_perplexity_sliding_window(
            model, tokenizer, device=device
        )
        print(f"Perplexity: {ppl_results['perplexity']:.2f}")

        # 2. Inference speed
        print("\nMeasuring inference speed...")
        speed_results = measure_inference_speed(model, tokenizer, device=device)
        print(f"Tokens/sec: {speed_results['tokens_per_second']:.1f}")
    finally:
        # Release this copy before lm-harness loads its own: two resident
        # copies would inflate the reported peak memory and can exhaust a
        # small GPU. Running in `finally` also frees memory after a failure.
        del model
        del tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # 3. Downstream tasks
    print("\nRunning downstream tasks...")
    downstream_scores = run_lm_harness_tasks(hf_name, DOWNSTREAM_TASKS)

    for task, score in downstream_scores.items():
        print(f"  {task}: {score*100:.1f}%")

    # Collect results
    wall_clock = time.time() - start_time
    peak_memory = get_gpu_memory_mb()

    results = AssessmentResults(
        model_name=model_name,
        num_params=actual_params,
        perplexity=ppl_results["perplexity"],
        downstream_scores=downstream_scores,
        tokens_per_second=speed_results["tokens_per_second"],
        peak_memory_mb=peak_memory,
        wall_clock_seconds=wall_clock,
    )

    # Save individual model results
    model_dir = RESULTS_DIR / model_name.lower().replace("-", "_")
    model_dir.mkdir(parents=True, exist_ok=True)

    with open(model_dir / "results.json", "w") as f:
        json.dump(results.to_dict(), f, indent=2)

    return results

In [None]:
# Run assessment on all models
import traceback

all_results = []
failed_models = []

for model_name, model_config in MODELS.items():
    try:
        results = assess_model(model_name, model_config)
    except Exception as e:
        # Best effort: one failing model must not discard the others, but the
        # full traceback is printed so the cause can be diagnosed.
        print(f"Failed to assess {model_name}: {e}")
        traceback.print_exc()
        failed_models.append(model_name)
    else:
        all_results.append(results)

print(f"\nCompleted {len(all_results)}/{len(MODELS)} models")
if failed_models:
    print(f"Failed models: {', '.join(failed_models)}")

## 7. Scaling Analysis

In [None]:
# Convert results to DataFrame
# Fail early with a clear message; an empty frame would otherwise raise an
# opaque KeyError on the "downstream_scores" lookup below.
if not all_results:
    raise RuntimeError("No models were assessed successfully; nothing to analyze.")

df = pd.DataFrame([r.to_dict() for r in all_results])

# Expand downstream scores into one column per task. json_normalize is given
# a plain list of dicts so the resulting rows line up positionally with df.
downstream_df = pd.json_normalize(df["downstream_scores"].tolist())
df = pd.concat([df.drop(columns=["downstream_scores"]), downstream_df], axis=1)

# Calculate average downstream accuracy over the task columns that are present
task_cols = [c for c in df.columns if c in DOWNSTREAM_TASKS]
df["avg_accuracy"] = df[task_cols].mean(axis=1)

df

In [None]:
def fit_scaling_law(x, y, name=""):
    """
    Fit log-linear scaling law: log10(y) = alpha * log10(x) + beta

    Equivalently y = 10**beta * x**alpha. The fit is an ordinary
    least-squares regression in log10-log10 space.

    Args:
        x: Positive, finite values (e.g. parameter counts).
        y: Positive, finite values of the same length as ``x``.
        name: Label used in the printed summary.

    Returns:
        Dict of plain floats: ``slope`` (alpha), ``intercept`` (beta),
        ``r_squared``, ``std_err`` (standard error of the slope) and
        ``p_value``.

    Raises:
        ValueError: If the inputs differ in length, contain fewer than two
            points, or contain non-finite or non-positive values (the
            logarithm would be undefined).
    """
    x_arr = np.asarray(x, dtype=float).ravel()
    y_arr = np.asarray(y, dtype=float).ravel()

    if x_arr.shape != y_arr.shape:
        raise ValueError("x and y must have the same number of points")
    if x_arr.size < 2:
        raise ValueError("Need at least two points to fit a scaling law")
    for label, values in (("x", x_arr), ("y", y_arr)):
        if not np.all(np.isfinite(values)) or not np.all(values > 0):
            raise ValueError(f"{label} must contain only finite, positive values")

    log_x = np.log10(x_arr)
    log_y = np.log10(y_arr)

    slope, intercept, r_value, p_value, std_err = stats.linregress(log_x, log_y)
    r_squared = float(r_value) ** 2

    print(f"\n{name} Scaling Law:")
    print(f"  y = {10**intercept:.4f} * x^{slope:.4f}")
    print(f"  R^2 = {r_squared:.4f}")
    print(f"  Slope SE = {std_err:.4f}")

    # Plain floats keep the result JSON-serializable
    return {
        "slope": float(slope),
        "intercept": float(intercept),
        "r_squared": r_squared,
        "std_err": float(std_err),
        "p_value": float(p_value),
    }

In [None]:
# Fit both scaling laws against the measured parameter counts
params = df["num_params"].to_numpy()

# Perplexity should fall as models grow, so a negative exponent is expected
ppl_fit = fit_scaling_law(
    x=params,
    y=df["perplexity"].to_numpy(),
    name="Perplexity",
)

# Accuracy should rise as models grow, so a positive exponent is expected
acc_fit = fit_scaling_law(
    x=params,
    y=df["avg_accuracy"].to_numpy(),
    name="Average Accuracy",
)

## 8. Visualizations

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax1, ax2 = axes[0, 0], axes[0, 1]
ax3, ax4 = axes[1, 0], axes[1, 1]

# Shared look for the model-name labels placed next to each scatter point
label_style = dict(xytext=(5, 5), textcoords="offset points", fontsize=8)

# Plot 1: Perplexity vs Parameters
ax1.scatter(df["num_params"], df["perplexity"], s=100, zorder=5)
for label, n_params, ppl in zip(df["model_name"], df["num_params"], df["perplexity"]):
    ax1.annotate(label, (n_params, ppl), **label_style)

# Fitted power law drawn across the observed parameter range
log_lo = np.log10(params.min())
log_hi = np.log10(params.max())
x_fit = np.logspace(log_lo, log_hi, 100)
y_fit = 10**ppl_fit["intercept"] * x_fit**ppl_fit["slope"]
ax1.plot(x_fit, y_fit, 'r--', alpha=0.7, label=f"Fit: a={ppl_fit['slope']:.3f}")

ax1.set_xscale("log")
ax1.set_yscale("log")
ax1.set_xlabel("Parameters")
ax1.set_ylabel("Perplexity (WikiText-2)")
ax1.set_title("Perplexity Scaling")
ax1.legend()

# Plot 2: Accuracy vs Parameters (accuracy shown in percent)
acc_pct = df["avg_accuracy"] * 100
ax2.scatter(df["num_params"], acc_pct, s=100, zorder=5)
for label, n_params, acc in zip(df["model_name"], df["num_params"], acc_pct):
    ax2.annotate(label, (n_params, acc), **label_style)

ax2.set_xscale("log")
ax2.set_xlabel("Parameters")
ax2.set_ylabel("Average Accuracy (%)")
ax2.set_title("Downstream Task Scaling")

# Plot 3: Per-task breakdown (long format: one row per model/task pair)
task_data = df.melt(
    id_vars=["model_name", "num_params"],
    value_vars=task_cols,
    var_name="task",
    value_name="accuracy",
)
sns.barplot(data=task_data, x="task", y="accuracy", hue="model_name", ax=ax3)
ax3.set_ylabel("Accuracy")
ax3.set_title("Per-Task Performance")
ax3.tick_params(axis="x", rotation=45)
ax3.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left")

# Plot 4: Efficiency (accuracy per billion params)
# The column is stored on df, so it is still present when df is used later.
billions_of_params = df["num_params"] / 1e9
df["efficiency"] = df["avg_accuracy"] / billions_of_params
bars = ax4.bar(df["model_name"], df["efficiency"])
ax4.set_ylabel("Accuracy per Billion Params")
ax4.set_title("Parameter Efficiency")
ax4.tick_params(axis="x", rotation=45)

plt.tight_layout()
figure_path = RESULTS_DIR / "scaling_curves.png"
plt.savefig(figure_path, dpi=150, bbox_inches="tight")
plt.show()

## 9. Export Results

In [None]:
# Write the per-model table as CSV
csv_path = RESULTS_DIR / "scaling_results.csv"
df.to_csv(csv_path, index=False)

# Write the fitted scaling-law parameters as JSON
scaling_fits = {"perplexity": ppl_fit, "accuracy": acc_fit}
fits_path = RESULTS_DIR / "scaling_fits.json"
fits_path.write_text(json.dumps(scaling_fits, indent=2))

# List the artifacts expected in the results folder
print(f"Results saved to {RESULTS_DIR}/")
for artifact in ("scaling_results.csv", "scaling_fits.json", "scaling_curves.png"):
    print(f"  - {artifact}")

In [None]:
# Build the markdown report: header, one table row per model, file list
report = f"""# MobileLLM Scaling Laws Report

## Summary

Assessed {len(all_results)} MobileLLM models (125M to 1B parameters) on:
- WikiText-2 perplexity
- {len(DOWNSTREAM_TASKS)} downstream tasks

## Scaling Law Fits

### Perplexity
- **Formula:** PPL = {10**ppl_fit['intercept']:.2f} x N^{ppl_fit['slope']:.3f}
- **R^2:** {ppl_fit['r_squared']:.4f}

### Average Accuracy
- **Formula:** Acc = {10**acc_fit['intercept']:.4f} x N^{acc_fit['slope']:.3f}
- **R^2:** {acc_fit['r_squared']:.4f}

## Model Results

| Model | Params | Perplexity | Avg Acc | Tokens/s | Memory (MB) |
|-------|--------|------------|---------|----------|-------------|
"""

# Table body, assembled in one pass and appended to the header
table_rows = [
    f"| {row['model_name']} | {row['num_params']/1e6:.0f}M | {row['perplexity']:.2f} | {row['avg_accuracy']*100:.1f}% | {row['tokens_per_second']:.0f} | {row['peak_memory_mb']:.0f} |\n"
    for _, row in df.iterrows()
]
report += "".join(table_rows)

report += """
## Files

- `scaling_results.csv` - Raw results
- `scaling_fits.json` - Fitted scaling law parameters
- `scaling_curves.png` - Visualization
"""

report_path = RESULTS_DIR / "report.md"
report_path.write_text(report)

print("Report saved to results/report.md")
print("\n" + "="*60)
print(report)