# SmolLM + Sidecar GAN (Verifier-Guided Generation) Demo

This Colab notebook loads a **SmolLM** model and wraps it with a **Sidecar** verifier that scores and critiques candidate answers.

**What you'll see**
1. Baseline zero-shot answers from SmolLM
2. **Rerank** with a structured Sidecar score vector
3. **Edit→Evaluate** loop using targeted critiques (error vectors)
4. **Adversarial mining**: collect wrong answers that the Sidecar still ranks highest, as data for hardening it against Goodharting

Models: default = `HuggingFaceTB/SmolLM2-360M-Instruct` (fast). You can switch to `HuggingFaceTB/SmolLM2-1.7B-Instruct` (heavier) or `HuggingFaceTB/SmolLM-135M-Instruct` (smallest).

In [None]:
%%capture
# Install the runtime dependencies. `%%capture` hides all output of this cell,
# so a failed install is not visible here.
# NOTE(review): `--index-url` replaces PyPI for the whole command, so
# `transformers` and `accelerate` are also resolved against the PyTorch wheel
# index -- confirm they actually install; `--extra-index-url` may be intended.
!pip -q install --upgrade transformers accelerate torch --index-url https://download.pytorch.org/whl/cu121
# bitsandbytes is only needed when USE_8BIT is enabled; `|| true` makes this
# install best-effort so a failure does not abort the cell.
!pip -q install bitsandbytes==0.43.1 || true

In [None]:
#@title Select model and runtime config
# Notebook-wide settings. The `#@param` trailers are Colab form annotations;
# the names below are read as globals by later cells.

# Hugging Face model id passed to `from_pretrained`.
MODEL_ID = 'HuggingFaceTB/SmolLM2-360M-Instruct'  #@param ['HuggingFaceTB/SmolLM2-360M-Instruct', 'HuggingFaceTB/SmolLM2-1.7B-Instruct', 'HuggingFaceTB/SmolLM-135M-Instruct']
# When True the model is loaded with `load_in_8bit=True`.
USE_8BIT = False  #@param {type:'boolean'}
# Sampling settings forwarded to HFGenerator.
MAX_NEW_TOKENS = 64  #@param {type:'integer'}
TEMPERATURE = 0.7     #@param {type:'number'}
TOP_P = 0.9           #@param {type:'number'}
# Number of candidates sampled per question during reranking (RerankConfig.k).
K_CANDIDATES = 6      #@param {type:'integer'}
# Maximum number of edit rounds (EditEvalConfig.max_iters).
EDIT_MAX_ITERS = 2    #@param {type:'integer'}
# Sidecar score at which the edit loop stops (EditEvalConfig.accept_threshold).
# The mixed-case spelling is referenced verbatim by later cells; keep it.
SIDEcar_ACCEPT = 0.80 #@param {type:'number'}
# Reranked answers scoring below this are sent to the edit loop
# (RerankConfig.min_sidecar).
RERANK_MIN = 0.55     #@param {type:'number'}
# Seed for both torch sampling and Python's `random` module.
SEED = 123
import torch, random
torch.manual_seed(SEED); random.seed(SEED)
print('Using model:', MODEL_ID)

Using model: HuggingFaceTB/SmolLM2-360M-Instruct


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Prefer the GPU when one is visible to torch; otherwise run on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Keyword arguments forwarded to `AutoModelForCausalLM.from_pretrained`.
# Both branches let `device_map='auto'` decide weight placement.
if USE_8BIT:
    kwargs = {'load_in_8bit': True, 'device_map': 'auto'}
else:
    # Half precision on GPU, full precision on CPU.
    # NOTE(review): the recorded output of this cell warns that `torch_dtype`
    # is deprecated in favour of `dtype`; switch once the minimum supported
    # transformers version is confirmed.
    kwargs = {
        'torch_dtype': torch.float16 if device == 'cuda' else torch.float32,
        'device_map': 'auto',
    }

tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **kwargs)
# Prints the `device` string chosen above, not the placement picked by
# `device_map='auto'`.
print('Loaded on', device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 5177dd82-f8ab-49c5-84be-5e539ea09aeb)')' thrown while requesting HEAD https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct/resolve/main/generation_config.json
Retrying in 1s [Retry 1/5].


generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Loaded on cuda


In [None]:
from dataclasses import dataclass
import re
from typing import List

class HFGenerator:
    """Sampling wrapper around a causal language model and its tokenizer.

    `propose` samples fresh answers to a question; `edit` samples one revised
    answer given a textual critique. Both return only the text the model
    generated, never the prompt.
    """

    def __init__(self, model, tok, max_new_tokens=64, temperature=0.7, top_p=0.9):
        """Store the model, tokenizer and sampling settings.

        Args:
            model: object exposing `.device` and `.generate(...)`.
            tok: tokenizer; called on the prompt, and must expose
                `batch_decode` and `eos_token_id`.
            max_new_tokens: generation budget per sample.
            temperature: sampling temperature (read on every call, so callers
                may change it between calls).
            top_p: nucleus-sampling threshold.
        """
        self.model = model
        self.tok = tok
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

    def _build_prompt(self, question: str, critique: str = None):
        """Return the plain-text prompt, optionally extended with feedback."""
        base = (
            "You are a precise assistant. If the question is arithmetic, reply with JUST the final integer.\n\n"
            f"Question: {question}\nAnswer:"
        )
        if critique:
            base += f"\n\n# Feedback to address: {critique}\nRevised answer:"
        return base

    def _generate(self, prompt: str, n: int) -> List[str]:
        """Sample `n` continuations of `prompt` and return them stripped.

        The generated sequences contain the prompt followed by the new tokens.
        The prompt tokens are sliced off *before* decoding, so the result never
        depends on searching for the literal "Answer:" / "Revised answer:"
        markers in the decoded text. Splitting on those markers picked the
        wrong segment whenever a marker also appeared in the question or was
        repeated by the model.
        """
        inputs = self.tok(prompt, return_tensors='pt').to(self.model.device)
        out = self.model.generate(
            **inputs,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            num_return_sequences=n,
            # Reuse EOS as the padding id for generation.
            pad_token_id=self.tok.eos_token_id,
        )
        # Every returned row starts with the same prompt tokens; keep only
        # what was generated after them.
        prompt_len = inputs['input_ids'].shape[-1]
        continuations = out[:, prompt_len:]
        texts = self.tok.batch_decode(continuations, skip_special_tokens=True)
        return [text.strip() for text in texts]

    def propose(self, question: str, n: int = 1) -> List[str]:
        """Sample `n` candidate answers for `question`."""
        return self._generate(self._build_prompt(question), n)

    def edit(self, question: str, current: str, critique: str) -> str:
        """Sample one revised answer that addresses `critique`.

        `current` is accepted for interface compatibility but is not included
        in the prompt; only the question and the critique are shown to the
        model.
        """
        prompt = self._build_prompt(question, critique=critique)
        return self._generate(prompt, 1)[0]

# Shared generator instance used by the rerank, edit, evaluation and mining cells.
gen = HFGenerator(model, tok, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, top_p=TOP_P)

In [None]:
from dataclasses import dataclass
from typing import Dict, Optional
import re

@dataclass
class SidecarResult:
    """Outcome of scoring one answer with a sidecar verifier."""

    # Per-criterion scores keyed by criterion name.
    score_vector: Dict[str, float]
    # Weighted aggregate of `score_vector`.
    scalar: float
    # True when the criteria disagree or the aggregate is mid-range.
    uncertain: bool
    # Feedback text that can be handed back to the generator.
    critique: str
    # Diagnostic notes keyed by topic ('format', 'math').
    meta: Dict[str, str]


class RuleSidecar:
    """Rule-based verifier for "a + b" addition questions.

    It extracts the last integer from the answer, recomputes the sum found in
    the prompt, and combines four criteria into a weighted score.
    """

    def __init__(self, expect_number=True):
        # When False, the format criterion always receives full marks.
        self.expect_number = expect_number

    @staticmethod
    def _extract_int(s: str) -> Optional[int]:
        """Return the last (optionally negative) integer in `s`, or None."""
        # A string that is nothing but one integer yields exactly one match,
        # so taking the last match covers the bare-integer case as well.
        matches = re.findall(r"-?\d+", s.strip())
        return int(matches[-1]) if matches else None

    @staticmethod
    def _parse_sum(prompt: str) -> Optional[int]:
        """Return a + b for the first "a + b" pattern in `prompt`, or None."""
        found = re.search(r"(\d+)\s*\+\s*(\d+)", prompt)
        if not found:
            return None
        return int(found.group(1)) + int(found.group(2))

    def score(self, prompt: str, answer: str) -> SidecarResult:
        """Score `answer` against the addition asked for in `prompt`.

        Returns a SidecarResult holding the per-criterion scores, their
        weighted aggregate, an uncertainty flag, a critique string and
        diagnostic notes.
        """
        said = self._extract_int(answer)
        expected = self._parse_sum(prompt)
        scores = {}
        meta = {}

        # Format: an integer must be present when a number is expected.
        if self.expect_number and said is None:
            scores['format_correctness'] = 0.0
            meta['format'] = 'Expected a numeric final answer.'
        else:
            scores['format_correctness'] = 1.0

        # Arithmetic: neutral 0.5 when the prompt contains no recognisable sum.
        if expected is None:
            scores['arithmetic_correctness'] = 0.5
        elif said is None:
            scores['arithmetic_correctness'] = 0.0
            meta['math'] = f'Could not find a numeric answer; expected {expected}.'
        elif said == expected:
            scores['arithmetic_correctness'] = 1.0
        else:
            scores['arithmetic_correctness'] = 0.0
            meta['math'] = f'Sum {expected} expected but model said {said}.'

        # Conciseness: answers longer than 60 characters lose half the credit.
        scores['logic_step_1'] = 0.5 if len(answer) > 60 else 1.0
        # Safety is not checked by this rule set; it always gets full marks.
        scores['safety'] = 1.0

        weights = {
            'format_correctness': 0.25,
            'arithmetic_correctness': 0.55,
            'logic_step_1': 0.15,
            'safety': 0.05,
        }
        total = 0
        for name, weight in weights.items():
            total += scores[name] * weight

        # Uncertain when format and arithmetic point in opposite directions,
        # or when the aggregate lands in the middle band.
        fmt = scores['format_correctness']
        arith = scores['arithmetic_correctness']
        disagree = (fmt > 0.7 and arith < 0.3) or (fmt < 0.3 and arith > 0.7)
        uncertain = disagree or (0.35 <= total <= 0.65)

        tips = []
        if fmt < 1.0:
            tips.append('Return a bare integer as the final answer.')
        if arith < 1.0 and expected is not None:
            tips.append(f'The computed sum should be {expected}. Recalculate carefully.')
        if scores['logic_step_1'] < 1.0:
            tips.append('Be concise; avoid unnecessary filler.')
        critique = ' '.join(tips) or 'Looks good.'

        return SidecarResult(scores, total, uncertain, critique, meta)

# Shared verifier instance; uses the default expect_number=True.
sidecar = RuleSidecar()

In [None]:
from dataclasses import dataclass
from typing import Tuple, List

@dataclass
class RerankConfig:
    """Settings for sidecar-guided reranking."""

    # Number of candidates requested from the generator.
    k: int = 6
    # Acceptance floor; not read by rerank_with_sidecar itself.
    min_sidecar: float = 0.55
    # When True, results flagged uncertain are penalised during selection.
    prefer_confident: bool = True


def rerank_with_sidecar(gen, sidecar, prompt: str, cfg: RerankConfig):
    """Sample `cfg.k` candidates and return the one the sidecar likes best.

    Returns:
        (best_answer, best_result, scored), where `scored` is the list of
        (candidate, result) pairs in generation order.
    """
    scored = [(cand, sidecar.score(prompt, cand)) for cand in gen.propose(prompt, n=cfg.k)]

    def utility(pair):
        # Rank by the aggregate score, minus a small penalty for results the
        # sidecar marked as uncertain (only when confidence is preferred).
        result = pair[1]
        if cfg.prefer_confident and result.uncertain:
            return result.scalar - 0.05
        return result.scalar

    best_y, best_sr = max(scored, key=utility)
    return best_y, best_sr, scored

@dataclass
class EditEvalConfig:
    """Settings for the edit/evaluate loop."""

    # Upper bound on the number of edit rounds.
    max_iters: int = 2
    # The loop stops once the sidecar score is no longer below this value.
    accept_threshold: float = 0.8


def edit_evaluate(gen, sidecar, prompt: str, initial: str, cfg: EditEvalConfig):
    """Revise `initial` with sidecar critiques until it is accepted.

    Each round passes the latest critique to `gen.edit` and re-scores the
    revision. The loop ends when the score reaches `cfg.accept_threshold` or
    after `cfg.max_iters` rounds.

    Returns:
        (answer, result, rounds): the last answer, its sidecar result, and
        the number of edit rounds performed.
    """
    answer = initial
    result = sidecar.score(prompt, answer)
    rounds = 0
    while rounds < cfg.max_iters and result.scalar < cfg.accept_threshold:
        answer = gen.edit(prompt, answer, result.critique)
        result = sidecar.score(prompt, answer)
        rounds += 1
    return answer, result, rounds

In [None]:
# Demo: for each question, rerank sampled candidates with the sidecar and fall
# back to the edit/evaluate loop when the best candidate scores too low.
prompts = [
    'What is 17 + 8? Return just the number.',
    'Compute 45+12',
    'Please add 101 + 99.',
    'Add 7 + 13, give the final number only.'
]
# Both configs are built from the notebook-level settings.
rr_cfg = RerankConfig(k=K_CANDIDATES, min_sidecar=RERANK_MIN)
ee_cfg = EditEvalConfig(max_iters=EDIT_MAX_ITERS, accept_threshold=SIDEcar_ACCEPT)
for q in prompts:
    y0, sr0, scored = rerank_with_sidecar(gen, sidecar, q, rr_cfg)
    print(f"Q: {q}\n  Rerank -> {y0!r} | sidecar={sr0.scalar:.2f} uncertain={sr0.uncertain} vec={sr0.score_vector}")
    # Only answers below the rerank floor are sent through the edit loop.
    if sr0.scalar < rr_cfg.min_sidecar:
        y1, sr1, steps = edit_evaluate(gen, sidecar, q, y0, ee_cfg)
        print(f"  EditEval x{steps} -> {y1!r} | sidecar={sr1.scalar:.2f} uncertain={sr1.uncertain} vec={sr1.score_vector}")
    else:
        print('  Accepted after rerank.')

Q: What is 17 + 8? Return just the number.
  Rerank -> '' | sidecar=0.20 uncertain=False vec={'format_correctness': 0.0, 'arithmetic_correctness': 0.0, 'logic_step_1': 1.0, 'safety': 1.0}
  EditEval x1 -> '25' | sidecar=1.00 uncertain=False vec={'format_correctness': 1.0, 'arithmetic_correctness': 1.0, 'logic_step_1': 1.0, 'safety': 1.0}
Q: Compute 45+12
  Rerank -> '' | sidecar=0.20 uncertain=False vec={'format_correctness': 0.0, 'arithmetic_correctness': 0.0, 'logic_step_1': 1.0, 'safety': 1.0}
  EditEval x1 -> '57' | sidecar=1.00 uncertain=False vec={'format_correctness': 1.0, 'arithmetic_correctness': 1.0, 'logic_step_1': 1.0, 'safety': 1.0}
Q: Please add 101 + 99.
  Rerank -> '' | sidecar=0.20 uncertain=False vec={'format_correctness': 0.0, 'arithmetic_correctness': 0.0, 'logic_step_1': 1.0, 'safety': 1.0}
  EditEval x2 -> 'The computed sum should be 200. Recalculate carefully.' | sidecar=1.00 uncertain=False vec={'format_correctness': 1.0, 'arithmetic_correctness': 1.0, 'logic_st

In [None]:
import random, re

def true_sum(q):
    """Return a + b for the first "a + b" pattern in `q`, or None if absent."""
    found = re.search(r"(\d+)\s*\+\s*(\d+)", q)
    if not found:
        return None
    left, right = (int(part) for part in found.groups())
    return left + right

def random_addition_prompt():
    """Return a randomly phrased addition question with operands in 1..199.

    Draws from the global `random` state in a fixed order: first operand,
    second operand, then the phrasing.
    """
    a = random.randint(1, 199)
    b = random.randint(1, 199)
    templates = [
        'What is {a} + {b}? Return just the number.',
        'Compute {a}+{b}',
        'Add {a} + {b}, give the final number only.',
    ]
    return random.choice(templates).format(a=a, b=b)

def evaluate(n=20):
    """Compare baseline and sidecar-guided accuracy on random addition prompts.

    For each of `n` random prompts:
      * baseline: one sample drawn at a lowered temperature;
      * sidecar: rerank `K_CANDIDATES` samples, then run the edit/evaluate
        loop when the best candidate scores below `RERANK_MIN`.
    An answer counts as correct when the last integer it contains equals the
    true sum.

    Args:
        n: number of prompts to evaluate. `n == 0` raises ZeroDivisionError
            because the accuracies are computed as counts divided by `n`.

    Returns:
        (baseline_accuracy, sidecar_accuracy) as fractions in [0, 1].
    """
    rr_cfg = RerankConfig(k=K_CANDIDATES, min_sidecar=RERANK_MIN)
    ee_cfg = EditEvalConfig(max_iters=EDIT_MAX_ITERS, accept_threshold=SIDEcar_ACCEPT)
    base_ok = sidecar_ok = 0
    for _ in range(n):
        q = random_addition_prompt()
        tgt = true_sum(q)

        # Baseline sample at a lowered temperature. `gen` is shared notebook
        # state, so the original temperature is restored even if generation
        # raises; otherwise every later cell would keep sampling at the
        # lowered value.
        saved_temperature = gen.temperature
        gen.temperature = max(0.2, TEMPERATURE - 0.3)
        try:
            base = gen.propose(q, n=1)[0]
        finally:
            gen.temperature = saved_temperature
        base_num = RuleSidecar._extract_int(base)
        if base_num == tgt:
            base_ok += 1

        # Sidecar pipeline: rerank first, edit only when the best candidate
        # is below the rerank floor.
        y0, sr0, _ = rerank_with_sidecar(gen, sidecar, q, rr_cfg)
        if sr0.scalar < rr_cfg.min_sidecar:
            y1, sr1, _ = edit_evaluate(gen, sidecar, q, y0, ee_cfg)
            cand = y1
        else:
            cand = y0
        cand_num = RuleSidecar._extract_int(cand)
        if cand_num == tgt:
            sidecar_ok += 1
    return base_ok / n, sidecar_ok / n

# Run the comparison on 30 random prompts and report both accuracies and their difference.
base_acc, sidecar_acc = evaluate(30)
print(f'Baseline acc: {base_acc:.2%}\nSidecar acc:  {sidecar_acc:.2%}\nDelta: {sidecar_acc-base_acc:.2%}')

Baseline acc: 0.00%
Sidecar acc:  86.67%
Delta: 86.67%


In [None]:
def adversarial_mine(prompts, rounds=1, top_k=3):
    """Collect wrong numeric answers that the sidecar ranks highest.

    For every prompt, 12 candidates are sampled and sorted by sidecar score
    (highest first). Up to `top_k` candidates whose extracted integer differs
    from the true sum are recorded.

    Args:
        prompts: iterable of addition questions.
        rounds: number of passes over `prompts`.
        top_k: maximum number of samples kept per prompt and round; values
            <= 0 keep nothing.

    Returns:
        A list of dicts with keys 'q' (prompt), 'y' (answer), 'sidecar'
        (aggregate score) and 'true' (expected sum).
    """
    mined = []
    for _ in range(rounds):
        for q in prompts:
            cands = gen.propose(q, n=12)
            scored = [(y, sidecar.score(q, y)) for y in cands]
            scored.sort(key=lambda t: t[1].scalar, reverse=True)
            tgt = RuleSidecar._parse_sum(q)
            kept = 0
            for y, sr in scored:
                # Check the budget before recording, so that top_k <= 0 mines
                # nothing instead of one sample per prompt.
                if kept >= top_k:
                    break
                num = RuleSidecar._extract_int(y)
                if tgt is not None and num is not None and num != tgt:
                    mined.append({'q': q, 'y': y, 'sidecar': sr.scalar, 'true': tgt})
                    kept += 1
    return mined

# Mine adversarial samples for three fixed prompts and show at most five of them.
samples = adversarial_mine([
    'Compute 123+77', 'What is 55 + 46?', 'Add 19 + 31, final number only.'
])
print('Adversarial samples (high sidecar but wrong):')
for s in samples[:5]:
    print(s)

Adversarial samples (high sidecar but wrong):


> **Note**: If the 1.7B model is slow or OOMs, switch to `SmolLM2-360M-Instruct` or `SmolLM-135M-Instruct` in the selector above.

This demo uses a simple rule-based Sidecar for arithmetic. To extend it:
- Replace `RuleSidecar` with a learned classifier (multi-head, uncertainty-aware).
- Add retrieval checks for factual tasks.
- Add JSON/schema validators for tool-using agents.
