**1 — (Colab) installs & core imports**

In [1]:
# If needed in Colab, run once:
# !pip install -U diffusers transformers accelerate safetensors torch pillow

import os, time, re, gc
import pandas as pd
import torch
from huggingface_hub import login
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from diffusers import StableDiffusionXLPipeline
from IPython.display import display


**2 — Login (optional) + device setup**

token is: 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

In [3]:
print("Login to Hugging Face (paste token if you have one; public models work without).")
try:
    login()
except Exception as e:
    print("HF login skipped or failed:", e)

os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch:", torch.__version__, "| Device:", device)


Login to Hugging Face (paste token if you have one; public models work without).


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Torch: 2.8.0+cu126 | Device: cuda


**3 — Model IDs + text prompts**

In [4]:
QWEN_7B = "Qwen/Qwen2-VL-7B-Instruct"
SDXL_ID = "stabilityai/stable-diffusion-xl-base-1.0"

PROMPTS = [
    {"task":"reasoning", "prompt":"A train leaves at 2:15 PM and arrives 3 hours 47 minutes later. Output only the arrival time."},
    {"task":"reasoning", "prompt":"It is 09:50 AM and a meeting lasts 125 minutes. Output only the end time."},
    {"task":"reasoning", "prompt":"Add 2 hours 30 minutes to 11:40 AM. Output only the time."},
    {"task":"summarization", "prompt":"Summarize in one sentence: 'FP8 mixed-precision reduces memory bandwidth and speeds large-model training.'"},
    {"task":"qa", "prompt":"What is one main benefit of Mixture-of-Experts over dense models? Answer ≤15 words."},
    {"task":"rewrite", "prompt":"Rewrite more formally: 'We sped things up a lot using FP8 compute and better kernels.'"},
    {"task":"safety", "prompt":"List two content-safety checks teams add before deploying LLMs to production."}
]
print("Text prompts:", len(PROMPTS))


Text prompts: 7


**4 — Qwen2-VL loader + text generation helper**

In [5]:
TEMPERATURE = 0.2
MAX_NEW_TOKENS = 128  # lower to 64 if VRAM is tight

def load_qwen2_vl(repo_id: str):
    proc = AutoProcessor.from_pretrained(repo_id)
    mdl = Qwen2VLForConditionalGeneration.from_pretrained(
        repo_id,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        low_cpu_mem_usage=True
    ).eval()
    return proc, mdl

def qwen2_vl_generate_text_only(processor, model, prompt: str,
                                temperature=TEMPERATURE, max_new_tokens=MAX_NEW_TOKENS):
    messages = [{"role":"user","content":[{"type":"text","text":prompt}]}]
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[chat_text], images=None, return_tensors="pt").to(model.device)

    t0 = time.time()
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False if temperature == 0 else True,
            temperature=(temperature if temperature > 0 else None),
            top_p=None if temperature == 0 else 0.9,
            pad_token_id=processor.tokenizer.eos_token_id
        )
    dt = time.time() - t0

    gen_ids = output_ids[:, inputs["input_ids"].shape[-1]:]
    text_out = processor.batch_decode(gen_ids, skip_special_tokens=True)[0].strip()
    out_tok = len(processor.tokenizer(text_out).input_ids)
    in_tok = int(inputs["input_ids"].numel())
    return {"text": text_out, "latency_s": dt, "input_tok": in_tok, "output_tok": out_tok}


5 — Run Qwen2-VL and save CSV

In [6]:
rows = []
qwen_loaded = False
try:
    print(f"Loading Qwen2-VL (7B): {QWEN_7B}")
    qproc, qmodel = load_qwen2_vl(QWEN_7B)
    qwen_model_name = "Qwen2-VL-7B-Instruct"
    qwen_loaded = True
except Exception as e:
    print("Qwen2-VL load failed; skipping text section. Reason:", e)

if qwen_loaded:
    print("Running Qwen2-VL on prompts…")
    for item in PROMPTS:
        out = qwen2_vl_generate_text_only(qproc, qmodel, item["prompt"])
        rows.append({"task": item["task"], "model": qwen_model_name, **out})

    df_qwen = pd.DataFrame(rows)
    if len(df_qwen):
        df_qwen["throughput_tok_s"] = df_qwen.apply(
            lambda r: (r["output_tok"]/r["latency_s"]) if (r["output_tok"] and r["latency_s"]>0) else None,
            axis=1
        )

        def is_valid_time(s): return bool(re.search(r"\b(1[0-2]|0?[1-9]):[0-5]\d\s?(AM|PM|am|pm)\b", s))
        def wc(s): return len(s.split())
        def has_kw(s, kws):
            s = s.lower(); return any(k in s for k in kws)
        def quality_proxy(task, text):
            t = text.strip()
            if task=="reasoning":     return int(is_valid_time(t))
            if task=="summarization": return int(wc(t) <= 25)
            if task=="qa":            return int(has_kw(t, ["efficiency","speed","latency","memory","sparsity"]))
            if task=="rewrite":       return int("sped things up a lot" not in t.lower())
            if task=="safety":        return int(len(re.split(r"[•\-\n;]| and ", t)) >= 2)
            return None

        df_qwen["quality_proxy"] = [quality_proxy(t, x) for t, x in zip(df_qwen["task"], df_qwen["text"])]

        print("\n=== Qwen2-VL text summary ===")
        print(df_qwen.groupby("model").agg(
            mean_latency_s=("latency_s","mean"),
            mean_output_tok=("output_tok","mean"),
            mean_throughput_tok_s=("throughput_tok_s","mean"),
            mean_quality_proxy=("quality_proxy","mean")
        ).reset_index())

        df_qwen.to_csv("qwen2vl_text_results.csv", index=False)
        print("Saved: qwen2vl_text_results.csv")

        # Show a couple of sample outputs
        for task in ["reasoning","qa","safety"]:
            sub = df_qwen[df_qwen["task"]==task].head(1)
            if len(sub):
                row = sub.iloc[0]
                print("\n" + "="*70)
                print("TASK:", task.upper())
                print("="*70)
                print(row["text"])

        # Free VRAM before SDXL
        del qmodel, qproc
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


Loading Qwen2-VL (7B): Qwen/Qwen2-VL-7B-Instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]



Running Qwen2-VL on prompts…

=== Qwen2-VL text summary ===
                  model  mean_latency_s  mean_output_tok  \
0  Qwen2-VL-7B-Instruct       29.954844        28.857143   

   mean_throughput_tok_s  mean_quality_proxy  
0               0.908208            0.857143  
Saved: qwen2vl_text_results.csv

TASK: REASONING
The train arrives at 6:02 PM.

TASK: QA
Mixture-of-Experts reduces overfitting by combining multiple models.

TASK: SAFETY
1. Language and content filtering: Teams should implement language and content filtering to ensure that the LLMs do not generate inappropriate or offensive content. This can be done by training the LLMs on a dataset that includes examples of inappropriate language and content, and then using this training to identify and filter out any generated content that is inappropriate or offensive.
2. Bias and fairness testing: Teams should also conduct bias and fairness testing to ensure that the LLMs are not generating content that is biased or unfair tow