# SmolVLM2 GUI Lab Notebook

This notebook mirrors the local repo structure for use in Colab. It loads the AGUVIS (Stage-1/Stage-2) and ScreenSpot-v2 datasets, previews one sample from each, and runs zero-shot checks with `smolagents/SmolVLM2-2.2B-Instruct`.

Steps:
1. Mount Drive and clone the repo.
2. Install requirements.
3. Sample Stage-1/Stage-2/ScreenSpot examples.
4. Run the base model on selected prompts.

In [None]:
# Attach the user's Google Drive to the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import os
REPO_DIR = '/content/VLM_Studies'
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/PhilSaad333/VLM_Studies.git {REPO_DIR}
os.chdir(REPO_DIR)
print('Working dir:', os.getcwd())

In [None]:
!pip install -q -r requirements.txt

In [None]:
import sys
from pathlib import Path

# Put the current working directory at the front of the import path so the
# project packages imported below resolve from it.
repo_root = Path.cwd()
sys.path.insert(0, str(repo_root))

from data_sources.aguvis.stage1 import load_stage1, STAGE1_CONFIGS
from data_sources.aguvis.stage2 import load_stage2, STAGE2_CONFIGS
from data_sources.screenspot import load_screenspot

# Show the available configs; only the first five Stage-2 entries are printed.
print('Stage-1 configs:', STAGE1_CONFIGS)
stage2_preview = STAGE2_CONFIGS[:5]
print('Stage-2 configs:', stage2_preview, '...')

## Stage-1 Sample

In [None]:
import itertools  # NOTE(review): not referenced in this cell; kept in case other cells rely on it
from IPython.display import display

# Stage-1 config to preview.
config = 'webui350k'

# Take only the first record yielded by the Stage-1 loader.
stage1_stream = load_stage1(config, streaming=True)
sample = next(iter(stage1_stream))

print('Config:', config)
print('User:', sample['user'])
print('Assistant:', sample['assistant'])

# Rich display of the sample's 'image' field.
display(sample['image'])

## Stage-2 Sample

In [None]:
# Stage-2 config to preview (rebinds `config` from the Stage-1 cell).
config = 'mind2web-l2'

# Take only the first record yielded by the Stage-2 loader.
stage2_stream = load_stage2(config, streaming=True)
sample2 = next(iter(stage2_stream))

print('Config:', config)
# The system and assistant fields are sliced to keep the printed output short.
print('System:', sample2['system'][:200])
print('User:', sample2['user'])
print('Assistant:', sample2['assistant'][:400])

# Rich display of the sample's 'image' field.
display(sample2['image'])

## ScreenSpot-v2 Sample

In [None]:
# Take only the first record yielded by the ScreenSpot loader for the
# 'validation' split.
screenspot_stream = load_screenspot(split='validation', streaming=True)
screenspot_sample = next(iter(screenspot_stream))

print('Instruction:', screenspot_sample.instruction)
print('Source:', screenspot_sample.source)

# NOTE(review): draw_bbox() presumably returns the screenshot with the target
# box overlaid — confirm against data_sources.screenspot.
bbox_preview = screenspot_sample.draw_bbox()
display(bbox_preview)

## Load SmolVLM2-2.2B-Instruct

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch

# Hub identifier of the checkpoint used for the zero-shot checks.
MODEL_NAME = 'smolagents/SmolVLM2-2.2B-Instruct'

# NOTE(review): trust_remote_code=True permits running Python code shipped in
# the model repository; only keep it for repositories you trust.
processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# device_map='auto' lets the library decide weight placement; .eval() returns
# the module itself, so the model is bound already in inference mode.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map='auto',
).eval()

print('Model loaded')

## Zero-shot Trial (Stage-1 instruction)

In [None]:
# Single-turn chat: the Stage-1 sample's image followed by its user instruction.
messages = [
    {
        'role': 'user',
        'content': [
            {'type': 'image', 'image': sample['image']},
            {'type': 'text', 'text': sample['user']}
        ]
    }
]

# return_dict=True is required here: without it, apply_chat_template with
# tokenize=True returns only the input_ids tensor, so `**inputs` below would
# fail (a tensor is not a mapping) and the image features would be dropped.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors='pt',
).to(model.device)

# Gradients are not needed for generation.
with torch.no_grad():
    generated = model.generate(**inputs, max_new_tokens=64)

# NOTE(review): generated[0] is decoded in full, so the printed text likely
# contains the prompt as well as the completion — confirm this is intended.
output = processor.tokenizer.decode(generated[0], skip_special_tokens=True)
print(output)


## TODO 
- Extend with Stage-2 multi-step prompts.
- Log outputs to Drive for qualitative comparison.
- Integrate bounding-box tool once available.