In [5]:
!pip install transformers datasets rouge_score scikit-learn evaluate --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import os
import json
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate


In [7]:
from google.colab import drive
drive.mount('/content/drive')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
def extract_contrast_violations_from_folder(folder_path):
    examples = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                for vp in data.get("viewports", []):
                    for c in vp.get("contrast", []):
                        role = c.get("role", "unknown")
                        fg = c.get("fg", [0, 0, 0])
                        bg = c.get("bg", [255, 255, 255])
                        contrast_val = c.get("contrast", 1.0)

                        fg_str = ",".join(map(str, fg))
                        bg_str = ",".join(map(str, bg))
                        input_str = f"role: {role}, fg: {fg_str}, bg: {bg_str}, contrast: {contrast_val:.2f}"
                        output_str = f"The {role} element has a foreground color of RGB({fg_str}) and background color of RGB({bg_str}) resulting in a contrast ratio of {contrast_val:.2f}."
                        if contrast_val < 4.5:
                            output_str += " This is below the WCAG recommended minimum of 4.5:1 for normal text."

                        examples.append({
                            "input": input_str,
                            "output": output_str
                        })
            except Exception as e:
                print(f"[ERROR] Failed to process {file_name}: {e}")
    return pd.DataFrame(examples)

In [9]:
from sklearn.model_selection import train_test_split

# Change this to your actual folder path
folder_path = "/content/drive/MyDrive/json_dataset_for_agents"
df = extract_contrast_violations_from_folder(folder_path)
print(f"Loaded {len(df)} contrast violation examples.")
df.head()

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Loaded 106636 contrast violation examples.
Train size: 85308, Test size: 21328


In [10]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

def tokenize(batch):
    inputs = tokenizer(batch['input'], padding='max_length', truncation=True, max_length=64)
    targets = tokenizer(batch['output'], padding='max_length', truncation=True, max_length=64)
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = targets.input_ids
    return batch

dataset = Dataset.from_pandas(train_df)
dataset = dataset.map(tokenize, batched=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/85308 [00:00<?, ? examples/s]

In [11]:
training_args = TrainingArguments(
    output_dir="./contrast_violator",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,6.0637
20,3.7111
30,2.5631
40,1.917
50,1.3762
60,1.0364
70,0.7933
80,0.5807
90,0.4115
100,0.3458


TrainOutput(global_step=31992, training_loss=0.007325785867487843, metrics={'train_runtime': 2967.2678, 'train_samples_per_second': 86.249, 'train_steps_per_second': 10.782, 'total_flos': 4329651899990016.0, 'train_loss': 0.007325785867487843, 'epoch': 3.0})

In [12]:
model.eval()

def generate_batch_descriptions(inputs, max_len=48, batch_size=16):
    results = []
    with torch.no_grad():
        for i in range(0, len(inputs), batch_size):
            batch = inputs[i:i+batch_size]
            encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len).to(device)
            outputs = model.generate(**encoded, max_length=max_len)
            decoded = [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]
            results.extend(decoded)
    return results

inputs = test_df['input'].tolist()
references = test_df['output'].tolist()

predictions = generate_batch_descriptions(inputs)
rouge = evaluate.load("rouge")
rouge_result = rouge.compute(predictions=predictions, references=references)

exact_match = sum([p.strip() == r.strip() for p, r in zip(predictions, references)]) / len(references)

print(f"\n✅ ROUGE-1: {rouge_result['rouge1']:.4f}")
print(f"✅ ROUGE-L: {rouge_result['rougeL']:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match:.4f}")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


✅ ROUGE-1: 0.9701
✅ ROUGE-L: 0.9701
✅ Exact Match Accuracy: 0.7826


In [16]:
model.save_pretrained("./contrast_violator")
tokenizer.save_pretrained("./contrast_violator")


('./contrast_violator/tokenizer_config.json',
 './contrast_violator/special_tokens_map.json',
 './contrast_violator/spiece.model',
 './contrast_violator/added_tokens.json')

In [14]:
def generate_violation_description(input_str, max_len=64):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer.encode(input_str, return_tensors="pt", truncation=True, max_length=max_len).to(device)
        outputs = model.generate(input_ids, max_length=max_len)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [15]:
example = "role: link, fg: 120,120,120, bg: 255,255,255, contrast: 2.9"
print("Generated:", generate_violation_description(example))


Generated: The link element has a foreground color of RGB(120,120,120) and background color of RGB(255,255,255) resulting in a contrast ratio of 2.9. This is below the WCAG recommended minimum of 4.5:1 for normal text.


In [17]:
!zip -r contrast_violator.zip ./contrast_violator


  adding: contrast_violator/ (stored 0%)
  adding: contrast_violator/special_tokens_map.json (deflated 85%)
  adding: contrast_violator/spiece.model (deflated 48%)
  adding: contrast_violator/config.json (deflated 63%)
  adding: contrast_violator/model.safetensors (deflated 11%)
  adding: contrast_violator/added_tokens.json (deflated 83%)
  adding: contrast_violator/checkpoint-31992/ (stored 0%)
  adding: contrast_violator/checkpoint-31992/rng_state.pth (deflated 25%)
  adding: contrast_violator/checkpoint-31992/scheduler.pt (deflated 56%)
  adding: contrast_violator/checkpoint-31992/config.json (deflated 63%)
  adding: contrast_violator/checkpoint-31992/model.safetensors (deflated 11%)
  adding: contrast_violator/checkpoint-31992/training_args.bin (deflated 52%)
  adding: contrast_violator/checkpoint-31992/optimizer.pt (deflated 7%)
  adding: contrast_violator/checkpoint-31992/trainer_state.json (deflated 82%)
  adding: contrast_violator/checkpoint-31992/generation_config.json (deflat

In [18]:
!cp -r ./contrast_violator "/content/drive/My Drive/contrast_violator"


In [19]:
!pip install -q huggingface_hub
from huggingface_hub import notebook_login

notebook_login()  # This will prompt you to paste your HF token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
from huggingface_hub import create_repo, HfApi
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Change this to your desired repo name
repo_name = "contrast-violation-t5"

# Create the repo (set private=True if you want it private)
create_repo(repo_name, exist_ok=True)

# Push to hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/virajns2/contrast-violation-t5/commit/38b93f5e2852370733f7e3d9ade43bfe5be409c1', commit_message='Upload tokenizer', commit_description='', oid='38b93f5e2852370733f7e3d9ade43bfe5be409c1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/virajns2/contrast-violation-t5', endpoint='https://huggingface.co', repo_type='model', repo_id='virajns2/contrast-violation-t5'), pr_revision=None, pr_num=None)