# Brand Ads OCR — DeepSeek‑OCR 

This notebook fetches ad images (Apple, Samsung, Huawei, Yamaha), runs **DeepSeek‑OCR** via the official Hugging Face Transformers path, and produces a table with predicted **brand** and a short **entity** blurb.

**References:** the DeepSeek‑OCR model card and the repository README, which document the tested environment and the inference API used below.

- Hugging Face model: `deepseek-ai/DeepSeek-OCR`  
- GitHub README (**Transformers-Inference** section): uses `AutoTokenizer/AutoModel(..., trust_remote_code=True)` and `model.infer(...)` with prompts like `"<image>\nFree OCR."`.

> A CUDA GPU is strongly recommended (the referenced environment uses CUDA 11.8). Running on CPU may be very slow or not supported at all.


## 0) Install (run on your machine / Colab)
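Use the package versions pinned in the model card/README, and adjust the PyTorch CUDA wheel to match your system if needed. The notebook's own cells additionally import `pandas`, `requests`, `beautifulsoup4` (`bs4`) and `Pillow` (`PIL`).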

## 1) Setup: paths & helpers

In [None]:
import os, re, io, csv, time, json, glob, pathlib
from typing import Optional
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image

# Working-directory layout: everything the notebook reads or writes lives
# under ./data, relative to the current working directory.
ROOT = pathlib.Path('.')
DATA = ROOT.joinpath('data')
IMAGES = DATA.joinpath('images')
OUT = DATA.joinpath('outputs')
OCR_JSON = OUT.joinpath('ocr_json')

# Create the whole tree up front; existing directories are left untouched.
for _directory in (DATA, IMAGES, OUT, OCR_JSON):
    _directory.mkdir(parents=True, exist_ok=True)

def slug(s: str) -> str:
    """Return a filename-friendly form of *s*.

    Runs of ASCII letters and digits are kept as they are; whatever lies
    between two runs becomes a single underscore, and nothing is left at
    either end of the result.
    """
    pieces = re.split(r'[^a-zA-Z0-9]+', s)
    # Splitting yields empty strings at the edges when *s* starts or ends
    # with a separator; dropping them is what trims the underscores.
    return '_'.join(piece for piece in pieces if piece)

def og_image(page_url: str) -> Optional[str]:
    """Return the URL of a page's Open Graph image, if it declares one.

    Downloads *page_url*, parses the HTML and reads the ``content``
    attribute of its ``<meta property="og:image">`` tag.

    Args:
        page_url: Address of the HTML page to inspect.

    Returns:
        The image URL, resolved against the final page URL when the tag
        holds a relative reference, or ``None`` when the request fails, the
        server answers with an HTTP error status, or no non-empty
        ``og:image`` tag is present. Failures are printed, never raised.
    """
    # Function-scope import so the block does not depend on edits elsewhere.
    from urllib.parse import urljoin

    try:
        resp = requests.get(page_url, timeout=20)
        # Refuse to scrape 4xx/5xx bodies: an error page may carry no
        # og:image at all, or a site-wide default that is not the ad.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        tag = soup.find('meta', property='og:image')
        content = (tag.get('content') or '').strip() if tag else ''
        if content:
            # Absolute URLs pass through unchanged; relative ones are
            # resolved against the URL reached after any redirects.
            return urljoin(resp.url, content)
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup inside a batch
        # loop, so any failure is reported and turned into "no image".
        print('OG image error:', page_url, e)
    return None

## 2) Manifest 

In [None]:
# Seed the campaign manifest (data/manifest.csv) on first run only; an
# existing file is never overwritten, so manual edits to it are preserved.
manifest_path = DATA / 'manifest.csv'
if not manifest_path.exists():
    # One dict per campaign. `brand`, `title` and `page_url` are read by the
    # image-fetch cell; `medium`, `year` and `region` are only stored here,
    # and an empty string marks a value that was left blank.
    rows = [
        # Apple (5)
        {"brand":"Apple","title":"Come Rain or Come Shine","medium":"Film","year":"","region":"","page_url":"https://www.adsoftheworld.com/campaigns/come-rain-or-come-shine"},
        {"brand":"Apple","title":"Shot on iPhone","medium":"OOH/Print/Film","year":"2019","region":"US","page_url":"https://www.adsoftheworld.com/campaigns/shot-on-iphone"},
        {"brand":"Apple","title":"Your next computer is not a computer","medium":"Film/Digital","year":"2021","region":"US","page_url":"https://www.adsoftheworld.com/campaigns/your-next-computer-is-not-a-computer"},
        {"brand":"Apple","title":"Mac goes to College","medium":"Film/Digital","year":"2025","region":"US","page_url":"https://www.adsoftheworld.com/campaigns/mac-goes-to-college"},
        {"brand":"Apple","title":"Dear Apple","medium":"Film/Digital","year":"2022","region":"US","page_url":"https://www.adsoftheworld.com/campaigns/dear-apple"},
        # Samsung (5)
        {"brand":"Samsung","title":"The Real Upgrade","medium":"Film/Digital","year":"","region":"","page_url":"https://www.adsoftheworld.com/campaigns/the-real-upgrade"},
        {"brand":"Samsung","title":"No reflections","medium":"Print","year":"2020","region":"Germany","page_url":"https://www.adsoftheworld.com/campaigns/no-reflections"},
        {"brand":"Samsung","title":"Experience the wonder with the Galaxy","medium":"Film/Digital","year":"2021","region":"Mongolia","page_url":"https://www.adsoftheworld.com/campaigns/experience-the-wonder-with-the-galaxy"},
        {"brand":"Samsung","title":"Big News","medium":"Film/Digital","year":"2023","region":"Sweden","page_url":"https://www.adsoftheworld.com/campaigns/big-news"},
        {"brand":"Samsung","title":"For life with you","medium":"Film/Digital","year":"2025","region":"Brazil","page_url":"https://www.adsoftheworld.com/campaigns/for-the-life-with-you"},
        # Huawei (5)
        {"brand":"Huawei","title":"World, Unfolded.","medium":"Film","year":"2025","region":"","page_url":"https://www.adsoftheworld.com/campaigns/huawei-world-unfolded"},
        {"brand":"Huawei","title":"Giant Smart","medium":"Print/OOH","year":"2025","region":"AU/BR/FR","page_url":"https://www.adsoftheworld.com/campaigns/huawei-giant-smart"},
        {"brand":"Huawei","title":"Welcome to the Huawei AppGallery","medium":"Digital/Film","year":"2020","region":"Israel","page_url":"https://www.adsoftheworld.com/campaigns/welcome-to-the-huawei-appgallery"},
        {"brand":"Huawei","title":"The Unofficial Smartphone","medium":"Film/Digital","year":"2018","region":"Mexico","page_url":"https://www.adsoftheworld.com/campaigns/the-unofficial-smartphone"},
        {"brand":"Huawei","title":"Inspiration In Bloom","medium":"OOH/Print","year":"2025","region":"BR/FR/UK","page_url":"https://www.adsoftheworld.com/campaigns/huawei-inspiration-in-bloom"},
        # Yamaha (5)
        {"brand":"Yamaha","title":"Reveal","medium":"Print/OOH","year":"2014","region":"Italy","page_url":"https://www.adsoftheworld.com/campaigns/reveal"},
        {"brand":"Yamaha","title":"Off-ROAD MILKSHAKE","medium":"Activation/Film","year":"2025","region":"","page_url":"https://www.adsoftheworld.com/campaigns/off-road-milkshake"},
        {"brand":"Yamaha","title":"Neurons","medium":"Print","year":"2017","region":"Italy","page_url":"https://www.adsoftheworld.com/campaigns/neurons"},
        {"brand":"Yamaha","title":"Balls","medium":"Print","year":"2016","region":"Israel","page_url":"https://www.adsoftheworld.com/campaigns/balls-74ffa437-ae4d-4cd2-b8e7-b8d6596db2e4"},
        {"brand":"Yamaha","title":"Paper City","medium":"Print","year":"2011","region":"Italy","page_url":"https://www.adsoftheworld.com/campaigns/paper-city-31cdab6b-2bea-4394-be46-0dd3ffcba459"}
    ]
    # index=False keeps the CSV columns identical to the dict keys above.
    pd.DataFrame(rows).to_csv(manifest_path, index=False)
# Bare expression: shown as the cell's output when run in a notebook.
manifest_path

## 3) Fetch hero images via `og:image`

In [None]:
# Download one hero image per manifest row into IMAGES and collect the local
# paths in `downloaded`. Files are always named "<brand>_<title-slug>.jpg",
# whatever format the server actually returns.
df = pd.read_csv(manifest_path)
downloaded = []
for _, row in df.iterrows():
    name = f"{row['brand']}_{slug(row['title'])}.jpg"
    path = IMAGES / name
    # Check the local cache before touching the network: an image that is
    # already on disk needs neither the campaign page nor the image request.
    if path.exists():
        downloaded.append(str(path))
        continue
    img_url = og_image(row['page_url'])
    if not img_url:
        print('No image', row['page_url'])
        continue
    try:
        r = requests.get(img_url, timeout=30)
        r.raise_for_status()
        # write_bytes opens, writes and closes the file in one step, so no
        # handle is left dangling.
        path.write_bytes(r.content)
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next row.
        print('Download error', img_url, e)
    else:
        downloaded.append(str(path))
        # Short pause between successful downloads to go easy on the server.
        time.sleep(0.7)
# Bare expression: shown as the cell's output when run in a notebook.
len(downloaded), downloaded[:3]

## 4) DeepSeek‑OCR 
We use the **official pattern**: `AutoTokenizer/AutoModel(..., trust_remote_code=True)` and `model.infer(...)` with a plain OCR prompt. See model card/README for the exact versions and examples.

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image

# Hub identifier of the checkpoint and the device it will run on.
model_name = 'deepseek-ai/DeepSeek-OCR'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Both loaders need trust_remote_code=True: the checkpoint ships its own
# modelling code, which is also where the custom `.infer(...)` helper lives.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_safetensors=True,
    _attn_implementation='flash_attention_2',
).eval()

# On a GPU, cast the weights to bfloat16 first and then move them over.
if device == 'cuda':
    model = model.to(torch.bfloat16)
    model = model.cuda()

# Plain-OCR prompt taken from the README examples; it works well for ads and
# key visuals. A layout-grounded alternative (often better for posters/PDFs):
#   "<image>\n<|grounding|>Convert the document to markdown."
OCR_PROMPT = "<image>\nFree OCR."

def deepseek_ocr_image(image_path: str) -> str:
    """Run DeepSeek-OCR on a single image file and return the text it reads.

    Calls the model's custom ``infer`` helper with the module-level
    ``OCR_PROMPT`` and normalises whatever comes back to a plain string.

    Args:
        image_path: Path of the image file handed to ``model.infer``.

    Returns:
        The recognised text, or ``''`` when the helper returns nothing.
    """
    res = model.infer(
        tokenizer,
        prompt=OCR_PROMPT,
        image_file=image_path,
        output_path=str(OUT),
        base_size=1024,
        image_size=640,
        crop_mode=True,
        save_results=False,
        test_compress=True,
        # NOTE(review): in the published remote code, `infer` only returns
        # the decoded text when eval_mode=True; otherwise it streams the text
        # to stdout and returns None, which made this function yield ''
        # for every image. Confirm against the pinned model revision.
        eval_mode=True,
    )
    if res is None:
        return ''
    # Some builds return a dict, others a string; normalise to str.
    if isinstance(res, dict):
        return res.get('text', '') or res.get('output', '') or ''
    return str(res)

print('DeepSeek‑OCR loaded on', device)

## 5) Brand rules & entity heuristic

In [None]:
import re
# Canonical brand keys, in lower case.
CANON = ["apple", "samsung", "huawei", "yamaha"]

# Regular expressions, keyed by brand, written to be matched against
# lower-cased text.
ALIASES = {
    "apple": [
        r"\biphone\b",
        r"\bipad\b",
        r"\bmac\b",
        r"\bapple watch\b",
        r"\bairpods\b",
        r"\bapple\b",
    ],
    "samsung": [
        r"\bgalaxy\b",
        r"\bsamsung\b",
        r"\bz flip\b",
        r"\bs24\b",
        r"\bnote\b",
    ],
    "huawei": [
        r"\bhuawei\b",
        r"\bmate\b",
        r"\bpura\b",
        r"\bnova\b",
    ],
    "yamaha": [
        r"\byamaha\b",
        r"\byzf\b",
        r"\bfz\b",
        r"\br1\b",
        r"\bmt-0?\d\b",
        r"\brevstar\b",
        r"\bclavinova\b",
    ],
}

# Lower-case words that name a brand or one of its products: the canonical
# brand keys plus a fixed list of product words.
BLACKLIST = set(CANON).union((
    "iphone", "ipad", "mac", "airpods", "watch",
    "galaxy",
    "mate", "pura", "nova",
    "yzf", "fz", "r1",
))

def detect_brand(ocr_text: str, aliases: Optional[dict] = None) -> str:
    """Guess which brand an OCR transcript belongs to.

    The text is lower-cased and tested against each brand's patterns in the
    mapping's iteration order; the first brand with any matching pattern
    wins.

    Args:
        ocr_text: Raw OCR output. ``None``, non-string values and the empty
            string are treated as "nothing recognised".
        aliases: Optional mapping of lower-case brand key to a list of
            regular expressions. Defaults to the module-level ``ALIASES``.

    Returns:
        The matching brand key with its first letter capitalised, or
        ``"Unknown"`` when nothing matches.
    """
    if aliases is None:
        aliases = ALIASES
    # Guard against missing text instead of failing on `.lower()`.
    if not isinstance(ocr_text, str) or not ocr_text:
        return "Unknown"
    t = ocr_text.lower()
    for brand, pats in aliases.items():
        if any(re.search(p, t) for p in pats):
            return brand.capitalize()
    return "Unknown"

def short_entity(ocr_text: str, max_words: int = 6, blacklist: Optional[set] = None) -> str:
    """Build a short blurb from the first usable words of an OCR transcript.

    The text is lower-cased and split into runs of ASCII letters. Words that
    are in *blacklist*, shorter than 3 letters or longer than 12 letters are
    dropped, and the first *max_words* survivors are joined with spaces.

    Args:
        ocr_text: Raw OCR output. ``None`` and non-string values are treated
            as empty text.
        max_words: Maximum number of words in the blurb. Zero or a negative
            value yields the fallback blurb.
        blacklist: Optional collection of lower-case words to exclude.
            Defaults to the module-level ``BLACKLIST``.

    Returns:
        The blurb, or ``"generic scene"`` when no word survives.
    """
    if blacklist is None:
        blacklist = BLACKLIST
    # A negative limit would otherwise slice words off the END of the list.
    if not isinstance(ocr_text, str) or max_words <= 0:
        return "generic scene"
    words = re.findall(r"[a-zA-Z]+", ocr_text.lower())
    keep = [w for w in words if w not in blacklist and 3 <= len(w) <= 12]
    return " ".join(keep[:max_words]) or "generic scene"

## 6) Run OCR over images → JSON dumps

In [None]:
# OCR every downloaded .jpg once. Each result is cached as
# <image stem>.json under OCR_JSON, so re-running the cell only processes
# images that have no dump yet.
records = []
for img_path in sorted(IMAGES.glob('*.jpg')):
    base = img_path.stem
    out = OCR_JSON / f"{base}.json"
    if out.exists():
        # Reuse the cached result instead of running the model again.
        with open(out, encoding='utf-8') as fh:
            rec = json.load(fh)
    else:
        text = deepseek_ocr_image(str(img_path))
        rec = {"image": base, "text": text}
        # The context manager flushes and closes the dump before the next
        # image is processed, so an interrupted run cannot leave a file that
        # is still buffered in an open handle.
        with open(out, 'w', encoding='utf-8') as fh:
            json.dump(rec, fh)
    records.append(rec)
# Bare expression: shown as the cell's output when run in a notebook.
len(records), records[:2] if records else []

## 7) Predictions table

In [None]:
def _prediction_row(rec):
    """Turn one OCR record into a row of the predictions table."""
    ocr = rec.get('text', '')  # records without a 'text' key count as empty
    return {
        'image': rec['image'],
        'brand_pred': detect_brand(ocr),
        'entity': short_entity(ocr),
        'ocr_text': ocr,
    }


# One row per OCR record, written to predictions.csv in the outputs folder.
pred_rows = [_prediction_row(rec) for rec in records]
pred_df = pd.DataFrame(pred_rows)
pred_csv = OUT / 'predictions.csv'
pred_df.to_csv(pred_csv, index=False)
# Bare expression: shown as the cell's output when run in a notebook.
pred_df.head(10)