## OCR Images

In [None]:
import requests
import json
import os

def detect_text_with_api_key_b64(encoded_image, api_key, timeout=60):
    """Run TEXT_DETECTION on one base64-encoded image via the Vision REST endpoint.

    Args:
        encoded_image: Base64 string sent as the request's ``image.content``.
        api_key: API key appended to the URL as the ``key`` query parameter.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error. Without it a stalled connection would block
            forever.

    Returns:
        The decoded JSON body of the HTTP response. The HTTP status is not
        checked here, so an error payload returned by the service is handed
        back to the caller as-is.
    """
    url = f"https://vision.googleapis.com/v1/images:annotate?key={api_key}"
    payload = {
        "requests": [
            {
                "image": {"content": encoded_image},
                "features": [{"type": "TEXT_DETECTION"}],
            }
        ]
    }
    # json= serializes the payload and sets the JSON Content-Type header.
    response = requests.post(url, json=payload, timeout=timeout)
    return response.json()

def ocr(in_path, out_path, api_key):
    """OCR every image listed in a JSONL file and write one result line per record.

    Each non-blank line of ``in_path`` is parsed as a JSON object with a
    ``"url"`` and an ``"images"`` list. For every image entry:

    * if it has an ``"error"`` key, that error is copied to the output;
    * otherwise its ``"data_base64"`` value is sent to
      ``detect_text_with_api_key_b64`` and the response is stored under
      ``"ocr"``.

    One JSON object ``{"course_url": ..., "questions": [...]}`` is written to
    ``out_path`` per input record. Blank lines in the input are skipped.

    Args:
        in_path: Path of the UTF-8 JSONL input file.
        out_path: Path of the UTF-8 JSONL output file (overwritten).
        api_key: API key forwarded to ``detect_text_with_api_key_b64``.
    """
    with open(in_path, "r", encoding="utf-8") as src, open(out_path, "w", encoding="utf-8") as dst:
        for raw in src:
            if raw.strip():
                record = json.loads(raw)
                course_url = record["url"]

                entries = []
                for image in record["images"]:
                    entry = {"question": image.get("question", "")}
                    if "error" not in image:
                        entry["ocr"] = detect_text_with_api_key_b64(image["data_base64"], api_key)
                    else:
                        # Carry the recorded failure through instead of calling the API.
                        entry["error"] = image["error"]
                    entries.append(entry)

                serialized = json.dumps(
                    {"course_url": course_url, "questions": entries},
                    ensure_ascii=False,
                )
                dst.write(serialized + "\n")

In [None]:
# Run OCR over the collected image records; the API key is read from the
# GOOGLE_API_KEY environment variable.
ocr(
    "raw_data/image_bytes.jsonl",
    "raw_data/ocr_results.jsonl",
    api_key=os.getenv("GOOGLE_API_KEY"),
)