# Get all article data from The Batch

In [None]:
import json

def read_json_file(file_path):
    """Decode the UTF-8 JSON document stored at ``file_path`` and return it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

# Relative path, so it is resolved against the notebook's working directory.
file_path = "raw_data.json"
json_data = read_json_file(file_path)

In [None]:
# Sanity check: how many entries the loaded "documents" list holds.
len(json_data['documents'])

2080

In [None]:
import json
from bs4 import BeautifulSoup
import uuid

In [None]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/296.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imagehash
Successfully installed imagehash-4.3.2


In [None]:
import requests
from PIL import Image
import imagehash
from io import BytesIO

def compare_images_by_hash(url1, url2, timeout=10):
    """Download two images and report whether their average hashes are equal.

    Args:
        url1 (str): URL of the first image.
        url2 (str): URL of the second image.
        timeout (float): Seconds to wait on each HTTP request before giving up.

    Returns:
        bool: True when both images produce the same average hash.

    Raises:
        requests.HTTPError: If either download answers with an error status.
    """
    def _average_hash(url):
        # Without an explicit timeout `requests.get` may block indefinitely
        # on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Surface 4xx/5xx responses instead of handing an error page to PIL.
        response.raise_for_status()
        return imagehash.average_hash(Image.open(BytesIO(response.content)))

    return _average_hash(url1) == _average_hash(url2)

### Extract all images to one JSON file

In [None]:
def extract_images_to_json(data):
    """Build a per-article record of metadata, cleaned text and image references.

    For each article in ``data["documents"]`` the HTML in ``content`` is parsed
    once: ``<img>`` tags are read, ``<script>``/``<style>`` elements are removed
    and the remaining text is flattened with single-space separators. The
    feature image (when it is listed in the article's ``images``) is placed
    first, followed by inline images in document order; every URL is emitted
    at most once.

    Args:
        data (dict): Mapping with a ``"documents"`` list of article dicts.

    Returns:
        dict: ``{"documents": [...]}``; each entry carries ``title``,
        ``summary``, ``authors``, ``tags``, ``published_at``, ``article_url``,
        ``media_type``, ``full_text`` and ``images`` (dicts with ``url``,
        ``alt_text`` and an empty ``description``).
    """
    output = {"documents": []}

    for article in data["documents"]:
        # `or ""` also covers an explicit `"content": None`, which would
        # otherwise make the parser raise.
        soup = BeautifulSoup(article.get("content") or "", "html.parser")

        # Read the inline images before anything is decomposed, so one parse
        # serves both the image scan and the text extraction.
        inline_images = [
            (img.get("src"), img.get("alt", "")) for img in soup.find_all("img")
        ]

        for element in soup(["script", "style"]):
            element.decompose()

        cleaned_content = soup.get_text(separator=" ", strip=True)

        # Extract text data
        article_data = {
            "title": article.get("title", ""),
            "summary": article.get("summary", ""),
            "authors": article.get("authors", []),
            "tags": article.get("tags", []),
            "published_at": article.get("published_at", ""),
            "article_url": article.get("article_url", ""),
            "media_type": article.get("media_type", []),
            "full_text": cleaned_content,
            "images": [],
        }
        images = article_data["images"]
        # URLs already emitted for this article; a set keeps the duplicate
        # check O(1) instead of rebuilding a list for every <img>.
        seen_urls = set()

        # Extract feature image from JSON. Entries without a "url" key are
        # skipped rather than raising KeyError.
        feature_image = article.get("feature_image")
        feature_image_data = None
        if feature_image:
            feature_image_data = next(
                (
                    img
                    for img in article.get("images") or []
                    if img.get("url") == feature_image
                ),
                None,
            )
        if feature_image_data:
            images.append({
                "url": feature_image_data["url"],
                "alt_text": feature_image_data.get("alt", ""),
                "description": "",
            })
            seen_urls.add(feature_image_data["url"])

        # Inline images, skipping missing/empty `src` and repeated URLs.
        for img_url, img_alt in inline_images:
            if img_url and img_url not in seen_urls:
                seen_urls.add(img_url)
                images.append({
                    "url": img_url,
                    "alt_text": img_alt,
                    "description": "",
                })

        output["documents"].append(article_data)

    return output

In [None]:
# Run the extraction over every loaded article; later cells read this result.
extract_data_for_processing = extract_images_to_json(json_data)

In [None]:
# Peek at the first image URL of the first extracted article.
url = extract_data_for_processing['documents'][0]['images'][0]['url']
url

'https://dl-staging-website.ghost.io/content/images/2025/06/unnamed---2025-06-04T165354.442-1.png'

# Get all using gemini

### Describe images for RAG

In [None]:
!pip install groq

Collecting groq
  Downloading groq-0.26.0-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.26.0-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.6/129.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.26.0


In [None]:
from groq import Groq

import os

# Read the key from the environment so no credential is stored in the notebook.
# Set GROQ_API_KEY before running this cell.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

def describe_image_with_groq(image_url: str) -> str:
    """
    Ask the Groq-hosted LLaMA-4 Scout model for a one-paragraph description
    and short summary of the image behind ``image_url``.

    Args:
        image_url (str): A public URL to the image.

    Returns:
        str: The model's reply with surrounding whitespace stripped.
    """
    instructions = "".join((
        "You are an expert editor working on educational content for an AI blog. ",
        "Given this image, generate:\n",
        "A detailed but clear description of what the image shows, suitable for documentation or captions.\n",
        "A short summary (1–2 sentences) that explains the significance of the image in plain English, as if summarizing for a newsletter.\n",
        "Use concise, accurate, and professional language.\n",
        "Write all important information that the image contains.\n",
        "No more than 150 tokens.\n",
        "Structure information in one paragraph.",
    ))

    # The user turn carries an empty text part plus the image reference.
    text_part = {"type": "text", "text": ""}
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": [text_part, image_part]},
    ]

    completion = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=conversation,
        temperature=1,
        max_completion_tokens=300,
        top_p=1,
        stream=False,
    )

    reply = completion.choices[0].message.content
    return reply.strip()

def describe_text_with_groq(title, summary, full_text, tags):
    """
    Generates a one-paragraph text description of an article using the Groq
    LLaMA-4 Scout model, based on the article's metadata.

    Args:
        title (str): Article title.
        summary (str): Article summary.
        full_text (str): Cleaned article content.
        tags (list | str): Article tags, either as a list or as an
            already comma-joined string.

    Returns:
        str: A paragraph with the text description, or empty string on failure.
    """
    # Each fragment ends with a separator: adjacent literals are concatenated
    # verbatim, so a missing space or newline glues two sentences together.
    system_prompt = (
        "You are an expert editor working on educational content for an AI blog. "
        "Given textual metadata about an article, write a description "
        "suitable for documentation or captions in a Retrieval-Augmented Generation (RAG) system. "
        "Base the description on the article’s title, summary, full text, tags, and alt text, inferring visual elements relevant to the article’s theme (e.g., AI, technology, research). "
        "Use concise, accurate, and professional language. Structure the description as one paragraph. Limit to 150 tokens. "
        "No more than 150 tokens.\n"
        "Structure information in one paragraph. "
        "Write all important information that the text contains.\n"
    )

    # Joining a plain string would interleave ", " between its characters,
    # so a pre-joined string is used as-is.
    if isinstance(tags, str):
        tags_text = tags
    else:
        tags_text = ", ".join(str(tag) for tag in tags or [])

    user_prompt = f"""**Input**:
- Article Title: {title}
- Article Summary: {summary}
- Full Text: {full_text}
- Tags: {tags_text}

**Output**:
A single paragraph describing the article.
"""

    try:
        response = client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=1,
            max_completion_tokens=300,
            top_p=1,
            stream=False
        )

        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: report the failure and hand back an empty description
        # so a long batch run is not interrupted by one bad request.
        print(f"Error generating description with Groq: {e}")
        return ""

In [None]:
# Smoke-test the text describer on the first extracted article.
first_article = extract_data_for_processing['documents'][0]
title = first_article['title']
summary = first_article['summary']
# Keep the tags as a list: describe_text_with_groq joins them itself, and
# joining here first would hand it a string to split character by character.
tags = first_article['tags']
full_text = first_article['full_text']

describe_text_with_groq(title=title, summary=summary, tags=tags, full_text=full_text)

'Here is a paragraph describing the article in 150 tokens or less:\n\nColumbia University researchers have found a way to trick AI agents with poisoned links, exploiting their implicit trust in popular websites. By crafting malicious posts on sites like Reddit, attackers can mislead agents into divulging sensitive information or taking harmful actions. In tests, agents consistently followed instructions on malicious websites, revealing credit card details and sending phishing emails. This highlights the need for more secure agent design to resist such manipulation and ensure safer online interactions.'

In [None]:
!pip install tqdm



### OpenAI function to create descriptions for GIF images (Llama can't process .gif)

In [None]:
from openai import OpenAI

def describe_image_with_open_ai(image_url: str) -> str:
    """
    Generates a detailed image description and short summary using the
    OpenAI ``gpt-4o`` model.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Args:
        image_url (str): A public URL to the image.

    Returns:
        str: A paragraph with description and summary.
    """
    # Local import keeps this cell runnable on its own.
    import os

    system_prompt = (
        "You are an expert editor working on educational content for an AI blog. "
        "Given this image, generate:\n"
        "A detailed but clear description of what the image shows, suitable for documentation or captions.\n"
        "A short summary (1–2 sentences) that explains the significance of the image in plain English, as if summarizing for a newsletter.\n"
        "Use concise, accurate, and professional language.\n"
        "Write all important information that the image contains.\n"
        "No more than 150 tokens.\n"
        "Structure information in one paragraph."
    )

    # Never keep the credential in the notebook; take it from the environment.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Write description to the image."},
                    {"type": "image_url", "image_url": {"url": image_url}}
                ]
            }
        ],
        temperature=1.0,
        max_tokens=300,
        top_p=1.0
    )

    return response.choices[0].message.content.strip()

# Get all descriptions


Create descriptions using the Llama 4 model (long-running process)


In [None]:
from tqdm import tqdm

# Index of the first document get_data_descriptions handles; earlier documents are skipped.
start_from = 0

def get_data_descriptions(data, checkpoint_every=50, max_consecutive_errors=10):
    """Generate a text description and per-image descriptions for each article.

    Documents are taken from ``data["documents"]`` starting at the module-level
    ``start_from`` index. Results are written to
    ``intermediate_output_<n>.json`` every ``checkpoint_every`` documents and
    once more when the loop finishes.

    Args:
        data (dict): Mapping with a ``"documents"`` list, in the shape produced
            by ``extract_images_to_json``.
        checkpoint_every (int): Write a checkpoint each time this many
            documents (counted by absolute index) have been handled;
            0 disables the periodic checkpoints.
        max_consecutive_errors (int): Number of back-to-back image failures
            within one article that are tolerated; the next failure aborts.

    Returns:
        dict: ``{"documents": [...]}`` with ``title``, ``summary``, ``authors``,
        ``tags``, ``article_url``, ``text_description`` and ``images_metadata``
        per article. On abort, only the articles completed so far.
    """
    documents = data["documents"]
    output = {"documents": []}

    for idx, article in enumerate(tqdm(documents[start_from:]), start=start_from):
        # Extract text data
        article_data = {
            "title": article.get("title", ""),
            "summary": article.get("summary", ""),
            "authors": article.get("authors", []),
            "tags": article.get("tags", []),
            "article_url": article.get("article_url", ""),
            "text_description": "",
            "images_metadata": [],
        }

        # Reuse the defaulted values so a missing key cannot raise KeyError.
        article_data['text_description'] = describe_text_with_groq(
            title=article_data["title"],
            summary=article_data["summary"],
            full_text=article.get("full_text", ""),
            tags=article_data["tags"],
        )

        # Extract feature image
        feature_image = article.get("feature_image")
        feature_image_data = next(
            (img for img in article.get("images", []) if img.get("url") == feature_image),
            None,
        )
        if feature_image_data:
            description = describe_image_with_groq(feature_image_data["url"])
            article_data["images_metadata"].append({
                "url": feature_image_data["url"],
                "alt_text": feature_image_data.get("alt", ""),
                "description": description
            })

        # Failures in a row for this article; any success resets the streak.
        consecutive_errors = 0
        for img in article.get("images", []):
            img_url = img.get("url")
            print('check', img_url)
            try:
                if img_url:
                    # GIFs go to the OpenAI helper, everything else to Groq.
                    if img_url.lower().endswith(".gif"):
                        description = describe_image_with_open_ai(img_url)
                    else:
                        description = describe_image_with_groq(img_url)
                    print('temp')

                    article_data["images_metadata"].append({
                        "url": img_url,
                        "description": description
                    })

                    consecutive_errors = 0
            except Exception as e:
                # NOTE: aborting returns what was collected so far; the article
                # in progress is not included and nothing is written to disk.
                if consecutive_errors == max_consecutive_errors:
                    return output
                consecutive_errors += 1
                print(f"Error generating description to image: {e}")
                continue

        output["documents"].append(article_data)

        if checkpoint_every and (idx + 1) % checkpoint_every == 0:
            _write_checkpoint(output, idx + 1)

        print("Continue Progres")

    # Name the final file after the real document count instead of a
    # hard-coded total, so smaller runs are labelled correctly.
    _write_checkpoint(output, len(documents))

    return output


def _write_checkpoint(output, processed):
    # Dump the results gathered so far to intermediate_output_<processed>.json.
    with open(f"intermediate_output_{processed}.json", "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"Saved intermediate output after {processed} documents")


In [None]:
# Long-running: get_data_descriptions calls the text describer once per article and an image describer per image.
test = get_data_descriptions(extract_data_for_processing)

  0%|          | 0/30 [00:00<?, ?it/s]

check https://dl-staging-website.ghost.io/content/images/2021/07/Here-Be-Dragons-1.gif


  3%|▎         | 1/30 [00:05<02:35,  5.35s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Two-Way-Winner-1.gif


  7%|▋         | 2/30 [00:09<02:17,  4.92s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Cancer-in-the-Crosshairs-1.png


 10%|█         | 3/30 [00:11<01:31,  3.40s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Deepfakes-Go-Mainstream-1.png


 13%|█▎        | 4/30 [00:12<01:07,  2.61s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Self--Training-for-Sharper-Vision-1.png


 17%|█▋        | 5/30 [00:14<00:57,  2.28s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/IP-for-AI-1.png


 20%|██        | 6/30 [00:16<00:49,  2.05s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Beyond-the-Bounding-Box-1.png


 23%|██▎       | 7/30 [00:18<00:46,  2.01s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/No-Escape-From-Surveillance-1.png


 27%|██▋       | 8/30 [00:19<00:39,  1.81s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Autonomous-Drones-Ready-to-Race-1.gif


 30%|███       | 9/30 [00:23<00:52,  2.49s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Two-Steps-to-Better-Summaries-1.png


 33%|███▎      | 10/30 [00:25<00:46,  2.31s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2021/07/Seeing-Cancer-1.png


 37%|███▋      | 11/30 [00:27<00:41,  2.18s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/83fed3d3-b632-40db-a0db-8c3adebe2593--1-.png


 40%|████      | 12/30 [00:29<00:38,  2.13s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/ffa2c81d-64a1-4420-adb2-902f77210652--1-.gif


 43%|████▎     | 13/30 [00:34<00:49,  2.93s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/1bb4e782-3240-4d26-9dd8-94ac7513a26e--1-.png


 47%|████▋     | 14/30 [00:36<00:42,  2.63s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/89537a7f-644d-442c-b3fc-2fd8f8840fd6--1-.gif


 50%|█████     | 15/30 [00:40<00:46,  3.07s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/7c91b1fc-aea3-4d3f-bc72-b9f61f8fe563--1-.png


 53%|█████▎    | 16/30 [00:42<00:38,  2.73s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/c680f330-d8bd-4f4d-bcd3-a1107006f693--1-.png


 57%|█████▋    | 17/30 [00:43<00:30,  2.36s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/c9cb6f47-df67-4932-9e45-b44498a9ae3f--1-.png


 60%|██████    | 18/30 [00:45<00:25,  2.11s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/7c80dbca-5e40-4d92-936e-4f24cce1ccf0--1-.gif


 63%|██████▎   | 19/30 [00:52<00:41,  3.76s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/84086bd8-fb32-4342-a502-2fdcd6401767--1-.gif


 67%|██████▋   | 20/30 [00:57<00:40,  4.03s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/02cb5e64-9c27-4a27-8ba7-6946a2276ae5--1-.png


 70%|███████   | 21/30 [00:59<00:30,  3.34s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/2d9d1be7-7aa6-4ee3-997e-c4a841212126--1-.png


 73%|███████▎  | 22/30 [01:01<00:23,  2.93s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/08f19ed5-ed27-4135-a075-ab947d13f00d--1-.jpg


 77%|███████▋  | 23/30 [01:02<00:17,  2.50s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/6c810bee-5620-4525-a0aa-dcf3aea461f0--1-.png


 80%|████████  | 24/30 [01:04<00:13,  2.29s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/066fb5f1-4c1f-49b5-8931-ec6ebe83e3de--1-.png


 83%|████████▎ | 25/30 [01:08<00:13,  2.78s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/50e3df2e-dbe2-4a2b-ab4b-623ace68e880--1--1.png


 87%|████████▋ | 26/30 [01:14<00:14,  3.67s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/60ae2339-9012-4631-82d5-c99f911dacd5--1-.png


 90%|█████████ | 27/30 [01:19<00:12,  4.25s/it]

temp
Continue Progres
check https://dl-staging-website.ghost.io/content/images/2022/10/afe604b0-915a-4e9d-a0c6-5a728f219cc3--1-.png


 93%|█████████▎| 28/30 [01:25<00:09,  4.63s/it]

temp
Continue Progres


 97%|█████████▋| 29/30 [01:27<00:03,  3.86s/it]

Continue Progres


100%|██████████| 30/30 [01:30<00:00,  3.01s/it]

Continue Progres
Saved intermediate output after 2080 documents





https://dl-staging-website.ghost.io/content/images/2025/02/unnamed--49-.gif

In [None]:
# NOTE(review): `data_description` is not assigned in any cell shown above (the run above stores its result in `test`) — confirm where it is defined.
data_description

{'documents': [{'article_url': 'https://www.deeplearning.ai/the-batch/columbia-university-researchers-show-how-to-trick-trusting-ai-agents-with-poisoned-links/',
   'text_description': 'Here is a description of the article in one paragraph, within the 150-token limit:\n\nColumbia University researchers have found a way to trick AI agents with poisoned links, exploiting their implicit trust in popular websites. By crafting malicious posts on sites like Reddit, attackers can manipulate agents into divulging sensitive information or taking harmful actions. In tests, agents reliably followed instructions on malicious websites, revealing credit card information and sending phishing emails. This vulnerability highlights the need for more secure AI agent design to resist such manipulation and prevent online harm.',
   'images_metadata': []},
  {'article_url': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/'