# Task
Was made by Ostap Pavlyshyn

## Data Parsing via BeautifulSoup function in Python

In [177]:
import requests
from bs4 import BeautifulSoup
import json
import time

In [176]:
json_with_issues = {}

In [178]:
def extract_articles_with_images(soup):
    articles = []
    h_tags = soup.find_all('h1') + soup.find_all('h2') + soup.find_all('h3')

    for i, h1 in enumerate(h_tags):
        title = h1.get_text(strip=True)
        lower_title = title.lower()
        if not title or 'news' in lower_title or 'issue' in lower_title or 'message' in lower_title or 'subscribe' in lower_title:
            continue
        image_url = None

        prev = h1.find_previous_sibling()
        while prev and prev.name != 'figure':
            prev = prev.find_previous_sibling()
        if prev and prev.name == 'figure':
            img_tag = prev.find('img')
            if img_tag and 'src' in img_tag.attrs:
                image_url = img_tag['src']

        article_parts = []
        next_h1 = h_tags[i + 1] if i + 1 < len(h_tags) else None
        current = h1.find_next_sibling()

        while current and current != next_h1:
            if current.name in ['p', 'ul']:
                article_parts.append(current.get_text(strip=True))
            current = current.find_next_sibling()

        if not article_parts:
            continue
        
        articles.append({
            "title": title,
            "image": image_url,
            "content": "\n".join(article_parts)
        })

    return articles

In [179]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
                   (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}

for i in range(1, 23):
    batch_url = f"https://www.deeplearning.ai/the-batch/page/{i}/"
    response = requests.get(batch_url)

    if response.status_code == 429:
        time.sleep(25)
        response = requests.get(batch_url, headers=headers)

    soup = BeautifulSoup(response.text, "html.parser")
    script_tag = soup.find("script", id="__NEXT_DATA__")
    
    data = json.loads(script_tag.string)

    posts = data["props"]["pageProps"]["posts"]

    for post in posts:
        title = post["title"]
        slug = post["slug"]
        issue_url = f"https://www.deeplearning.ai/the-batch/{slug}/"
        soup_url = requests.get(issue_url).text
        soup_urll = BeautifulSoup(soup_url, "html.parser")
        articles = extract_articles_with_images(soup_urll)
        json_with_issues[slug] = {
            "title": title,
            "url": issue_url,
            "articles": articles
        }

In [180]:
print(len(json_with_issues))
json_with_issues["issue-i"]

316


{'title': 'The Batch: Initializing Neural Networks Tutorial, Automatic Annotation, The Robots are Winning, Drones Go Commercial',
 'url': 'https://www.deeplearning.ai/the-batch/issue-i/',
 'articles': [{'title': 'Initializing Neural Networks',
   'image': 'https://dl-staging-website.ghost.io/content/images/2022/09/dfbcdc70-a9a2-4967-95b8-6866a6a0a6bf-1.gif',
   'content': 'Initialization can have a significant impact on convergence in training deep neural networks. Simple initialization schemes can accelerate training, but they require care to avoid common pitfalls. In this interactive tutorial, we’ll explain how to initialize neural network parameters effectively.Learn more'},
  {'title': 'Automatic Annotation',
   'image': 'https://dl-staging-website.ghost.io/content/images/2022/09/dfbcdc70-a9a2-4967-95b8-6866a6a0a6bf-1.gif',
   'content': 'A new tool promises to speed up the laborious process of annotating computer-vision training data.\nWhat’s new:Curve-GCNestimates object outlines

## Vector DataBases (Chroma)

In [182]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def auto_chunk_articles(issues, chunk_size=640, chunk_overlap=100, length_threshold=500):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    all_chunks = []
    chunk_id = 0

    for issue in issues:
        issue_title = issue.get("title")
        issue_url = issue.get("url")

        for article in issue.get("articles", []):
            article_title = article.get("title")
            content = article.get("content", "").strip()
            image_url = article.get("image", None)

            if not content:
                continue

            word_count = len(content.split())

            if word_count <= length_threshold:
                all_chunks.append({
                    "id": f"chunk_{chunk_id}",
                    "content": content,
                    "metadata": {
                        "article_title": article_title,
                        "issue_title": issue_title,
                        "issue_url": issue_url,
                        "image_url": image_url
                    }
                })
                chunk_id += 1

            else:
                split_chunks = splitter.split_text(content)
                for chunk in split_chunks:
                    all_chunks.append({
                        "id": f"chunk_{chunk_id}",
                        "content": chunk,
                        "metadata": {
                            "article_title": article_title,
                            "issue_title": issue_title,
                            "issue_url": issue_url,
                            "image_url": image_url
                        }
                    })
                    chunk_id += 1

    return all_chunks

In [183]:
chunks = auto_chunk_articles(json_with_issues.values(), chunk_size=640, chunk_overlap=100, length_threshold=500)

In [218]:
print(f"Total chunks: {len(chunks)}")
print(chunks[0])

Total chunks: 4754
{'id': 'chunk_0', 'content': 'Microsoft published its latest recipe for training reasoning models, substantially expanding what is still a fairly small base of public knowledge.\nWhat’s new:Microsoft releasedPhi-4-reasoning, Phi-4-reasoning-plusandPhi-4-mini-reasoningalong with lessons learned in building the models.', 'metadata': {'article_title': 'Reasoning Models With\xa0Recipes', 'issue_title': 'Recipes For Reasoning, Open and Compact Code Generator, Looser AI Regulations, More Factual Output', 'issue_url': 'https://www.deeplearning.ai/the-batch/issue-301/', 'image_url': 'https://dl-staging-website.ghost.io/content/images/2025/05/unnamed--89-.png'}}


In [231]:
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("API_KEY")

In [None]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.docstore.document import Document as LCDocument

embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    task_type="SEMANTIC_SIMILARITY",
    google_api_key=api_key
)

lc_docs = [
    LCDocument(page_content=chunk["content"], metadata=chunk["metadata"])
    for chunk in chunks
]

vectordb = Chroma.from_documents(
    documents=lc_docs,
    embedding=embedding_model,
    persist_directory="./database"
)

In [238]:
system_prompt = (
        "You are an assistant that answers user questions using factual information extracted "
        "from articles in The Batch newsletter.\n\n"
        "If you see that titles are the same and urls as well, it means that the same article "
        "is repeated in the same issue. You should return only one answer combined of them. "
        "If you see different titles, you can return them separately.\n\n"
        "Return your answer in **strict JSON format** using the following schema:\n"
        "{"
        "  \"answers\": ["
        "    {"
        "      \"number\": <int>,"
        "      \"text\": <string>,"
        "      \"title\": <string>,"
        "      \"url\": <string>,"
        "      \"image_url\": <string>"
        "    }, "
        "    ..."
        "  ]"
        "}"
        "Make sure to:\n"
        "- Only use info from the provided context\n"
        "- Enumerate answers clearly using the \"number\" field\n"
        "- Always include the article \"title\" and \"urls\"\n"
        "- If the answer isn't available, return: { \"answers\": [\"text\":\"Sorry, try again\"] }"
    )

In [205]:
query = "tell me about drones"

In [236]:
docs = vectordb.similarity_search(query, k=3)

context = "\n\n".join(
    f"- Title: {doc.metadata.get('article_title')}\n"
    f"  URL: {doc.metadata.get('issue_url')}\n"
    f"  Image URL: {doc.metadata.get('image_url')}\n"
    f"  Content: {doc.page_content.strip()}"
    for doc in docs
)

In [209]:
import google.generativeai as genai
genai.configure(api_key=api_key)

model = genai.GenerativeModel("models/gemini-1.5-flash")

combined_prompt = (
    f"System: {system_prompt}\n\n"
    f"Use the following context to answer:\n\n{context}\n\nUser's question: {query}"
)

messages = [{"role": "user", "parts": [combined_prompt]}]

response = model.generate_content(messages)

print(response.text)

```json
{
  "answers": [
    {
      "number": 1,
      "text": "For the first time, U.S. regulators allowed commercial operators of autonomous aerial vehicles to fly beyond the operators' visual range.  The FAA typically requires ground-based observation of drones, but granted an exception to drone manufacturer American Robotics. Their 20-pound quadcopters follow set paths and use an acoustic sensing system to automatically avoid collisions.  A pre-flight checklist is required, though this can be done remotely. Flights are limited to daylight hours, altitudes under 400 feet, and specific areas in Kansas, Massachusetts, and Nevada.  Companies can apply to the FAA for waivers to the line-of-sight rule; American Robotics was the first, after four years of testing.  This development is a significant step toward more manageable and economical drone operations.",
      "title": "Drones Unleashed",
      "url": "https://www.deeplearning.ai/the-batch/issue-76/",
      "image_url": "https://ho

In [233]:
import re

def extract_json_from_response(text: str) -> dict:
    if text.lower().strip().startswith("json"):
        text = text[text.lower().find("{"):]
    text = text.strip().strip("`")

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        match = re.search(r'\{.*\}', text, re.DOTALL)
        if match:
            return json.loads(match.group())
        raise ValueError("Failed to extract valid JSON from response.")


In [216]:
js = extract_json_from_response(response.text)
if js.get("answers"):
    for answer in js["answers"]:
        print(f"**Answer {answer['number']}:** {answer['text']}")
        print(f"[{answer['title']}]({answer['url']})")
        if answer.get("image_url"):
            print(answer["image_url"])

**Answer 1:** For the first time, U.S. regulators allowed commercial operators of autonomous aerial vehicles to fly beyond the operators' visual range.  The FAA typically requires ground-based observation of drones, but granted an exception to drone manufacturer American Robotics. Their 20-pound quadcopters follow set paths and use an acoustic sensing system to automatically avoid collisions.  A pre-flight checklist is required, though this can be done remotely. Flights are limited to daylight hours, altitudes under 400 feet, and specific areas in Kansas, Massachusetts, and Nevada.  Companies can apply to the FAA for waivers to the line-of-sight rule; American Robotics was the first, after four years of testing.  This development is a significant step toward more manageable and economical drone operations.
[Drones Unleashed](https://www.deeplearning.ai/the-batch/issue-76/)
https://home-wordpress.deeplearning.ai/wp-content/uploads/2021/02/DRONES.gif


## RAGAS Evaluation

In [241]:
from datasets import Dataset
import pandas as pd

sample_data = {
    "question": [
        "What are the benefits of proper neural network initialization?",
        "How does Curve-GCN improve data annotation?",
        "What did Europe publish about AI ethics?"
    ],
    "answer": [
        "Proper initialization of neural networks can significantly impact convergence during training. Simple initialization methods can accelerate training, but require attention to detail to avoid potential issues.",
        "Curve-GCN is a new tool that speeds up the annotation of computer-vision training data. It estimates object outlines, allowing for adjustments for a tighter fit. Researchers found it to be much faster than other methods. The process starts with drawing a bounding box; Curve-GCN then automatically outlines the object's perimeter, which can be manually adjusted. This process, taking only seconds, was trained on Cityscapes (urban scenes) data but works well with various scenes, including aerial and medical imagery. By automating this laborious process, Curve-GCN allows for the creation of larger labeled image sets, leading to faster and more effective training.",
        "Europe's sweeping AI law moved decisively toward approval."
    ],
    "contexts": [
        [
            "Initialization can have a significant impact on convergence in training deep neural networks. "
            "Simple initialization schemes can accelerate training, but they require care to avoid common pitfalls."
        ],
        [
            "Curve-GCN estimates object outlines and lets you tweak them for a tighter fit. "
            "You can drag the outline into a more precise location, and the tool will recalculate the line."
        ],
        [
            "The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical development of AI. "
            "AI must be legal, ethical, robust, and respectful of human welfare and autonomy."
        ]
    ],
    "ground_truth": [
        "Proper initialization accelerates training and avoids pitfalls in deep neural networks.",
        "Curve-GCN helps annotate images faster by generating editable object outlines.",
        "Europe published ethical AI guidelines emphasizing legality and human rights."
    ]
}

dataset = Dataset.from_pandas(pd.DataFrame(sample_data))

In [242]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from ragas.llms.base import LangchainLLMWrapper
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
)

llm = LangchainLLMWrapper(
    ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=api_key,
        convert_system_message_to_human=True
    )
)

embedding_model = LangchainEmbeddingsWrapper(
    GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        task_type="SEMANTIC_SIMILARITY",
        google_api_key=api_key
    )
)

results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        answer_correctness,
    ],
    llm=llm,
    embeddings=embedding_model
)

print("="*10)
print(results)

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 37
}
].
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and bil

{'faithfulness': 0.5625, 'answer_relevancy': 0.8847, 'context_precision': 1.0000, 'context_recall': 1.0000, 'answer_correctness': 0.4857}
