# Update photo saving

In [1]:

import mimetypes

In [2]:
# Quick probe: guess_type infers a (type, encoding) pair from the filename alone.
mimetypes.guess_type('game.gif')

('image/gif', None)

In [3]:
from config import Config

def filtration_by_filename(photo_filename: str) -> bool:
    """Tell whether a photo filename matches any filtration rule.

    Every entry of ``Config.filtration_keywords`` is treated as a group of
    substrings; a group matches only when each of its substrings occurs in
    ``photo_filename``.

    Args:
        photo_filename: Filename to check.

    Returns:
        True when at least one group matches in full, otherwise False.
    """
    return any(
        all(token in photo_filename for token in keyword_group)
        for keyword_group in Config.filtration_keywords
    )

# Smoke test on two sample names; the printed booleans depend on Config.filtration_keywords.
print(filtration_by_filename('img_0_1_coursead-dlsoptimizationalgorithms'))
print(filtration_by_filename('issue_1_title_bert_is_back_img_0_1_cropped20roberta'))


True
False


In [6]:
import os
import json
import requests
from urllib.parse import urlparse
import re
from config import Config
from tqdm import tqdm

# Load the parsed articles (JSON) from the path configured in Config, one directory up.
with open(f'../{Config.parsed_full_data_path}', 'r', encoding='utf-8') as f:
    articles = json.load(f)

# Directory the download loop below writes images into; created if it is missing.
output_dir = f'../{Config.image_dataset_path}'
os.makedirs(output_dir, exist_ok=True)


def slugify(text):
    """Turn free text into a lowercase, filename-friendly slug.

    ``None`` is replaced by the placeholder ``'no_text'``. Any other value is
    lower-cased, stripped of surrounding whitespace, and has its spaces turned
    into underscores. Finally every character that is not an ASCII letter,
    digit, ``_`` or ``-`` is removed.

    Args:
        text: String to convert, or None.

    Returns:
        The resulting slug (possibly an empty string).
    """
    if text is None:
        slug = 'no_text'
    else:
        slug = text.lower().strip().replace(' ', '_')
    return re.sub(r'[^a-zA-Z0-9_\-]', '', slug)


def filtration_by_filename(photo_filename: str) -> bool:
    """Report whether ``photo_filename`` satisfies any filtration rule.

    A rule is one entry of ``Config.filtration_keywords``; it is satisfied when
    none of its keywords is absent from the filename.

    Args:
        photo_filename: Filename to check.

    Returns:
        True if some rule is fully satisfied, False otherwise.
    """
    for keyword_group in Config.filtration_keywords:
        absent = [token for token in keyword_group if token not in photo_filename]
        if not absent:
            return True
    return False

# Parallel lists: uris[i] is a saved filename, metadata[i] describes the article it came from.
uris = []
metadata = [] # one dict per saved image: source page url, issue and title slug

# Download images
for article in tqdm(articles):
    url = article.get('url', 'None')
    issue = article.get('issue', 'unknown')
    # A title stored as None is turned into 'no_text' by slugify.
    title_slug = slugify(article.get('title', 'no_title'))

    for idx, img_url in enumerate(article.get('images', [])):
        try:
            # Build a descriptive local filename from the path component of the image URL.
            parsed_url = urlparse(img_url)
            img_name = os.path.basename(parsed_url.path)
            # Fall back to '.jpg' when the URL path carries no extension.
            img_ext = os.path.splitext(img_name)[-1] or '.jpg'
            img_file_slug = slugify(os.path.splitext(img_name)[0])
            filename = f'issue_{issue}_title_{title_slug}_img_{idx}_{img_file_slug}{img_ext}'
            # Images whose generated name matches a filtration rule are not downloaded.
            if filtration_by_filename(filename):
                print(f"Filtered: {filename}")
                continue

            filepath = os.path.join(output_dir, filename)

            # Fetch the image and write the raw response body to disk (no content validation here).
            response = requests.get(img_url, timeout=10)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"Saved: {filename}")

            # Record the filename and its article metadata for the JSON export below.
            uris.append(filename)
            print({'url': url, 'issue':issue, 'title':title_slug})
            metadata.append({'url': url, 'issue':issue, 'title':title_slug})
        except Exception as e:
            # Best effort: report the failure and move on to the next image.
            print(f"Error downloading {img_url}: {e}")

# Bundle both lists into one document and write it next to the dataset.
export_data = {
    "uris": uris,
    "metadata": metadata
}

with open(f'../{Config.dataset_path}/photo_data.json', 'w') as f:
    json.dump(export_data, f)

print(f"Data exported to {Config.dataset_path}/photo_data.json")

100%|██████████| 1134/1134 [00:00<00:00, 35394.71it/s]

issue_1_title_this_shirt_hates_surveillance_img_0_adversarial20fashion.gif
{'url': 'https://www.deeplearning.ai/the-batch/issue-1/', 'issue': 1, 'title': 'this_shirt_hates_surveillance'}
issue_1_title_deepmind_results_raise_questions_img_0_1_deepmind.jpg
{'url': 'https://www.deeplearning.ai/the-batch/issue-1/', 'issue': 1, 'title': 'deepmind_results_raise_questions'}
issue_1_title_bert_is_back_img_0_1_cropped20roberta.png
{'url': 'https://www.deeplearning.ai/the-batch/issue-1/', 'issue': 1, 'title': 'bert_is_back'}
issue_1_title_a_message_fromdeeplearningai_img_0_1_coursead-dlsoptimizationalgorithms.png
Filtered: issue_1_title_a_message_fromdeeplearningai_img_0_1_coursead-dlsoptimizationalgorithms.png
issue_1_title_standards_in_the_making_img_0_1_nist.png
{'url': 'https://www.deeplearning.ai/the-batch/issue-1/', 'issue': 1, 'title': 'standards_in_the_making'}
issue_1_title_style_upgrade_img_0_1_style20320sized-1024x577.png
{'url': 'https://www.deeplearning.ai/the-batch/issue-1/', 'issu




919

In [8]:
import os
import json
import requests
from urllib.parse import urlparse
import re
from config import Config
from tqdm import tqdm
from PIL import Image
from io import BytesIO

# Load the parsed articles (JSON) from the path configured in Config, one directory up
with open(f'../{Config.parsed_full_data_path}', 'r', encoding='utf-8') as f:
    articles = json.load(f)

# Make sure the image output directory exists before anything is written into it
output_dir = f'../{Config.image_dataset_path}'
os.makedirs(output_dir, exist_ok=True)

def slugify(text):
    """Convert text to a slug that is safe to embed in a filename.

    ``None`` yields the placeholder ``'no_text'``. Otherwise the text is
    lower-cased, trimmed, and its spaces become underscores. In both cases
    anything outside ASCII letters, digits, ``_`` and ``-`` is then dropped.

    Args:
        text: String to convert, or None.

    Returns:
        The slug as a string (may be empty).
    """
    normalized = 'no_text' if text is None else text.lower().strip().replace(' ', '_')
    return re.sub(r'[^a-zA-Z0-9_\-]', '', normalized)

def filtration_by_filename(photo_filename: str) -> bool:
    """Check a photo filename against the configured filtration rules.

    Each rule in ``Config.filtration_keywords`` is a collection of substrings
    that must all be present in ``photo_filename`` for the rule to apply.

    Args:
        photo_filename: Filename to check.

    Returns:
        True if any rule applies, False if none does.
    """
    def rule_applies(rule):
        return all(part in photo_filename for part in rule)

    return any(map(rule_applies, Config.filtration_keywords))

# Download images
uris = []      # filenames of the images that were saved
metadata = []  # metadata[i] describes the article that uris[i] came from
skipped = []   # (image url, reason) for every image that was fetched or attempted but not saved

for article in tqdm(articles):
    url = article.get('url', 'None')
    issue = article.get('issue', 'unknown')
    # A title stored as None is turned into 'no_text' by slugify.
    title_slug = slugify(article.get('title', 'no_title'))

    for idx, img_url in enumerate(article.get('images', [])):
        try:
            # Build a descriptive local filename from the path component of the image URL.
            parsed_url = urlparse(img_url)
            img_name = os.path.basename(parsed_url.path)
            img_stem, img_ext = os.path.splitext(img_name)
            img_ext = img_ext or '.jpg'  # fall back when the URL path has no extension
            img_file_slug = slugify(img_stem)
            filename = f'issue_{issue}_title_{title_slug}_img_{idx}_{img_file_slug}{img_ext}'

            # Deliberately excluded by a filtration rule: not an error, so not recorded in `skipped`.
            if filtration_by_filename(filename):
                continue

            filepath = os.path.join(output_dir, filename)

            # Step 1: Download
            response = requests.get(img_url, timeout=10)
            response.raise_for_status()

            # Step 2: Check Content-Type is image
            content_type = response.headers.get('Content-Type', '')
            if 'image' not in content_type:
                skipped.append((img_url, f'unexpected Content-Type: {content_type!r}'))
                continue

            # Step 3: Validate the image can be opened; the context manager releases the image handle.
            try:
                with Image.open(BytesIO(response.content)) as img:
                    img.verify()  # Checks for corrupted image
            except Exception as exc:
                skipped.append((img_url, f'invalid image: {exc}'))
                continue

            # Step 4: Save the validated image
            with open(filepath, 'wb') as f:
                f.write(response.content)

            # Save metadata
            uris.append(filename)
            metadata.append({'url': url, 'issue': issue, 'title': title_slug})

        except Exception as exc:
            # Still best effort, but keep the reason instead of discarding it silently.
            skipped.append((img_url, f'{type(exc).__name__}: {exc}'))
            continue

# Save metadata to JSON
export_data = {
    "uris": uris,
    "metadata": metadata
}

with open(f'../{Config.dataset_path}/photo_data.json', 'w', encoding='utf-8') as f:
    json.dump(export_data, f, ensure_ascii=False, indent=4)

print(f"Saved {len(uris)} images, skipped {len(skipped)} (reasons are in `skipped`)")


  0%|          | 3/1134 [00:02<16:06,  1.17it/s]

filtrated: issue_1_title_a_message_fromdeeplearningai_img_0_1_coursead-dlsoptimizationalgorithms.png


  2%|▏         | 22/1134 [00:17<12:32,  1.48it/s]

filtrated: issue_5_title_a_message_fromdeeplearningai_img_0_1_dls20course20420course20ad20fixed20size.png


  3%|▎         | 36/1134 [00:27<14:13,  1.29it/s]

filtrated: issue_7_title_a_message_fromdeeplearningai_img_0_1_dls20course20520course20ad.png


  4%|▍         | 43/1134 [00:32<12:41,  1.43it/s]

filtrated: issue_8_title_a_message_fromdeeplearningai_img_0_course20ad2016.png


  4%|▍         | 50/1134 [00:36<12:54,  1.40it/s]

filtrated: issue_9_title_a_message_fromdeeplearningai_img_0_dls20course20420course20ad20fixed20size.png


  5%|▍         | 56/1134 [00:40<10:32,  1.70it/s]

filtrated: issue_10_title_a_message_fromdeeplearningai_img_0_dls20course20120course20ad.png


  6%|▋         | 71/1134 [00:50<13:36,  1.30it/s]

filtrated: issue_14_title_a_message_fromdeeplearningai_img_0_1_dls20course20520course20ad.png


  7%|▋         | 76/1134 [00:53<11:48,  1.49it/s]

filtrated: issue_15_title_a_message_fromdeeplearningai_img_0_1_dls20course20420course20ad20fixed20size.png


 30%|██▉       | 336/1134 [04:16<04:52,  2.73it/s]

filtrated: issue_72_title_onward_to_2021_img_0_screen20shot202020-12-2920at205-2.webp
filtrated: issue_72_title_no_text_img_0_ayanna-howar-screen20shot202020-12-2920at2010.webp


 33%|███▎      | 371/1134 [04:42<10:59,  1.16it/s]

filtrated: issue_78_title_a_message_fromdeeplearningai_img_0_the-batch-image-1024x576.png


 33%|███▎      | 377/1134 [04:44<04:39,  2.71it/s]

filtrated: issue_83_title_a_message_fromdeeplearningai_img_0_mlops-march-24_the-batch-image-2048x115220copy.png


 39%|███▉      | 443/1134 [04:55<02:37,  4.39it/s]

filtrated: issue_97_title_a_message_fromdeeplearningai_img_0_course20name201-2.png


 39%|███▉      | 446/1134 [04:56<02:15,  5.10it/s]

filtrated: issue_98_title_a_message_fromdeeplearningai_img_0_the-batch-image-4--1---2-.png


 40%|███▉      | 452/1134 [04:57<02:44,  4.14it/s]

filtrated: issue_99_title_no_text_img_0_course-name-2-2.png


 40%|████      | 458/1134 [04:59<02:47,  4.05it/s]

filtrated: issue_101_title_no_text_img_0_course-name-3-3.png


 41%|████▏     | 468/1134 [05:02<03:33,  3.12it/s]

filtrated: issue_103_title_no_text_img_0_experts-panel-811_the-batch-image.png


 42%|████▏     | 478/1134 [05:04<03:09,  3.47it/s]

filtrated: issue_106_title_no_text_img_0_course-name-banner-4-1.png


 43%|████▎     | 483/1134 [05:05<02:49,  3.85it/s]

filtrated: issue_107_title_no_text_img_0_course-name-banner-4--1-.png


 43%|████▎     | 488/1134 [05:07<02:36,  4.14it/s]

filtrated: issue_108_title_no_text_img_0_course-name-banner-4-2.png


 43%|████▎     | 493/1134 [05:08<03:06,  3.43it/s]

filtrated: issue_109_title_no_text_img_0_9.webp


 44%|████▍     | 498/1134 [05:09<03:08,  3.37it/s]

filtrated: issue_110_title_no_text_img_0_ai-x-929_the-batch-image.png


 45%|████▍     | 506/1134 [05:11<02:45,  3.81it/s]

filtrated: issue_115_title_something_wicked_this_way_comes_img_0_andrew-trickortreating-aspanda-4_600px.webp


 47%|████▋     | 534/1134 [05:12<00:18, 32.33it/s]

filtrated: issue_121_title_a_message_fromdeeplearningai_img_0_the-batch-s12n.webp


 54%|█████▍    | 613/1134 [05:15<00:35, 14.79it/s]

filtrated: issue_138_title_a_message_fromdeeplearningai_img_0_the-batch-image--3-.png


 54%|█████▍    | 618/1134 [05:16<00:42, 12.02it/s]

filtrated: issue_139_title_a_message_fromdeeplearningai_img_0_the-batch-image--4-.png


 55%|█████▍    | 622/1134 [05:17<00:51,  9.93it/s]

filtrated: issue_140_title_a_message_fromdeeplearningai_img_0_428_the-batch-image.png


 56%|█████▌    | 632/1134 [05:20<01:29,  5.63it/s]

filtrated: issue_142_title_bridge_to_explainable_ai_img_0_bridge.webp


 56%|█████▋    | 638/1134 [05:21<01:56,  4.26it/s]

filtrated: issue_143_title_a_message_fromdeeplearningai_img_0_ai-x-518_the-batch-image.png
filtrated: issue_143_title_managing_medical_uncertainty_img_0_hospitals-1.webp


 57%|█████▋    | 641/1134 [05:21<01:18,  6.28it/s]

filtrated: issue_144_title_gpt-free_img_0_meta.webp


 57%|█████▋    | 649/1134 [05:23<01:59,  4.05it/s]

filtrated: issue_145_title_when_data__danger_img_0_locationetcdata.webp


 57%|█████▋    | 652/1134 [05:23<01:25,  5.66it/s]

filtrated: issue_146_title_linear_regression_straight__narrow_img_0_linearregression_carweight-milege_1200px.webp
filtrated: issue_146_title_logistic_regression_follow_the_curve_img_0_logisticregression_tumbler_1200px.webp
filtrated: issue_146_title_gradient_descent_its_all_downhill_img_0_heroes-mountainpaths-gullies_1200px-1.webp
filtrated: issue_146_title_a_message_fromdeeplearningai_img_0_deeplearningai_banner_stanford_teaser_1200x628_artboard-2--1-.webp
filtrated: issue_146_title_neural_networks_find_the_function_img_0_admiralperceptron_1200px.webp
filtrated: issue_146_title_decision_trees_from_root_to_leaves_img_0_decisiontree_1200px.webp
filtrated: issue_146_title_k-means_clustering_group_think_img_0_k-means_3clusters_1200px_crop-2.webp
filtrated: issue_147_title_actors_act_against_ai_img_0_ezgif.webp


 58%|█████▊    | 661/1134 [05:24<00:29, 15.79it/s]

filtrated: issue_147_title_a_message_fromdeeplearningai_img_0_intro-to-mlops-cover--1-.webp
filtrated: issue_147_title_deep_learning_for_deep_discounts_img_0_upside.webp


 59%|█████▉    | 668/1134 [05:25<00:56,  8.23it/s]

filtrated: issue_148_title_a_message_fromdeeplearningai_img_0_pie---ai-amabssador-soptlight_the-batch-brigita.png


 60%|█████▉    | 678/1134 [05:27<01:14,  6.09it/s]

filtrated: issue_150_title_wind_in_the_forecast_img_0_wind.webp
filtrated: issue_150_title_a_message_fromdeeplearningai_img_0_pie---ai-amabssador-soptlight_the-batch-emilio.png


 61%|██████    | 691/1134 [05:31<02:18,  3.19it/s]

filtrated: issue_153_title_a_message_fromdeeplearningai_img_0_accelerating-your-ai-careermls3-aug_the-batch-image.png


 61%|██████▏   | 695/1134 [05:31<01:45,  4.17it/s]

filtrated: issue_154_title_keep_your_ais_on_the_road_img_0_eu.webp


 63%|██████▎   | 711/1134 [05:35<02:00,  3.52it/s]

filtrated: issue_158_title_a_message_fromdeeplearningai_img_0_ai-for-social-good-banner.webp


 63%|██████▎   | 716/1134 [05:36<01:53,  3.68it/s]

filtrated: issue_159_title_a_message_fromdeeplearningai_img_0_mls_learner_1200x628_a-1_artboard-1-copy-11-1.webp


 65%|██████▌   | 741/1134 [05:44<01:56,  3.36it/s]

filtrated: issue_165_title_a_message_fromour_partner_img_0_fourthbrain-banner-ad--1-.png
filtrated: issue_165_title_the_dark_side_of_the_moon__lit_up_img_0_moon_600px.webp


 66%|██████▋   | 753/1134 [05:46<01:07,  5.62it/s]

filtrated: issue_167_title_a_message_fromdeeplearningai_img_0_mls-thebatch-ad.png


 75%|███████▍  | 846/1134 [06:09<00:47,  6.00it/s]

filtrated: issue_184_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners.png


 75%|███████▍  | 850/1134 [06:09<01:01,  4.65it/s]

filtrated: issue_185_title_a_message_from_dataheroes_img_0_the-batch-ads-and-exclusive-banners.gif


 76%|███████▌  | 862/1134 [06:12<01:00,  4.47it/s]

filtrated: issue_187_title_a_message_from_workera_img_0_the-batch-ads-and-exclusive-banners--9-.png


 77%|███████▋  | 876/1134 [06:15<00:32,  7.90it/s]

filtrated: issue_189_title_a_message_fromdeeplearningai_img_0_323_the-batch-image.png


 78%|███████▊  | 880/1134 [06:15<00:37,  6.79it/s]

filtrated: issue_190_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--12-.png


 78%|███████▊  | 886/1134 [06:17<00:50,  4.90it/s]

filtrated: issue_191_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--7-.jpg


 79%|███████▉  | 894/1134 [06:18<00:33,  7.27it/s]

filtrated: issue_194_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--9-.jpg


 79%|███████▉  | 898/1134 [06:19<00:59,  3.97it/s]

filtrated: issue_195_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--11-.jpg


 80%|███████▉  | 904/1134 [06:20<00:41,  5.56it/s]

filtrated: issue_196_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--17-.png


 80%|████████  | 910/1134 [06:22<00:56,  3.95it/s]

filtrated: issue_197_title_a_message_fromworkera_img_0_the-batch-ads-and-exclusive-banners--12---1-.jpg


 81%|████████  | 916/1134 [06:23<00:42,  5.15it/s]

filtrated: issue_198_title_a_message_from_deeplearningai_img_0_the-batch-ads-and-exclusive-banners--23-.png


 81%|████████▏ | 922/1134 [06:24<00:37,  5.62it/s]

filtrated: issue_199_title_a_message_fromdeeplearningai_img_0_new-courses-batch.png


 82%|████████▏ | 928/1134 [06:25<00:34,  6.02it/s]

filtrated: issue_200_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--30-.png


 82%|████████▏ | 934/1134 [06:26<00:36,  5.43it/s]

filtrated: issue_201_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--33-.png


 83%|████████▎ | 940/1134 [06:27<00:27,  7.17it/s]

filtrated: issue_202_title_a_message_from_landing_ai_img_0_the-batch-ads-and-exclusive-banners--35-.png


 84%|████████▎ | 948/1134 [06:28<00:23,  7.95it/s]

filtrated: issue_203_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--36-.png


 84%|████████▍ | 954/1134 [06:29<00:30,  5.84it/s]

filtrated: issue_204_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--41-.png


 85%|████████▌ | 967/1134 [06:33<00:42,  3.89it/s]

filtrated: issue_207_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--43-.png


 86%|████████▌ | 971/1134 [06:33<00:37,  4.40it/s]

filtrated: issue_208_title_a_message_from_deeplearningai_img_0_the-batch-ads-and-exclusive-banners--45-.png


 86%|████████▋ | 980/1134 [06:35<00:23,  6.55it/s]

filtrated: issue_209_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--49-.png


 87%|████████▋ | 984/1134 [06:36<00:27,  5.54it/s]

filtrated: issue_210_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--48-.png


 87%|████████▋ | 987/1134 [06:37<00:39,  3.76it/s]

filtrated: issue_210_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--50-.png


 87%|████████▋ | 991/1134 [06:37<00:27,  5.17it/s]

filtrated: issue_211_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--51-.png


 88%|████████▊ | 1002/1134 [06:39<00:28,  4.64it/s]

filtrated: issue_213_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--53-.png


 89%|████████▉ | 1007/1134 [06:40<00:25,  5.07it/s]

filtrated: issue_214_title_a_message_from_speechlab_img_0_the-batch-ads-and-exclusive-banners--63-.png


 90%|████████▉ | 1020/1134 [06:43<00:23,  4.95it/s]

filtrated: issue_217_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--67-.png


 90%|█████████ | 1026/1134 [06:44<00:27,  3.99it/s]

filtrated: issue_218_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--68-.png


 91%|█████████ | 1034/1134 [06:45<00:14,  7.08it/s]

filtrated: issue_219_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--70-.png


 92%|█████████▏| 1039/1134 [06:46<00:18,  5.02it/s]

filtrated: issue_220_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--71-.png


 92%|█████████▏| 1048/1134 [06:47<00:12,  7.07it/s]

filtrated: issue_221_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--72-.png


 93%|█████████▎| 1055/1134 [06:48<00:09,  8.57it/s]

filtrated: issue_223_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--80-.png


 93%|█████████▎| 1059/1134 [06:49<00:07, 10.22it/s]

filtrated: issue_223_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners-_72_.jpg


 94%|█████████▎| 1061/1134 [06:49<00:09,  7.49it/s]

filtrated: issue_224_title_a_message_fromdeeplearningai_img_0_the-batch--2-.png


 94%|█████████▍| 1068/1134 [06:50<00:10,  6.22it/s]

filtrated: issue_225_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--83-.png


 96%|█████████▌| 1087/1134 [06:53<00:07,  6.54it/s]

filtrated: issue_229_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--93-.png


 96%|█████████▋| 1093/1134 [06:54<00:07,  5.51it/s]

filtrated: issue_230_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners--95-.png


 98%|█████████▊| 1106/1134 [07:04<00:15,  1.86it/s]

filtrated: issue_232_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners---2024-01-16t090702307.png


 98%|█████████▊| 1114/1134 [07:05<00:03,  5.18it/s]

filtrated: issue_233_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners---2024-01-23t083847050.png


 99%|█████████▊| 1118/1134 [07:06<00:03,  4.79it/s]

filtrated: issue_234_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners---2024-01-30t091637902.png


 99%|█████████▉| 1126/1134 [07:07<00:01,  7.12it/s]

filtrated: issue_238_title_blazing_inference_speed_img_0_groq-llmperf.webp
filtrated: issue_238_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners---2024-02-20t090733220.png


100%|█████████▉| 1132/1134 [07:08<00:00,  6.76it/s]

filtrated: issue_239_title_a_message_fromdeeplearningai_img_0_the-batch-ads-and-exclusive-banners---2024-03-05t085620866.png


100%|██████████| 1134/1134 [07:08<00:00,  2.64it/s]


# Web scraping

In [7]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import json

from config import Config

def setup_driver(headless=False):
    """Create a Chrome WebDriver with this notebook's command-line flags.

    ``--disable-gpu`` and ``--no-sandbox`` are always passed; ``--headless=new``
    is added in front of them when ``headless`` is truthy.

    Args:
        headless: Whether to add the ``--headless=new`` flag.

    Returns:
        The ``webdriver.Chrome`` instance built from those options.
    """
    flags = ['--headless=new'] if headless else []
    flags += ['--disable-gpu', '--no-sandbox']

    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def _new_article(issue_number, url):
    """Return an empty article record for the given issue page."""
    return {
        "issue": issue_number,
        "url": url,
        "title": None,
        "text": "",
        "images": [],
        "captions": []
    }


def _should_keep(article):
    """Tell whether a finished article record belongs in the results.

    A record is kept when it has a title or some text and its title does not
    contain the sponsor marker 'A MESSAGE FROM'.
    """
    title = article["title"]
    if not (title or article["text"]):
        return False
    return not (title and 'A MESSAGE FROM' in title)


def extract_issue_data(driver, issue_number, page_load_wait=5):
    """Load one issue page of The Batch and split it into article records.

    The page is fetched through ``driver``, parsed with BeautifulSoup, and the
    children of ``div.prose--styled`` are walked in order. Processing starts
    after the ``h1``/``h2`` with ``id="news"`` when it exists, otherwise at the
    first child. Headings set the current title, ``<p>`` and ``<ul>`` add text,
    ``<figure>`` adds an image URL and a caption, and ``<hr>`` closes the
    current article.

    Args:
        driver: Object exposing ``get(url)`` and ``page_source``.
        issue_number: Issue number used to build the page URL.
        page_load_wait: Seconds to sleep after ``driver.get`` before reading
            the page source. Defaults to 5.

    Returns:
        A list of dicts with keys ``issue``, ``url``, ``title``, ``text``,
        ``images`` and ``captions``, or None when the main content container
        is not found.
    """
    url = f"https://www.deeplearning.ai/the-batch/issue-{issue_number}/"
    print(url)
    driver.get(url)
    time.sleep(page_load_wait)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    main_content = soup.find("div", class_="prose--styled")

    if not main_content:
        print(f"❌ No main content found for issue {issue_number}")
        return None

    news_anchor = main_content.find(lambda tag: tag.name in ["h1", "h2"] and tag.get("id") == "news")

    if not news_anchor:
        print(f"⚠️ No 'news' anchor found in issue {issue_number}, get all")
        elements_to_process = main_content
    else:
        elements_to_process = list(news_anchor.next_siblings)

    articles_data = []
    current_article = _new_article(issue_number, url)

    for element in elements_to_process:
        # Bare strings between tags carry no structure; skip them.
        if isinstance(element, str):
            continue

        if element.name in ["h1", "h2", "h3"]:
            current_article["title"] = element.get_text(strip=True)

        elif element.name == "p":
            text = element.get_text(strip=True)
            if text:
                current_article["text"] += text + "\n"

        elif element.name == "ul":
            for li in element.find_all("li"):
                li_text = li.get_text(strip=True)
                if li_text:
                    current_article["text"] += "• " + li_text + "\n"

        elif element.name == "figure":
            img_tag = element.find("img")
            if img_tag:
                img_src = img_tag.get("src")
                if img_src:
                    current_article["images"].append(img_src)

            caption_tag = element.find("figcaption")
            if caption_tag:
                caption = caption_tag.get_text(strip=True)
            else:
                # Fallback: use the adjacent <p> as the caption.
                next_p = element.find_next_sibling("p")
                caption = next_p.get_text(strip=True) if next_p else ""

            # NOTE(review): a caption is appended for every <figure>, even one
            # without an image src, so `captions` can be longer than `images`.
            # Confirm whether consumers pair the two lists by index.
            current_article["captions"].append(caption)

        elif element.name == "hr":
            # An <hr> ends the current article. The title may still be None here
            # (an <hr> before any heading), so the keep-check must tolerate that.
            if _should_keep(current_article):
                articles_data.append(current_article)
            current_article = _new_article(issue_number, url)

    # Flush the trailing article with the same rule used at every <hr>.
    if _should_keep(current_article):
        articles_data.append(current_article)

    return articles_data

def scrape_all_issues(start=1, end=239, headless=True):
    """Scrape a range of issues and return all their articles in one list.

    A single driver from ``setup_driver`` is reused for every issue and is
    always quit at the end, even if the loop is interrupted. An issue whose
    extraction raises is reported and skipped; an issue for which
    ``extract_issue_data`` returns None (or nothing) is skipped quietly.

    Args:
        start: First issue number to scrape (inclusive).
        end: Last issue number to scrape (inclusive).
        headless: Passed through to ``setup_driver``.

    Returns:
        A flat list with the article records of every issue that was scraped.
    """
    driver = setup_driver(headless)
    all_issues = []

    try:
        for issue_number in range(start, end + 1):
            try:
                print(f"Scraping Issue #{issue_number}")
                data = extract_issue_data(driver, issue_number)
            except Exception as e:
                print(f"Failed to scrape issue {issue_number}: {e}")
                continue
            # None means the page had no main content; that is not a scrape failure.
            if data:
                all_issues.extend(data)
    finally:
        # Release the browser on every exit path, including interrupts.
        driver.quit()
    return all_issues


# Spot check on a single issue; the result is only kept in memory and displayed below.
results = scrape_all_issues(start=39, end=39)

# The previous message claimed a JSON file had been written, but this cell never writes one.
print(f"✅ Scraping complete: {len(results)} articles collected (nothing was written to disk)")
results

Scraping Issue #39
https://www.deeplearning.ai/the-batch/issue-39/
✅ Scraping complete. Data saved to 'the_batch_articles.json'


[{'issue': 39,
  'url': 'https://www.deeplearning.ai/the-batch/issue-39/',
  'title': 'Mask Monitor',
  'text': 'Cameras that detect face masks are helping French authorities to evaluate citizens’ adherence to government mandates intended to fight Covid-19.What’s new:Starting this week, everyone riding public transportation in France isrequiredto wear a face mask.Paris and Cannesare using computer vision to count people who comply.How it works:Datakalab, a French AI startup, is installing chips in existing CCTV cameras that run an object recognition model. The model is trained to distinguish masked faces from unmasked ones.\n• Paris istestingthe cameras at the busy Chatelet-Les Halles metro station. Cannes has installed them on buses and in public markets.\n• The software counts mask wearers every 15 minutes and transmits aggregate statistics to the authorities. The companysaysthe system is meant to help authorities determine where to step up efforts to promote mask-wearing\n• Datakala