### Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.robotparser
import time
import random
from collections import defaultdict
from urllib.parse import urlparse
import os
import json
import re

### Splitting CNN and Guardian URLs

In [5]:

# Read the raw database text; errors='ignore' drops bytes that are not valid UTF-8.
with open('../data/database/database.json', 'r', encoding='utf-8', errors='ignore') as db_file:
    content = db_file.read()

# Replace every ASCII control character (code < 32, newline and tab included) that is
# not directly preceded by a backslash with a space, so that json.loads accepts the text.
content = re.sub(r'(?<!\\)[\x00-\x1F]', ' ', content)
database = json.loads(content)

# Split the article ids by the site their URL points to (insertion order is preserved).
cnn_articles = [
    article_id
    for article_id, article in database.items()
    if 'https://www.cnn.com' in article['url']
]
guardian_articles = [
    article_id
    for article_id, article in database.items()
    if 'https://www.theguardian.com' in article['url']
]

print(f"Number of urls from CNN: {len(cnn_articles)}")
print(f"Number of urls from Guardian: {len(guardian_articles)}")

Number of urls from CNN: 24200
Number of urls from Guardian: 178603


### Crawling CNN's captions

- The caption of each image is crawled; if an image has no caption, its alt text is crawled instead.
- Captions taken from alt text end with the token `<alt>`.

### Instructions:
- Adjust `start_index` and `end_index` in the cell below.
- Set `start_index` to the `end_index` of the previous run.
- Recommended: `end_index - start_index <= 2000`.
- The result is saved as `database_{start_index}_{end_index}.json` in `database_with_captions`.
- If an error occurs during the run, manually rename `database_modified.json` to the correct name (adjust `end_index` to match the last entry in the progress log).

In [1]:
# [To Modify] Half-open slice [start_index, end_index) of the URL list to crawl in this run.
start_index, end_index = 13000, 15000

In [4]:
# Crawled records (a copy of each original record, plus 'captions' when the page was
# fetched successfully), keyed by article id. Filled in place by send_requests_to_urls.
database_modified = defaultdict(dict)


def _load_robot_parser(url, cache):
    """Return the robots.txt parser for the site of `url`, or None if it cannot be read.

    Parsers are stored in `cache` (keyed by "scheme://host") so robots.txt is downloaded
    once per site instead of once per article. Failures are not cached, so the next
    article of the same site tries again.
    """
    parts = urlparse(url)
    site = f"{parts.scheme}://{parts.netloc}"
    if site not in cache:
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"{site}/robots.txt")
        try:
            parser.read()
        except OSError as e:  # urllib's URLError and socket errors are OSError subclasses
            print(f"❌ ROBOTS.TXT UNAVAILABLE - {site} due to {e}")
            return None
        cache[site] = parser
    return cache[site]


def _extract_cnn_captions(soup):
    """Return one caption string per image block inside <main class="article__main">.

    An image without caption text falls back to its alt text followed by ' <alt>';
    an image with neither contributes an empty string.
    """
    captions = []
    main_content = soup.find("main", class_="article__main")
    if not main_content:
        return captions

    for image_block in main_content.find_all('div', class_='image'):
        # Ignore image blocks nested inside a link or inside a video-resource container.
        if image_block.find_parent('a'):
            continue
        if image_block.find_parent(class_='video-resource__image'):
            continue

        caption_tag = image_block.find('div', class_='image__caption')
        caption = caption_tag.get_text(separator=' ', strip=True) if caption_tag else ""

        if caption == "":
            img = image_block.find('img')
            if img:
                alt_text = img.get('alt', '').strip()
                if alt_text:
                    caption = alt_text + ' <alt>'
        captions.append(caption)
    return captions


def send_requests_to_urls(article_ids, start_idx, end_idx, user_agent="MyNewsBot", progress_step=50,
                          timeout=30, retry_delay=30):
    """Crawl image captions for article_ids[start_idx:end_idx] into `database_modified`.

    Relies on the notebook globals `database` (source records), `database_modified`
    (output) and `save_database` (checkpoint writer).

    Args:
        article_ids: list of article ids (keys of `database`).
        start_idx: index of the first article to crawl.
        end_idx: index one past the last article to crawl.
        user_agent: value sent as User-Agent and checked against robots.txt.
        progress_step: a checkpoint is saved and progress printed whenever the running
            index is a multiple of this value.
        timeout: seconds to wait for each HTTP request before giving up.
        retry_delay: seconds to wait before the single retry after an HTTP 429.

    Returns:
        None. Results are written to `database_modified` and saved via `save_database`.
    """
    print(f"Starting crawling captions with from index {start_idx} to index {end_idx}")
    headers = {"User-Agent": user_agent}
    failed_requests = 0
    robot_parsers = {}

    N = len(article_ids)
    cnt = start_idx
    unsaved = False
    for article_id in article_ids[start_idx:end_idx]:
        record = database[article_id]
        database_modified[article_id] = record.copy()
        url = record['url']
        cnt += 1
        unsaved = True

        robot_parser = _load_robot_parser(url, robot_parsers)
        if robot_parser is None:
            failed_requests += 1
        elif not robot_parser.can_fetch(user_agent, url):
            print(f"❌ BLOCKED by robots.txt - {url}")
        else:
            try:
                response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 429:
                    print(f"❌ TOO MANY REQUESTS - {url}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, "html.parser")
                    database_modified[article_id]['captions'] = _extract_cnn_captions(soup)
                else:
                    print(f"❌ ERROR - Status Code: {response.status_code} - {url}")
                    failed_requests += 1

            except requests.exceptions.RequestException as e:
                print(f"❌ REQUEST FAILED - {url} due to {e}")
                failed_requests += 1

            # Small random pause between page requests to avoid hammering the site.
            time.sleep(random.uniform(0.1, 0.3))

        # Checkpoint on every multiple of progress_step, including blocked/failed URLs.
        if cnt % progress_step == 0:
            save_database(database_modified)
            unsaved = False
            print('[Progress INFO] Processed %d/%d (%.2f%% done)' % (cnt, N, cnt*100.0/N))

    # Persist the tail of the range when it does not end on a multiple of progress_step.
    if unsaved:
        save_database(database_modified)

    print(f"Finished scraping to index {end_idx} with {failed_requests} failed requests!")

def save_database(database, fileroot='database_with_captions', filename='database_modified.json'):
    """Write `database` as indented JSON to `fileroot/filename`.

    The folder is created if missing. The JSON is first written to a sibling '.tmp'
    file and then moved over the target, so an interrupted save never leaves a
    half-written checkpoint under the real name.

    Args:
        database: JSON-serialisable mapping to store.
        fileroot: output folder.
        filename: output file name inside `fileroot`.
    """
    if fileroot:
        os.makedirs(fileroot, exist_ok=True)
    filename = os.path.join(fileroot, filename)
    tmp_filename = filename + '.tmp'
    try:
        with open(tmp_filename, 'w') as f:
            json.dump(database, f, indent=4)
        os.replace(tmp_filename, filename)
    finally:
        # Only still present when the write or the move failed.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

def rename_file(filepath, new_filepath):
    """Move `filepath` to `new_filepath`, overwriting an existing destination.

    os.replace is used instead of os.rename so that re-running a chunk behaves the
    same on every platform (os.rename refuses to overwrite on Windows).
    """
    os.replace(filepath, new_filepath)

# Crawl the selected slice of CNN articles (progress is reported every 50 URLs).
send_requests_to_urls(cnn_articles, start_idx=start_index, end_idx=end_index, progress_step=50)

# Give the checkpoint file its final name: database_{start_index}_{end_index}.json
db_path = os.path.join('database_with_captions', 'database_modified.json')
new_dp_path = os.path.join('database_with_captions', f'database_{start_index}_{end_index}.json')
rename_file(db_path, new_dp_path)

Starting crawling captions with from index 13000 to index 15000
[Progress INFO] Processed 13050/24200 (53.93% done)
[Progress INFO] Processed 13100/24200 (54.13% done)
[Progress INFO] Processed 13150/24200 (54.34% done)
[Progress INFO] Processed 13200/24200 (54.55% done)
[Progress INFO] Processed 13250/24200 (54.75% done)
[Progress INFO] Processed 13300/24200 (54.96% done)
[Progress INFO] Processed 13350/24200 (55.17% done)
[Progress INFO] Processed 13400/24200 (55.37% done)
[Progress INFO] Processed 13450/24200 (55.58% done)
[Progress INFO] Processed 13500/24200 (55.79% done)
[Progress INFO] Processed 13550/24200 (55.99% done)
[Progress INFO] Processed 13600/24200 (56.20% done)
[Progress INFO] Processed 13650/24200 (56.40% done)
[Progress INFO] Processed 13700/24200 (56.61% done)
[Progress INFO] Processed 13750/24200 (56.82% done)
[Progress INFO] Processed 13800/24200 (57.02% done)
[Progress INFO] Processed 13850/24200 (57.23% done)
[Progress INFO] Processed 13900/24200 (57.44% done)


In [5]:
# Sanity check: load the renamed chunk file and print how many records it holds.

path = os.path.join('database_with_captions', f'database_{start_index}_{end_index}.json')
with open(path, 'r') as f:
    print(f'database path: {path}')
    tmp_database = json.load(f)
print(len(tmp_database))

database path: database_with_captions\database_13000_15000.json
2000


In [None]:
# Sanity check: load the in-progress checkpoint file and print how many records it holds.

path = os.path.join('database_with_captions', 'database_modified.json')
with open(path, 'r') as f:
    print(f'database path: {path}')
    tmp_database = json.load(f)
print(len(tmp_database))

In [10]:
# Merge every crawled chunk file into a single CNN database file.
database_folder = 'database_with_captions'
database_cnn_folder = 'database_cnn'
database_cnn_dir = os.path.join(database_cnn_folder, 'database_cnn.json')

# Only finished chunks named database_{start}_{end}.json are merged. Anything else in
# the folder (an in-progress database_modified.json, checkpoint folders, temp files)
# is reported and skipped instead of being merged or crashing json.load.
chunk_name_pattern = re.compile(r'database_(\d+)_(\d+)\.json')

chunk_names = []
for db_name in os.listdir(database_folder):
    if chunk_name_pattern.fullmatch(db_name):
        chunk_names.append(db_name)
    else:
        print(f"Skipping {db_name}: not a database_<start>_<end>.json chunk")

# os.listdir returns names in arbitrary order; merge by ascending (start, end) index so
# the result is deterministic.
chunk_names.sort(key=lambda name: tuple(int(g) for g in chunk_name_pattern.fullmatch(name).groups()))

database_cnn = {}

for db_name in chunk_names:
    db_path = os.path.join(database_folder, db_name)
    with open(db_path, 'r') as f:
        db = json.load(f)
    database_cnn.update(db)

print(len(database_cnn))
os.makedirs(database_cnn_folder, exist_ok=True)
with open(database_cnn_dir, 'w') as f:
    json.dump(database_cnn, f, indent=4)



15000


### Crawling Guardian's captions

In [33]:
# [To Modify] Half-open slice [start_index, end_index) of the Guardian URL list to crawl.
start_index, end_index = 0, 50

In [None]:
# Crawled records (a copy of each original record, plus 'captions' when the page was
# fetched successfully), keyed by article id. Filled in place by send_requests_to_urls.
database_modified = defaultdict(dict)


def _load_robot_parser(url, cache):
    """Return the robots.txt parser for the site of `url`, or None if it cannot be read.

    Parsers are stored in `cache` (keyed by "scheme://host") so robots.txt is downloaded
    once per site instead of once per article. Failures are not cached, so the next
    article of the same site tries again.
    """
    parts = urlparse(url)
    site = f"{parts.scheme}://{parts.netloc}"
    if site not in cache:
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"{site}/robots.txt")
        try:
            parser.read()
        except OSError as e:  # urllib's URLError and socket errors are OSError subclasses
            print(f"❌ ROBOTS.TXT UNAVAILABLE - {site} due to {e}")
            return None
        cache[site] = parser
    return cache[site]


def _extract_guardian_captions(soup):
    """Return one caption string per image block found inside the page's <main> element.

    Image blocks are <div class="dcr-1t8m8f2">, or <div class="dcr-hlfdy3"> when none of
    the former exist. The caption is the <span class="dcr-1qvd3m6"> inside the block or,
    failing that, inside the block's parent. An image without caption text falls back to
    its alt text followed by ' <alt>'; an image with neither contributes an empty string.

    NOTE(review): the dcr-* class names look auto-generated and may change when the site
    is redeployed - confirm they still match before a long crawl.
    """
    captions = []
    main_content = soup.find("main")
    if not main_content:
        return captions

    image_blocks = main_content.find_all('div', class_='dcr-1t8m8f2')
    if not image_blocks:
        image_blocks = main_content.find_all('div', class_='dcr-hlfdy3')

    for image_block in image_blocks:
        # Ignore image blocks nested inside a link or inside a video-resource container.
        if image_block.find_parent('a'):
            continue
        if image_block.find_parent(class_='video-resource__image'):
            continue

        caption_tag = image_block.find('span', class_='dcr-1qvd3m6')
        if not caption_tag:
            # The caption may sit next to the image block rather than inside it.
            parent = image_block.find_parent()
            caption_tag = parent.find('span', class_='dcr-1qvd3m6')

        caption = caption_tag.get_text(separator=' ', strip=True) if caption_tag else ""

        if caption == "":
            img = image_block.find('img')
            if img:
                alt_text = img.get('alt', '').strip()
                if alt_text:
                    caption = alt_text + ' <alt>'
        captions.append(caption)
    return captions


def send_requests_to_urls(article_ids, start_idx, end_idx, user_agent="MyNewsBot", progress_step=50,
                          timeout=30, retry_delay=30):
    """Crawl image captions for article_ids[start_idx:end_idx] into `database_modified`.

    Relies on the notebook globals `database` (source records), `database_modified`
    (output) and `save_database` (checkpoint writer).

    Args:
        article_ids: list of article ids (keys of `database`).
        start_idx: index of the first article to crawl.
        end_idx: index one past the last article to crawl.
        user_agent: value sent as User-Agent and checked against robots.txt.
        progress_step: a checkpoint is saved and progress printed whenever the running
            index is a multiple of this value.
        timeout: seconds to wait for each HTTP request before giving up.
        retry_delay: seconds to wait before the single retry after an HTTP 429.

    Returns:
        None. Results are written to `database_modified` and saved via `save_database`.
    """
    print(f"Starting crawling captions with from index {start_idx} to index {end_idx}")
    headers = {"User-Agent": user_agent}
    failed_requests = 0
    robot_parsers = {}

    N = len(article_ids)
    cnt = start_idx
    unsaved = False
    for article_id in article_ids[start_idx:end_idx]:
        record = database[article_id]
        database_modified[article_id] = record.copy()
        url = record['url']
        cnt += 1
        unsaved = True

        robot_parser = _load_robot_parser(url, robot_parsers)
        if robot_parser is None:
            failed_requests += 1
        elif not robot_parser.can_fetch(user_agent, url):
            print(f"❌ BLOCKED by robots.txt - {url}")
        else:
            try:
                response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 429:
                    print(f"❌ TOO MANY REQUESTS - {url}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, "html.parser")
                    database_modified[article_id]['captions'] = _extract_guardian_captions(soup)
                else:
                    print(f"❌ ERROR - Status Code: {response.status_code} - {url}")
                    failed_requests += 1

            except requests.exceptions.RequestException as e:
                print(f"❌ REQUEST FAILED - {url} due to {e}")
                failed_requests += 1

            # Small random pause between page requests to avoid hammering the site.
            time.sleep(random.uniform(0.1, 0.2))

        # Checkpoint on every multiple of progress_step, including blocked/failed URLs.
        if cnt % progress_step == 0:
            save_database(database_modified)
            unsaved = False
            print('[Progress INFO] Processed %d/%d (%.2f%% done)' % (cnt, N, cnt*100.0/N))

    # Persist the tail of the range when it does not end on a multiple of progress_step.
    if unsaved:
        save_database(database_modified)

    print(f"Finished scraping to index {end_idx} with {failed_requests} failed requests!")

def save_database(database, fileroot='database_with_captions_guardian', filename='database_modified.json'):
    """Write `database` as indented JSON to `fileroot/filename`.

    The folder is created if missing. The JSON is first written to a sibling '.tmp'
    file and then moved over the target, so an interrupted save never leaves a
    half-written checkpoint under the real name.

    Args:
        database: JSON-serialisable mapping to store.
        fileroot: output folder.
        filename: output file name inside `fileroot`.
    """
    if fileroot:
        os.makedirs(fileroot, exist_ok=True)
    filename = os.path.join(fileroot, filename)
    tmp_filename = filename + '.tmp'
    try:
        with open(tmp_filename, 'w') as f:
            json.dump(database, f, indent=4)
        os.replace(tmp_filename, filename)
    finally:
        # Only still present when the write or the move failed.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

def rename_file(filepath, new_filepath):
    """Move `filepath` to `new_filepath`, overwriting an existing destination.

    os.replace is used instead of os.rename so that re-running a chunk behaves the
    same on every platform (os.rename refuses to overwrite on Windows).
    """
    os.replace(filepath, new_filepath)

# Crawl the selected slice of Guardian articles (progress is reported every 50 URLs).
send_requests_to_urls(guardian_articles, start_idx=start_index, end_idx=end_index, progress_step=50)

# Give the checkpoint file its final name: database_{start_index}_{end_index}.json
db_path = os.path.join('database_with_captions_guardian', 'database_modified.json')
new_dp_path = os.path.join('database_with_captions_guardian', f'database_{start_index}_{end_index}.json')
rename_file(db_path, new_dp_path)

Starting crawling captions with from index 0 to index 50
[Progress INFO] Processed 1/178603 (0.00% done)
[Progress INFO] Processed 2/178603 (0.00% done)
[Progress INFO] Processed 3/178603 (0.00% done)
[Progress INFO] Processed 4/178603 (0.00% done)
[Progress INFO] Processed 5/178603 (0.00% done)
[Progress INFO] Processed 6/178603 (0.00% done)
[Progress INFO] Processed 7/178603 (0.00% done)
[Progress INFO] Processed 8/178603 (0.00% done)
[Progress INFO] Processed 9/178603 (0.01% done)
[Progress INFO] Processed 10/178603 (0.01% done)
[Progress INFO] Processed 11/178603 (0.01% done)
[Progress INFO] Processed 12/178603 (0.01% done)
[Progress INFO] Processed 13/178603 (0.01% done)
[Progress INFO] Processed 14/178603 (0.01% done)
[Progress INFO] Processed 15/178603 (0.01% done)
[Progress INFO] Processed 16/178603 (0.01% done)
[Progress INFO] Processed 17/178603 (0.01% done)
[Progress INFO] Processed 18/178603 (0.01% done)
[Progress INFO] Processed 19/178603 (0.01% done)
[Progress INFO] Proce