### Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.robotparser
import time
import random
from collections import defaultdict
from urllib.parse import urlparse
import os
import json
import re

### Splitting CNN and Guardian URLs

In [2]:

# Read the raw dump as text; bytes that are not valid UTF-8 are dropped silently.
with open('database.json', 'r', encoding='utf-8', errors='ignore') as f:
    content = f.read()

# Replace every raw control character (ASCII < 32, newlines and tabs included)
# that is not preceded by a backslash with a space, then parse the result.
content = re.sub(r'(?<!\\)[\x00-\x1F]', ' ', content)
database = json.loads(content)

# Split the article ids by the news site that appears in their URL.
cnn_articles = [
    article_id
    for article_id, article in database.items()
    if 'https://www.cnn.com' in article['url']
]
guardian_articles = [
    article_id
    for article_id, article in database.items()
    if 'https://www.theguardian.com' in article['url']
]

print(f"Number of urls from CNN: {len(cnn_articles)}")
print(f"Number of urls from Guardian: {len(guardian_articles)}")

Number of urls from CNN: 24200
Number of urls from Guardian: 178603


### Crawling CNN's captions

- The caption of each image is crawled; if an image has no caption, its alt-text is crawled instead.
- Alt-texts are marked with a trailing `<alt>` token.

### Instructions:
- Adjust `start_index` and `end_index` before each run.
- Set `start_index` to the previous run's `end_index`.
- Recommended: `end_index - start_index <= 2000`.
- The result is saved as `database_{start_index}_{end_index}.json` in the `database_with_captions` folder.
- If an error occurs during a run, manually rename `database_modified.json` to the right name (adjust `end_index` to match the last entry in the progress log).

In [3]:
# [To Modify] the start index and end index of the urls
# The crawl below slices the CNN id list as [start_index:end_index], so
# start_index is included and end_index is excluded.
start_index = 4000
end_index = 4200

In [6]:

database_modified = defaultdict(dict)

def _extract_captions(html):
    """Collect one caption string per article image found in ``html``.

    Only ``div.image`` blocks inside ``main.article__main`` are considered;
    blocks nested in an ``<a>`` tag or inside a ``video-resource__image``
    element are skipped. An image without caption text falls back to its
    alt-text followed by the `` <alt>`` marker, or to ``""`` when it has
    neither. Returns an empty list when the page has no matching ``<main>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    main_content = soup.find("main", class_="article__main")
    captions = []
    if not main_content:
        return captions

    for image_block in main_content.find_all('div', class_='image'):
        if image_block.find_parent('a'):
            continue
        if image_block.find_parent(class_='video-resource__image'):
            continue

        caption_tag = image_block.find('div', class_='image__caption')
        caption = caption_tag.get_text(separator=' ', strip=True) if caption_tag else ""

        if caption == "":
            img = image_block.find('img')
            if img:
                alt_text = img.get('alt', '').strip()
                if alt_text:
                    caption = alt_text + ' <alt>'
        captions.append(caption)

    return captions


def send_requests_to_urls(article_ids, start_idx, end_idx, user_agent="MyNewsBot", progress_step=50, timeout=30):
    """Crawl image captions for ``article_ids[start_idx:end_idx]``.

    For every selected id the matching ``database`` record is copied into the
    module-level ``database_modified``; when the page is fetched successfully
    a ``'captions'`` list is added to that copy.

    Args:
        article_ids: Sequence of keys of ``database``.
        start_idx: First position to process (inclusive).
        end_idx: Position to stop at (exclusive).
        user_agent: Sent as the ``User-Agent`` header and used for the
            robots.txt permission check.
        progress_step: A checkpoint is saved and a progress line printed
            every ``progress_step`` processed ids.
        timeout: Seconds to wait for each HTTP response, so a stalled
            connection cannot hang the crawl.

    Side effects:
        Mutates ``database_modified``, calls ``save_database`` at every
        checkpoint and once more after the loop, prints progress/errors and
        sleeps between requests.
    """
    headers = {"User-Agent": user_agent}
    failed_requests = 0
    # robots.txt URL -> parser that already downloaded it, so each site's
    # robots.txt is fetched once instead of once per article.
    robot_parsers = {}

    batch = article_ids[start_idx:end_idx]
    N = len(batch)
    cnt = 0
    for article_id in batch:

        database_modified[article_id] = database[article_id].copy()

        url = database[article_id]['url']
        robots_url = f"{url.split('/')[0]}//{url.split('/')[2]}/robots.txt"
        robot_parser = robot_parsers.get(robots_url)
        if robot_parser is None:
            robot_parser = urllib.robotparser.RobotFileParser()
            robot_parser.set_url(robots_url)
            robot_parser.read()
            robot_parsers[robots_url] = robot_parser
        cnt += 1

        if not robot_parser.can_fetch(user_agent, url):
            print(f"❌ BLOCKED by robots.txt - {url}")
        else:
            try:
                response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 429:
                    # Honour the message: wait, then actually try once more.
                    print(f"❌ TOO MANY REQUESTS - {url}. Retrying in 30 seconds...")
                    time.sleep(30)
                    response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 200:
                    database_modified[article_id]['captions'] = _extract_captions(response.content)
                else:
                    print(f"❌ ERROR - Status Code: {response.status_code} - {url}")
                    failed_requests += 1

            except requests.exceptions.RequestException as e:
                print(f"❌ REQUEST FAILED - {url} due to {e}")
                failed_requests += 1

            # Politeness delay after every article request.
            time.sleep(random.uniform(0.2, 0.6))

        # Runs for blocked urls too, so no checkpoint is ever skipped.
        if cnt % progress_step == 0:
            save_database(database_modified)
            print('[Progress INFO] Processed %d/%d (%.2f%% done)' % (cnt, N, cnt*100.0/N))

    # Persist whatever was collected since the last checkpoint (the tail of a
    # batch whose size is not a multiple of progress_step).
    save_database(database_modified)

    print(f"Finished scraping to index {end_idx} with {failed_requests} failed requests!")

def save_database(database, fileroot='database_with_captions', filename='database_modified.json'):
    """Write ``database`` as indented JSON to ``fileroot/filename``.

    The output directory is created when missing. The data is first written
    to a sibling ``.tmp`` file and then moved over the target, so a failure
    in the middle of a dump leaves the previous checkpoint untouched.

    Args:
        database: JSON-serialisable mapping to store.
        fileroot: Directory that receives the file.
        filename: Name of the file inside ``fileroot``.

    Raises:
        TypeError: If ``database`` contains values ``json`` cannot serialise.
        OSError: If the file cannot be written or moved.
    """
    if fileroot:
        os.makedirs(fileroot, exist_ok=True)
    filename = os.path.join(fileroot, filename)
    tmp_filename = filename + '.tmp'
    try:
        with open(tmp_filename, 'w') as f:
            json.dump(database, f, indent=4)
        os.replace(tmp_filename, filename)
    finally:
        # Only present when the dump or the move failed part-way.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

def rename_file(filepath, new_filepath):
    """Rename the file at ``filepath`` to ``new_filepath`` using ``os.rename``."""
    source, target = filepath, new_filepath
    os.rename(source, target)

# Crawl the configured slice of CNN articles, checkpointing every 50 urls.
send_requests_to_urls(cnn_articles, start_idx=start_index, end_idx=end_index, progress_step=50)

# Give the checkpoint file its final name, stamped with start_index and end_index.
db_path = os.path.join('database_with_captions', 'database_modified.json')
new_dp_path = os.path.join('database_with_captions', f'database_{start_index}_{end_index}.json')
rename_file(db_path, new_dp_path)

Saving results...
[Progress INFO] Processed 4050/200 (2025.00% done)
Saving results...
[Progress INFO] Processed 4100/200 (2050.00% done)
Saving results...
[Progress INFO] Processed 4150/200 (2075.00% done)
Saving results...
[Progress INFO] Processed 4200/200 (2100.00% done)
Finished scraping to index 4200 with 0 failed requests!


In [None]:
# Reload the renamed result file and report how many articles it holds.

path = os.path.join('database_with_captions', f'database_{start_index}_{end_index}.json')
with open(path, 'r') as f:
    print(f'database path: {path}')
    tmp_database = json.load(f)
print(len(tmp_database))

### Crawling Guardian's captions

In [None]:
import requests
from bs4 import BeautifulSoup
import urllib.robotparser
import time
import random
from collections import defaultdict
from urllib.parse import urlparse


database_modified = defaultdict(dict)

def _extract_captions(html):
    """Collect one caption string per article image found in ``html``.

    Only ``div.image`` blocks inside ``main.article__main`` are considered;
    blocks nested in an ``<a>`` tag or inside a ``video-resource__image``
    element are skipped. An image without caption text falls back to its
    alt-text followed by the `` <alt>`` marker, or to ``""`` when it has
    neither. Returns an empty list when the page has no matching ``<main>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    main_content = soup.find("main", class_="article__main")
    captions = []
    if not main_content:
        return captions

    for image_block in main_content.find_all('div', class_='image'):
        if image_block.find_parent('a'):
            continue
        if image_block.find_parent(class_='video-resource__image'):
            continue

        caption_tag = image_block.find('div', class_='image__caption')
        caption = caption_tag.get_text(separator=' ', strip=True) if caption_tag else ""

        if caption == "":
            img = image_block.find('img')
            if img:
                alt_text = img.get('alt', '').strip()
                if alt_text:
                    caption = alt_text + ' <alt>'
        captions.append(caption)

    return captions


def send_requests_to_urls(article_ids, start_idx=0, user_agent="MyNewsBot", timeout=30, max_backoff=10):
    """Crawl image captions for ``article_ids[start_idx:]``.

    For every selected id the matching ``database`` record is copied into the
    module-level ``database_modified``; when the page is fetched successfully
    a ``'captions'`` list is added to that copy. A checkpoint is saved through
    ``save_database`` every 50 processed ids; the caller is expected to save
    once more after this function returns.

    NOTE(review): the HTML selectors are the same ones used for CNN pages
    earlier in this notebook — confirm they match the markup of the site
    this cell is meant to crawl.

    Args:
        article_ids: Sequence of keys of ``database``.
        start_idx: Position to resume from; the progress counter starts here.
        user_agent: Sent as the ``User-Agent`` header and used for the
            robots.txt permission check.
        timeout: Seconds to wait for each HTTP response.
        max_backoff: Upper bound, in seconds, of the extra pause taken after
            a failed request.
    """
    headers = {"User-Agent": user_agent}
    failed_requests = 0       # total failures, reported at the end
    consecutive_failures = 0  # drives the back-off; cleared by a success
    # robots.txt URL -> parser that already downloaded it, so each site's
    # robots.txt is fetched once instead of once per article.
    robot_parsers = {}

    N = len(article_ids)
    cnt = start_idx

    for article_id in article_ids[start_idx:]:
        database_modified[article_id] = database[article_id].copy()
        url = database[article_id]['url']
        robots_url = f"{url.split('/')[0]}//{url.split('/')[2]}/robots.txt"
        robot_parser = robot_parsers.get(robots_url)
        if robot_parser is None:
            robot_parser = urllib.robotparser.RobotFileParser()
            robot_parser.set_url(robots_url)
            robot_parser.read()
            robot_parsers[robots_url] = robot_parser
        cnt += 1

        # Check if the URL is allowed by robots.txt for the given user-agent
        if not robot_parser.can_fetch(user_agent, url):
            print(f"❌ BLOCKED by robots.txt - {url}")
        else:
            succeeded = False
            try:
                response = requests.get(url, headers=headers, timeout=timeout)

                # Handle 429 Too Many Requests (rate-limiting): wait, then
                # actually retry once as the message announces.
                if response.status_code == 429:
                    print(f"❌ TOO MANY REQUESTS - {url}. Retrying in 30 seconds...")
                    time.sleep(30)
                    response = requests.get(url, headers=headers, timeout=timeout)

                if response.status_code == 200:
                    database_modified[article_id]['captions'] = _extract_captions(response.content)
                    succeeded = True
                else:
                    # Handle other errors like 503 (Service Unavailable), 404, etc.
                    print(f"❌ ERROR - Status Code: {response.status_code} - {url}")

            except requests.exceptions.RequestException as e:
                print(f"❌ REQUEST FAILED - {url} due to {e}")

            if succeeded:
                consecutive_failures = 0
            else:
                failed_requests += 1
                consecutive_failures += 1
                # Exponential back-off capped at max_backoff, so one early
                # failure no longer slows down every later request and the
                # delay cannot grow without bound.
                lower = min(2 ** consecutive_failures, max_backoff)
                time.sleep(random.uniform(lower, max_backoff))

            # Politeness delay after every article request.
            time.sleep(random.uniform(0.5, 1))

        # Runs for blocked urls too, so no checkpoint is ever skipped.
        if cnt % 50 == 0:
            print("Saving results...")
            save_database(database_modified)
            print('Processed %d/%d (%.2f%% done)' % (cnt, N, cnt*100.0/N))


    print(f"Finished scraping with {failed_requests} failed requests!")

def save_database(database, filename='database_modified.json'):
    """Write ``database`` as indented JSON to ``filename``.

    The data is first written to ``filename + '.tmp'`` and then moved over
    the target, so a failure in the middle of a dump leaves the previously
    saved file untouched.

    Args:
        database: JSON-serialisable mapping to store.
        filename: Path of the file to (over)write.

    Raises:
        TypeError: If ``database`` contains values ``json`` cannot serialise.
        OSError: If the file cannot be written or moved.
    """
    tmp_filename = filename + '.tmp'
    try:
        with open(tmp_filename, 'w') as f:
            json.dump(database, f, indent=4)
        os.replace(tmp_filename, filename)
    finally:
        # Only present when the dump or the move failed part-way.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

# Load a list of article ids from disk, replacing raw control characters
# (those not preceded by a backslash) with spaces before parsing.
# NOTE(review): `article_ids` is not used by the call below, which crawls
# `cnn_articles` instead — confirm which list this cell is meant to process.
with open('ArticleIDs/article_ids1.json', 'r', encoding='utf-8') as f:
    content = f.read()

content = re.sub(r'(?<!\\)[\x00-\x1F]', ' ', content)
article_ids = json.loads(content)

# Resume the crawl at position 1050, then write everything collected to disk.
send_requests_to_urls(cnn_articles, start_idx=1050)
save_database(database=database_modified)
