In [1]:
import requests

def fetch_robots_txt(url, timeout=10):
    """Download the robots.txt file located directly under ``url``.

    Args:
        url: Base URL of the site, with or without a trailing slash.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook cell forever.

    Returns:
        The robots.txt body as text, or None when the request fails or the
        server answers with any status other than 200 (a message is printed
        in both failure cases).
    """
    if not url.endswith('/'):
        url += '/'
    robots_url = url + 'robots.txt'
    try:
        response = requests.get(robots_url, timeout=timeout)
    except requests.RequestException as exc:
        # DNS failures, refused connections and timeouts all end up here.
        print(f"Failed to fetch robots.txt: {exc}")
        return None
    if response.status_code == 200:
        return response.text
    print(f"Failed to fetch robots.txt: HTTP {response.status_code}")
    return None

def parse_robots_txt(content):
    """Parse robots.txt text into per-agent rules and a list of sitemaps.

    Consecutive ``User-agent`` lines form one group, and the Allow/Disallow
    lines that follow apply to every agent of that group. Anything after a
    ``#`` on a line is treated as a comment and ignored.

    Args:
        content: Full text of a robots.txt file.

    Returns:
        A ``(rules, sitemaps)`` tuple. ``rules`` maps each user-agent string
        to ``{'allow': [...], 'disallow': [...]}`` (paths in file order);
        ``sitemaps`` is the list of Sitemap URLs in file order.
    """
    rules = {}
    sitemaps = []
    group_agents = []        # agents that the upcoming rule lines apply to
    group_has_rules = False  # True once the current group has seen a rule

    for raw_line in content.splitlines():
        # Drop full-line and trailing comments before looking at the field.
        line = raw_line.split('#', 1)[0].strip()
        if not line or ':' not in line:
            continue

        field, value = line.split(':', 1)
        field = field.strip().lower()
        value = value.strip()

        if field == 'user-agent':
            if group_has_rules:
                # A User-agent line after rule lines starts a new group.
                group_agents = []
                group_has_rules = False
            rules.setdefault(value, {'allow': [], 'disallow': []})
            if value:
                group_agents.append(value)
        elif field in ('allow', 'disallow'):
            group_has_rules = True
            for agent in group_agents:
                rules[agent][field].append(value)
        elif field == 'sitemap':
            # Sitemap lines are independent of user-agent groups.
            sitemaps.append(value)

    return rules, sitemaps

if __name__ == "__main__":
    url = "https://www.aljazeera.com"
    content = fetch_robots_txt(url)
    if content:
        rules, sitemaps = parse_robots_txt(content)
        # One report section per user agent: allow list first, then disallow.
        for agent, paths in rules.items():
            print(f"User-agent: {agent}")
            for heading, key in (("Allow", "allow"), ("Disallow", "disallow")):
                print(f"  {heading}:")
                for p in paths[key]:
                    print(f"    {p}")
            print()
        # Sitemap URLs are listed once, after all agent sections.
        print("Sitemaps found:")
        for sitemap in sitemaps:
            print(f"  {sitemap}")


User-agent: *
  Allow:
    /search/$
  Disallow:
    /api
    /asset-manifest.json
    /search/
    /home/search?q=

User-agent: anthropic-ai
  Allow:
  Disallow:
    /

User-agent: ChatGPT-User
  Allow:
  Disallow:
    /

User-agent: ClaudeBot
  Allow:
  Disallow:
    /

User-agent: Claude-Web
  Allow:
  Disallow:
    /

User-agent: cohere-ai
  Allow:
  Disallow:
    /

User-agent: GPTBot
  Allow:
  Disallow:
    /

User-agent: PerplexityBot
  Allow:
  Disallow:
    /

User-agent: Bytespider
  Allow:
  Disallow:
    /

Sitemaps found:
  https://www.aljazeera.com/sitemap.xml
  https://www.aljazeera.com/news-sitemap.xml
  https://www.aljazeera.com/sitemaps/article-archive.xml
  https://www.aljazeera.com/sitemaps/article-new.xml
  https://www.aljazeera.com/sitemaps/video-archive.xml
  https://www.aljazeera.com/sitemaps/video-new.xml


In [27]:
import requests
import xml.etree.ElementTree as ET
import os

def fetch_sitemap(url, timeout=10):
    """Download a sitemap (or sitemap index) document.

    Args:
        url: Absolute URL of the sitemap XML.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive sitemap cannot stall the whole crawl.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with any status other than 200 (a message is printed
        in both failure cases).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported, not raised.
        print(f"Failed to fetch sitemap: {exc}")
        return None
    if response.status_code == 200:
        return response.text
    print(f"Failed to fetch sitemap: HTTP {response.status_code}")
    return None

def parse_sitemap_index(xml_content):
    """Extract the child sitemap URLs from a sitemap index document.

    Args:
        xml_content: XML text of a sitemap index.

    Returns:
        List of the ``<loc>`` values found under the root's direct
        ``<sitemap>`` children, with surrounding whitespace removed and
        empty entries skipped. Returns an empty list when the XML cannot be
        parsed (the parse error is printed) or has no such elements.
    """
    sitemap_urls = []
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        print(f"Error parsing sitemap XML: {e}")
        return sitemap_urls

    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    for sitemap in root.findall('ns:sitemap', namespace):
        loc = sitemap.find('ns:loc', namespace)
        if loc is None:
            continue
        # loc.text is None for an empty <loc/>, and pretty-printed files
        # often pad the URL with newlines or spaces.
        location = (loc.text or '').strip()
        if location:
            sitemap_urls.append(location)
    return sitemap_urls

def parse_sitemap(xml_content):
    """Extract the page URLs from a regular (non-index) sitemap document.

    Args:
        xml_content: XML text of a sitemap ``<urlset>``.

    Returns:
        List of the ``<loc>`` values found under the root's direct ``<url>``
        children, with surrounding whitespace removed and empty entries
        skipped. Returns an empty list when the XML cannot be parsed (the
        parse error is printed) or has no such elements.
    """
    urls = []
    try:
        root = ET.fromstring(xml_content)
    except ET.ParseError as e:
        print(f"Error parsing sitemap XML: {e}")
        return urls

    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    for url in root.findall('ns:url', namespace):
        loc = url.find('ns:loc', namespace)
        if loc is None:
            continue
        # loc.text is None for an empty <loc/>, and pretty-printed files
        # often pad the URL with newlines or spaces.
        location = (loc.text or '').strip()
        if location:
            urls.append(location)
    return urls

def save_urls_to_txt(urls, filename):
    """Write ``urls`` to ``filename`` as UTF-8 text, one URL per line.

    The file is overwritten. A confirmation with the number of URLs is
    printed on success; any exception raised while writing is caught and
    reported with a printed message instead of being propagated.
    """
    try:
        with open(filename, "w", encoding="utf-8") as out:
            # Each entry is terminated by a newline, including the last one.
            out.writelines(url + "\n" for url in urls)
        print(f"Saved {len(urls)} URLs to {filename}")
    except Exception as e:
        print(f"Error saving URLs to file: {e}")

if __name__ == "__main__":
    sitemap_index_url = "https://www.aljazeera.com/sitemap.xml"
    sitemap_index_content = fetch_sitemap(sitemap_index_url)

    if sitemap_index_content:
        all_urls = []
        sitemap_urls = parse_sitemap_index(sitemap_index_content)

        if not sitemap_urls:
            # No <sitemap> entries: treat the document itself as a URL list.
            print("This is a regular sitemap (not an index).")
            urls = parse_sitemap(sitemap_index_content)
            all_urls.extend(urls)
        else:
            print(f"Found {len(sitemap_urls)} sitemap URLs in sitemap index.")
            for position, sm_url in enumerate(sitemap_urls, start=1):
                print(f"Fetching URLs from sitemap {position}: {sm_url}")
                sm_content = fetch_sitemap(sm_url)
                if not sm_content:
                    # fetch_sitemap already reported the failure.
                    continue
                urls = parse_sitemap(sm_content)
                print(f"Found {len(urls)} URLs in sitemap {position}")
                all_urls.extend(urls)

        # Everything collected goes into one text file in ./sitemaps_files.
        output_dir = "sitemaps_files"
        os.makedirs(output_dir, exist_ok=True)
        save_urls_to_txt(all_urls, os.path.join(output_dir, "all_sitemaps_url.txt"))

In [3]:
import requests
import xml.etree.ElementTree as ET

rss_url = "https://www.aljazeera.com/xml/rss/all.xml"

# Fail fast on a stalled connection or an HTTP error instead of handing an
# error page to the XML parser.
response = requests.get(rss_url, timeout=10)
response.raise_for_status()
rss_content = response.content

root = ET.fromstring(rss_content)

# The entries are the <item> children of <channel>; fall back to an empty
# list when the document has no <channel> element.
channel = root.find('channel')
items = channel.findall('item') if channel is not None else []

# Preview the first five entries only.
for item in items[:5]:
    # findtext() returns the default for a missing child, so an incomplete
    # item prints an empty field instead of raising AttributeError.
    title = item.findtext('title', default='')
    link = item.findtext('link', default='')
    pub_date = item.findtext('pubDate', default='')
    print(f"Title: {title}")
    print(f"Link: {link}")
    print(f"Published: {pub_date}")
    print("-" * 50)


Title: Europe’s political centre holds in weekend of elections
Link: https://www.aljazeera.com/video/inside-story/2025/5/19/europes-political-centre-holds-in-weekend-of-elections?traffic_source=rss
Published: Mon, 19 May 2025 19:23:15 +0000
--------------------------------------------------
Title: Former President Bolsonaro’s coup trial opens in Brazil
Link: https://www.aljazeera.com/news/2025/5/19/former-president-bolsonaros-coup-trial-opens-in-brazil?traffic_source=rss
Published: Mon, 19 May 2025 19:22:13 +0000
--------------------------------------------------
Title: British presenter Gary Lineker steps down over anti-Semitism row
Link: https://www.aljazeera.com/program/newsfeed/2025/5/19/british-presenter-gary-lineker-steps-down-over-anti-semitism-row?traffic_source=rss
Published: Mon, 19 May 2025 18:34:29 +0000
--------------------------------------------------
Title: Pope Leo XIV meets US VP JD Vance
Link: https://www.aljazeera.com/program/newsfeed/2025/5/19/pope-leo-xiv-meets-us

In [4]:
import requests

url = "https://www.aljazeera.com/api"
# Alternative test target kept from the original cell; the id/title handling
# below was presumably written against it (TODO confirm):
#url = "https://jsonplaceholder.typicode.com/posts"
headers = {
    "User-Agent": "MyCrawler/1.0",
    "Accept": "application/json"
}

response = requests.get(url, headers=headers)

if response.status_code != 200:
    print(f"Failed to get data, status code: {response.status_code}")
else:
    # Show at most the first five entries of the decoded JSON payload.
    data = response.json()
    for post in data[:5]:
        print(f"Post ID: {post['id']}, Title: {post['title']}")


Failed to get data, status code: 404


In [5]:
import requests

url = "https://www.aljazeera.com/api/endpoint-specific"  # placeholder: put the correct API URL here

headers = {
    "User-Agent": "MyCrawler/1.0",
    "Accept": "application/json"
}

response = requests.get(url, headers=headers)

if response.status_code != 200:
    print(f"Failed to fetch data. Status code: {response.status_code}")
else:
    # A 200 answer is not guaranteed to be JSON, so decoding is guarded.
    try:
        data = response.json()
        print(data)
    except ValueError:
        print("Response is not in JSON format.")


Failed to fetch data. Status code: 404


In [6]:
import requests
from urllib.robotparser import RobotFileParser

robots_url = "https://www.aljazeera.com/robots.txt"

rp = RobotFileParser()
rp.set_url(robots_url)
rp.read()

# crawl_delay() returns None when robots.txt has no Crawl-delay entry that
# applies to the agent, so report that case explicitly instead of printing
# "None seconds".
delay = rp.crawl_delay("*")
if delay is None:
    print("No Crawl-delay is specified for user agent '*'")
else:
    print(f"Crawl-delay is: {delay} seconds")


Crawl-delay is: None seconds


In [7]:
!pip install scrapy twisted


Collecting scrapy
  Downloading scrapy-2.13.0-py3-none-any.whl.metadata (5.4 kB)
Collecting twisted
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itemadapter>=0.1.0 (from scrapy)
  Downloading itemadapter-0.11.0-py3-none-any.whl.metadata (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting protego>=0.1.15 (from scrapy)
  Downloading Protego-0.4.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting pydispatcher>=2.0.5 (from scrapy)
  Downloading PyDispatcher-2.0.7-py3-none-any.whl.metadata (2.4 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_iden

In [8]:
%%writefile aljazeera_spider.py
import scrapy
import time

class DelayMiddleware:
    """Downloader middleware that pauses before every outgoing request.

    NOTE(review): time.sleep() blocks the calling thread; inside Scrapy this
    presumably stalls the whole event loop rather than only this request,
    and the DOWNLOAD_DELAY setting already throttles requests - confirm
    whether this class is still needed.
    """

    def __init__(self, delay=2):
        # Pause length in seconds applied by process_request().
        self.delay = delay

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's DOWNLOAD_DELAY setting (default 2)."""
        return cls(crawler.settings.getfloat('DOWNLOAD_DELAY', 2))

    def process_request(self, request, spider):
        """Sleep for ``self.delay`` seconds, then return None."""
        time.sleep(self.delay)
        return None

class AljazeeraSpider(scrapy.Spider):
    """Collect article titles and links from the Al Jazeera news listing page."""

    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = ["https://www.aljazeera.com/news/"]

    # The previous '__main__.DelayMiddleware' entry was removed: when this
    # file is launched with `scrapy runspider`, the file is not the __main__
    # module, so that import path cannot be resolved. DOWNLOAD_DELAY below
    # already spaces out requests without a custom middleware.
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        'DOWNLOAD_DELAY': 2,
        'USER_AGENT': 'MyCrawlerBot/1.0 (+https://yourdomain.com)'
    }

    def parse(self, response):
        """Yield ``{'title', 'link'}`` for every <article> with an ``h2 a`` link.

        Articles missing either the link text or the href are skipped; the
        href is resolved against the response URL.
        """
        for article in response.css('article'):
            title = article.css('h2 a::text').get()
            link = article.css('h2 a::attr(href)').get()
            if title and link:
                yield {
                    'title': title.strip(),
                    'link': response.urljoin(link)
                }


Writing aljazeera_spider.py


In [9]:
!scrapy runspider aljazeera_spider.py -o results.json


2025-05-19 19:48:30 [scrapy.utils.log] INFO: Scrapy 2.13.0 started (bot: scrapybot)
2025-05-19 19:48:30 [scrapy.utils.log] INFO: Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
2025-05-19 19:48:30 [scrapy.addons] INFO: Enabled addons:
[]
2025-05-19 19:48:30 [asyncio] DEBUG: Using selector: EpollSelector
2025-05-19 19:48:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2025-05-19 19:48:30 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop
2025-05-19 19:48:30 [scrapy.extensions.telnet] INFO: Telnet Password: 7fdc3013d7187961
2025-05-19 19:48:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.coresta

In [10]:
!scrapy runspider aljazeera_spider.py -o results.json


2025-05-19 19:48:33 [scrapy.utils.log] INFO: Scrapy 2.13.0 started (bot: scrapybot)
2025-05-19 19:48:33 [scrapy.utils.log] INFO: Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
2025-05-19 19:48:33 [scrapy.addons] INFO: Enabled addons:
[]
2025-05-19 19:48:33 [asyncio] DEBUG: Using selector: EpollSelector
2025-05-19 19:48:33 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2025-05-19 19:48:33 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop
2025-05-19 19:48:33 [scrapy.extensions.telnet] INFO: Telnet Password: bf54451ca9a85f1b
2025-05-19 19:48:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.coresta

In [11]:
# Crawl settings: a 2 second delay between requests, robots.txt obeyed, and
# an identifying User-Agent string.
custom_settings = dict(
    DOWNLOAD_DELAY=2,
    ROBOTSTXT_OBEY=True,
    USER_AGENT='MyCrawlerBot/1.0 (+https://yourdomain.com)',
)


In [12]:
import scrapy

class AljazeeraSpider(scrapy.Spider):
    """Collect article titles and links from the Al Jazeera RSS feed."""

    name = 'aljazeera'
    allowed_domains = ['aljazeera.com']
    start_urls = ['https://www.aljazeera.com/xml/rss/all.xml']

    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'MyCrawlerBot/1.0 (+https://yourdomain.com)'
    }

    def parse(self, response):
        """Yield ``{'title', 'link'}`` for every <item> of the feed.

        The start URL is an RSS document, whose entries are <item> elements
        with <title> and <link> children, so HTML selectors such as
        ``article h2 a`` would match nothing. Items missing either field
        are skipped.
        """
        for item in response.xpath('//item'):
            title = item.xpath('title/text()').get()
            link = item.xpath('link/text()').get()
            if title and link:
                yield {
                    'title': title.strip(),
                    'link': response.urljoin(link.strip())
                }


In [13]:
!scrapy runspider aljazeera_spider.py -o results.json



2025-05-19 19:48:58 [scrapy.utils.log] INFO: Scrapy 2.13.0 started (bot: scrapybot)
2025-05-19 19:48:58 [scrapy.utils.log] INFO: Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
2025-05-19 19:48:58 [scrapy.addons] INFO: Enabled addons:
[]
2025-05-19 19:48:58 [asyncio] DEBUG: Using selector: EpollSelector
2025-05-19 19:48:58 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2025-05-19 19:48:58 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop
2025-05-19 19:48:58 [scrapy.extensions.telnet] INFO: Telnet Password: 83233fdca0c69ba7
2025-05-19 19:48:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.coresta

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

In [15]:
import requests
import time
from bs4 import BeautifulSoup

URL_list = [
    "https://www.aljazeera.com/news/",
    "https://www.aljazeera.com/economy/",
    "https://www.aljazeera.com/sports/",
    "https://www.aljazeera.com/videos/",
    "https://www.aljazeera.com/tag/human-rights/",
]

# Parallel lists: page_titles[k] and section_names[k] describe URL_list[k].
page_titles = []
section_names = []

# Browser-like User-Agent sent with every request; built once, not per loop.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

for j, page_url in enumerate(URL_list):
    # Section name = last path segment; this relies on every URL ending
    # with "/" as the entries above do.
    section_name = page_url.split("/")[-2]
    section_names.append(section_name)

    try:
        # Pause between requests to stay polite to the server.
        time.sleep(2)
        # The timeout keeps one stalled page from hanging the whole loop.
        r = requests.get(page_url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')

        # soup.title is None for a page without <title>; the resulting
        # AttributeError is handled below like any other failure.
        title = soup.title.text.strip()
        page_titles.append(title)

        print(j, "-->", title)
        print('Done ')

    except Exception as e:
        # Keep both lists the same length even when a page fails.
        page_titles.append("N/A")
        print(j, "--> failed ")
        print("Error:", e)


0 --> News | Today's latest from Al Jazeera
Done 
1 --> Economy | Today's latest from Al Jazeera
Done 
2 --> Sport | Today's latest from Al Jazeera
Done 
3 --> Video | Today's latest from Al Jazeera
Done 
4 --> Human Rights | Today's latest from Al Jazeera
Done 


In [16]:
import time
import requests
from urllib.robotparser import RobotFileParser

# Alternative target kept from the original cell for trying out the
# "disallowed" branch (TODO confirm it is actually blocked for this agent):
#url = "https://www.aljazeera.com/search/article"
url = "https://www.aljazeera.com/news/"
user_agent = "MyBot/1.0"

robots_url = "https://www.aljazeera.com/robots.txt"
rp = RobotFileParser()
rp.set_url(robots_url)
rp.read()

if not rp.can_fetch(user_agent, url):
    print("Fetching disallowed by robots.txt")
else:
    # Use the site's Crawl-delay when it declares one, otherwise 2 seconds.
    requested_delay = rp.crawl_delay(user_agent)
    delay = 2 if requested_delay is None else requested_delay

    print(f"Crawling {url} with delay {delay}s")

    response = requests.get(url, headers={"User-Agent": user_agent})
    if response.status_code != 200:
        print(f"Failed to fetch page: {response.status_code}")
    else:
        print("Page fetched successfully")

    # Wait after the request, whether or not it succeeded.
    time.sleep(delay)


Crawling https://www.aljazeera.com/news/ with delay 2s
Page fetched successfully


In [18]:
!pip install selenium fake-useragent webdriver-manager
!apt-get update > /dev/null
!apt install chromium-chromedriver > /dev/null


Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting fake-useragent
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31

In [24]:
# ------------------ STEP 1: Install dependencies ------------------
!pip install selenium fake-useragent webdriver-manager > /dev/null
!apt-get update > /dev/null
!apt install chromium-chromedriver > /dev/null

# ------------------ STEP 2: Setup environment ------------------
import sys
import os
import time
import csv
import re
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent

# Make the Chromium / chromedriver locations discoverable. Each entry is
# appended only when missing, so re-running this cell does not keep growing
# PATH, and an unset PATH no longer raises KeyError.
_path_entries = os.environ.get('PATH', '').split(':')
for _extra_entry in ('/usr/lib/chromium-browser', '/usr/bin/chromedriver'):
    if _extra_entry not in _path_entries:
        _path_entries.append(_extra_entry)
os.environ['PATH'] = ':'.join(_path_entries)

# ------------------ STEP 3: Utility Functions ------------------
def get_user_agent():
    """Return a random User-Agent string, or a fixed fallback on failure.

    Returns:
        ``UserAgent().random`` when fake_useragent works, otherwise the
        hard-coded Windows User-Agent string below.
    """
    try:
        ua = UserAgent()
        return ua.random
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit still stop the cell.
        return "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

def get_selenium_options():
    """Build the Chrome options used for the headless Selenium session.

    Returns:
        An ``Options`` instance with the command-line flags below, a
        User-Agent taken from get_user_agent(), and the 'enable-automation'
        switch / automation extension turned off.
    """
    options = Options()
    for flag in (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-infobars',
        '--disable-extensions',
        # Chrome expects "width,height" here; the previous "1920x1080"
        # form is not a valid value for this switch.
        '--window-size=1920,1080',
        '--disable-blink-features=AutomationControlled',
        f'user-agent={get_user_agent()}',
    ):
        options.add_argument(flag)
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    return options

def remove_webdriver_flag(driver):
    """Register a script that makes ``navigator.webdriver`` read as undefined.

    The script is handed to the driver through the CDP command
    ``Page.addScriptToEvaluateOnNewDocument``.
    """
    hide_flag_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    cdp_params = {"source": hide_flag_js}
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", cdp_params)

def scroll_down(driver, scroll_pause=2, max_scrolls=5):
    """Scroll the page to its bottom ``max_scrolls`` times.

    Args:
        driver: Object exposing ``execute_script``.
        scroll_pause: Seconds to sleep after each scroll.
        max_scrolls: Number of scroll steps to perform.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    for step in range(1, max_scrolls + 1):
        print(f"🌀 Scrolling page ({step}/{max_scrolls})...")
        driver.execute_script(jump_to_bottom)
        # Pause after every scroll step before the next one.
        time.sleep(scroll_pause)

def click_load_more(driver, max_clicks=3, button_text="المزيد"):
    """Click the page's "load more" button up to ``max_clicks`` times.

    Args:
        driver: Selenium WebDriver showing the page.
        max_clicks: Maximum number of clicks to attempt.
        button_text: Text the button must contain. The default is the Arabic
            label used by the original code; NOTE(review): the English site
            presumably uses a different label - confirm and pass it here.
            The text must not contain a double quote, because it is placed
            inside a double-quoted XPath string.

    Clicking stops at the first failure: button not clickable within 5
    seconds, or an error while clicking.
    """
    locator = (By.XPATH, f'//button[contains(text(),"{button_text}")]')
    for i in range(max_clicks):
        try:
            load_more = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(locator)
            )
            print(f"🔘 Clicking 'Load More' button ({i+1}/{max_clicks})")
            driver.execute_script("arguments[0].scrollIntoView();", load_more)
            load_more.click()
            # Pause before looking for the button again.
            time.sleep(3)
        except Exception:
            # `except Exception` rather than a bare `except`, so that
            # KeyboardInterrupt still stops the cell.
            print("ℹ No more 'Load More' button or failed to click.")
            break

def extract_article_links(html):
    """Collect absolute URLs of site-relative links found inside <article> tags.

    Args:
        html: HTML source of a page.

    Returns:
        List of unique ``https://www.aljazeera.com/...`` URLs in the order
        they first appear in the document, so repeated runs over the same
        HTML give the same list.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_links = []
    for link in soup.select("article a"):
        href = link.get("href")
        # Keep site-relative paths only. "//host/path" is a protocol-relative
        # URL to another host and must not be prefixed with this site.
        if href and href.startswith("/") and not href.startswith("//"):
            article_links.append("https://www.aljazeera.com" + href)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(article_links))

def save_links_to_csv(links, page_title, filename):
    """Write the page title and the article links to a CSV file.

    File layout: a ``Page Title,<page_title>`` row, a ``URL`` header row,
    then one row per link. The file is overwritten. Any exception raised
    while writing is caught and reported with a printed message.
    """
    try:
        with open(filename, "w", newline="", encoding="utf-8") as csv_file:
            out = csv.writer(csv_file)
            out.writerow(["Page Title", page_title])
            out.writerow(["URL"])
            # One single-column row per link.
            out.writerows([link] for link in links)
        print(f"📥 Saved {len(links)} article links to '{filename}'")
    except Exception as e:
        print("⚠️ Error while saving file:", str(e))

def selenium_scrape_with_retry(url, max_retries=3,
                               save_path="/content/aljazeera_article_links_scraped.csv"):
    """Load ``url`` in headless Chrome and collect its article links, with retries.

    Each attempt starts a fresh browser, waits for the page title to contain
    "Al Jazeera", scrolls, clicks "load more", waits for an <article>
    element, extracts the links and saves them to ``save_path``.

    Args:
        url: Page to load.
        max_retries: Maximum number of attempts.
        save_path: CSV file that receives the links of a successful attempt.

    Returns:
        The list of article links from the first successful attempt, or an
        empty list when every attempt fails (including ``max_retries < 1``).
    """
    options = get_selenium_options()
    for attempt in range(1, max_retries + 1):
        print(f"🔁 Attempt {attempt} to load: {url}")
        driver = None
        try:
            # Browser start-up is inside the try block so that a failure to
            # launch Chrome is retried like any other error.
            driver = webdriver.Chrome(service=Service(), options=options)
            remove_webdriver_flag(driver)
            driver.get(url)
            WebDriverWait(driver, 10).until(EC.title_contains("Al Jazeera"))
            page_title = driver.title
            print("📄 Page title after load:", page_title)
            scroll_down(driver)
            click_load_more(driver)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "article"))
            )
            print("✅ Page fully loaded with JS content")
            content = driver.page_source
            article_links = extract_article_links(content)

            # Debug output
            print(f"🔗 Found {len(article_links)} article links.")
            for link in article_links:
                print(link)

            # Save links
            save_links_to_csv(article_links, page_title, filename=save_path)
            return article_links
        except Exception as e:
            print(f"❌ Error during attempt {attempt}: {e}")
        finally:
            # Single clean-up point: the browser is closed on success and on
            # failure, and only if it was actually started.
            if driver is not None:
                driver.quit()

        if attempt < max_retries:
            print("⏳ Retrying in 5 seconds...")
            time.sleep(5)

    print("🚫 Maximum retry attempts reached. Exiting.")
    return []

# ------------------ STEP 4: Requests Part ------------------
BASE_URL = "https://www.aljazeera.com"

# Shared HTTP session for the article requests. The same adapter is mounted
# for both schemes; its retry policy is total=3, backoff_factor=1, and it
# treats the listed 5xx status codes as retryable.
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
for _scheme in ("http://", "https://"):
    session.mount(_scheme, adapter)

def scrape_article_details(url, max_text_length=200, timeout=10):
    """Fetch one article page and extract its basic fields.

    Args:
        url: Article URL.
        max_text_length: Maximum number of characters of body text to keep;
            longer text is cut to this length and suffixed with "...".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Dict with keys ``url``, ``title``, ``date``, ``author`` and ``text``.
        Missing title/date/author are replaced by "No title" / "No date" /
        "No author".

    Raises:
        requests.HTTPError: When the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    date_tag = soup.find("time")
    date = date_tag.get_text(strip=True) if date_tag else "No date"

    # First <span> or <div> whose class list contains exactly "author".
    author_tag = soup.find(lambda tag: tag.name in ['span', 'div'] and 'author' in tag.get('class', []))
    author = author_tag.get_text(strip=True) if author_tag else "No author"

    paragraphs = soup.find_all("p")
    article_text = "\n".join([p.get_text(strip=True) for p in paragraphs])
    # Threshold and slice now use the same limit. Previously text longer
    # than 200 characters was cut to 50, while text of 51-200 characters
    # was kept whole.
    if len(article_text) > max_text_length:
        article_text = article_text[:max_text_length] + "..."

    return {
        "url": url,
        "title": title,
        "date": date,
        "author": author,
        "text": article_text
    }

# ------------------ STEP 5: Run the whole thing ------------------
def main():
    """Scrape the home page for article links, then fetch and save each article.

    Links come from selenium_scrape_with_retry(); each one is passed to
    scrape_article_details() with a one second pause after every success.
    At most 300 articles are collected, and the results are written to
    /content/aljazeera_detailed_articles.csv.
    """
    start_url = "https://www.aljazeera.com"
    print("Starting Selenium scraping of main page to get article links...")
    selenium_links = selenium_scrape_with_retry(start_url)

    print("Total article links found by Selenium:", len(selenium_links))

    MAX_ARTICLES = 300
    scraped_rows = []
    done_links = set()

    for candidate in selenium_links:
        if candidate in done_links:
            continue
        if len(scraped_rows) >= MAX_ARTICLES:
            break
        print(f"Scraping article details: {candidate}")
        try:
            scraped_rows.append(scrape_article_details(candidate))
            # Only successfully scraped links are marked as done.
            done_links.add(candidate)
            time.sleep(1)
        except Exception as e:
            print(f"Failed to scrape {candidate}: {e}")

    print(f"Total articles scraped: {len(scraped_rows)}")

    output_filename = "/content/aljazeera_detailed_articles.csv"
    try:
        with open(output_filename, "w", newline="", encoding="utf-8") as output_file:
            dict_writer = csv.DictWriter(
                output_file,
                fieldnames=["url", "title", "date", "author", "text"],
            )
            dict_writer.writeheader()
            for row in scraped_rows:
                dict_writer.writerow(row)
        print(f"Saved detailed articles to {output_filename}")
    except Exception as e:
        print(f"Error saving detailed articles file: {e}")

# Run it
main()



W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


Starting Selenium scraping of main page to get article links...
🔁 Attempt 1 to load: https://www.aljazeera.com
📄 Page title after load: Breaking News, World News and Video from Al Jazeera
🌀 Scrolling page (1/5)...
🌀 Scrolling page (2/5)...
🌀 Scrolling page (3/5)...
🌀 Scrolling page (4/5)...
🌀 Scrolling page (5/5)...
ℹ No more 'Load More' button or failed to click.
✅ Page fully loaded with JS content
🔗 Found 86 article links.
https://www.aljazeera.com/features/longform/2025/5/19/a-fathers-fight-to-find-out-what-happened-to-his-son-who-joined-isis
https://www.aljazeera.com/news/2025/5/18/pope-leo-xiv-meets-ukraines-zelenskyy-after-his-inaugural-mass
https://www.aljazeera.com/gallery/2025/5/19/dr-congos-coltan-miners-struggle-as-they-dig-to-feed-worlds-tech
https://www.aljazeera.com/news/liveblog/2025

In [25]:
from google.colab import files
# Hand both generated CSV files to files.download(), links file first.
for csv_path in (
    '/content/aljazeera_article_links_scraped.csv',
    '/content/aljazeera_detailed_articles.csv',
):
    files.download(csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>