In [12]:
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, List
import time

def extract_all_article_links(url: str) -> Dict[str, Any]:
    """
    Extracts all article links from a given RFA (Radio Free Asia) webpage.

    The function never raises for fetch or parse problems; every failure is
    reported through the "Message" and "Response" entries of the result.

    Args:
    url (str): The URL of the RFA webpage containing article links.

    Returns:
    Dict[str, Any]: A dictionary with three keys:
        "Links"    - list of href values found (empty on failure),
        "Message"  - "Success" or a description of the failure,
        "Response" - 200 on success, 408 on timeout, the HTTP status of the
                     failed response when one exists, otherwise 500.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200
    }

    try:
        # A monotonic clock is used so that wall-clock adjustments cannot
        # distort the measured duration.
        start_time = time.monotonic()
        # timeout is (connect, read) in seconds.
        response = requests.get(url, headers=headers, timeout=(5, 55))
        response.raise_for_status()
        elapsed = time.monotonic() - start_time
        if elapsed > 50:
            print(f"This URL took more than 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')
        all_articles = soup.find_all("div", class_="teaserimg")
        if not all_articles:
            raise ValueError("Could not find the main article container on the page.")

        # NOTE(review): both the container <div> and the inner <a> are matched
        # on the class "teaserimg" - confirm against the live page markup.
        article_links = []
        for article in all_articles:
            links = article.find_all("a", class_="teaserimg")
            article_links.extend([link.get("href") for link in links if link.get("href")])

        final_response["Links"] = article_links
        return final_response

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None for connection-level failures; getattr then
        # falls back to 500.
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except ValueError as e:
        # A plain ValueError carries no ``response`` attribute; reading it here
        # would raise AttributeError from inside the handler and escape the
        # function, so the status is set directly.
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 500
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response


def scrape_rfa_article(url: str) -> Dict[str, Any]:
    """
    Downloads one RFA (Radio Free Asia) article page and pulls out its parts.

    Args:
    url (str): Address of the article page to download.

    Returns:
    Dict[str, Any]: A dictionary with three keys: "data" (the title, a body
    holding "Audio" and "Text", and meta_data holding "URL", "Author", "Date"
    and "Tags"), "Message" and "Response". On a timeout "Response" is 408; on
    any other requests error it is the HTTP status of the failed response, or
    500 when there is none.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Build the result from its pieces; the nested dicts are filled in below.
    content = {"Audio": "", "Text": []}
    meta = {'URL': url, 'Author': "", 'Date': "", 'Tags': []}
    article = {'title': "", 'body': content, 'meta_data': meta}
    result = {"data": article, "Message": "Success", "Response": 200}

    try:
        page = requests.get(url, headers=request_headers, timeout=120)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        # Headline
        heading = document.find('h2', class_='no_media')
        if heading:
            article['title'] = heading.text.strip()
        else:
            article['title'] = "Title not found"

        # Author and publication date share one container
        byline_block = document.find('div', class_='sectionteaser archive')
        if byline_block:
            author_link = byline_block.find('a', class_='author')
            if author_link:
                meta["Author"] = author_link.get_text()
            else:
                meta["Author"] = "Author is not listed on RFA"

            date_span = byline_block.find('span', class_='story_date')
            if date_span:
                meta["Date"] = date_span.get_text()
            else:
                meta["Date"] = "Date not found"

        # Tags
        tag_list = document.find('ul', class_='tags')
        if tag_list:
            meta["Tags"] = [anchor.text for anchor in tag_list.select('li a')]

        # Story text and optional audio source
        story = document.find('div', class_='storytext')
        if story:
            content["Text"] = [p.get_text(strip=True) for p in story.find_all('p')]

            player = story.find('audiojs')
            if player:
                content["Audio"] = player.get('src', "No audio source found")

        return result

    except requests.Timeout:
        result["Message"] = "Request timed out"
        result["Response"] = 408
        return result

    except requests.RequestException as err:
        result["Message"] = f"An error occurred while fetching the article: {err}"
        result["Response"] = getattr(err.response, 'status_code', 500)
        return result


def fetch_all_articles(base_url: str, total_pages: int) -> List[str]:
    """
    Walks the paginated RFA archive and gathers every article link found.

    Each page is requested by appending "&start=<offset>" to base_url, with
    the offset advancing by 13 per page. Pages that do not come back with a
    200 status are reported on stdout and skipped.

    Args:
    base_url (str): The archive URL to which the "&start=" offset is appended.
    total_pages (int): How many archive pages to request.

    Returns:
    List[str]: The links collected from all successfully fetched pages.
    """
    collected = []
    offsets = range(0, total_pages * 13, 13)
    for page_number, offset in enumerate(offsets, start=1):
        outcome = extract_all_article_links(f"{base_url}&start={offset}")
        if outcome["Response"] != 200:
            print(f"Failed to fetch links from page {page_number}: {outcome['Message']}")
            continue
        collected.extend(outcome["Links"])
    return collected


# Usage: crawl the RFA Tibetan story archive, then scrape every article found.
# The archive URL already carries a query string (the "uids" parameter, a list
# of ids separated by "%40"); fetch_all_articles appends "&start=<offset>" to it.
base_archive_url = "https://www.rfa.org/tibetan/story_archive?uids=e2b0f97935844f15a765322387ba2381%40c80eb7880fab441bb93e888d5fb0b8d7%40a34e3f118cfb47c89efc8d10ca29beb4%40ed8b21d453ee4bdbb56632bdf00ba533%404f49bd08f0c44e9384542e0d5f9c37f8%402f794fee5a644fdb9964a3accdff9162%403defd73dd1994e0db35cb48f998d6958%401b63853142714c3489c3e1c3fa848623%406153403d00e24f4a8b178048cc9a1820%4069e4cbeeacf1491c95f326788021710e%401fe2a601fca941b7a3f781e0cd25954b%40bc0393320a3445688ec79b1187a883bf%4042fa45d86fca4d11befaecf0a942246d%406d988d19310843b18445bfb0f461ba8f%404a2437b9cd884233ad46554a0303ef89%407a0ed25eec54437f976e8f99fed6ee84%4009849b68da104e188c5ae11d7112827b"
# Number of archive pages to request; one HTTP request is made per page.
total_pages = 3888

# Fetch all article links (pages that fail are printed and skipped)
all_article_links = fetch_all_articles(base_archive_url, total_pages)

# Scrape each article and print the resulting dictionary
for article_url in all_article_links:
    article_data = scrape_rfa_article(article_url)
    print(article_data)


Failed to fetch links from page 1: An error occurred while fetching the webpage: 403 Client Error: Forbidden for url: https://www.rfa.org/tibetan/story_archive?uids=e2b0f97935844f15a765322387ba2381%40c80eb7880fab441bb93e888d5fb0b8d7%40a34e3f118cfb47c89efc8d10ca29beb4%40ed8b21d453ee4bdbb56632bdf00ba533%404f49bd08f0c44e9384542e0d5f9c37f8%402f794fee5a644fdb9964a3accdff9162%403defd73dd1994e0db35cb48f998d6958%401b63853142714c3489c3e1c3fa848623%406153403d00e24f4a8b178048cc9a1820%4069e4cbeeacf1491c95f326788021710e%401fe2a601fca941b7a3f781e0cd25954b%40bc0393320a3445688ec79b1187a883bf%4042fa45d86fca4d11befaecf0a942246d%406d988d19310843b18445bfb0f461ba8f%404a2437b9cd884233ad46554a0303ef89%407a0ed25eec54437f976e8f99fed6ee84%4009849b68da104e188c5ae11d7112827b&start=0
Failed to fetch links from page 2: An error occurred while fetching the webpage: 403 Client Error: Forbidden for url: https://www.rfa.org/tibetan/story_archive?uids=e2b0f97935844f15a765322387ba2381%40c80eb7880fab441bb93e888d5fb0b8d7%4

KeyboardInterrupt: 