In [1]:
import json
import time
from typing import Any, Dict, List, Tuple

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:

def extract_all_tbwriters_article_links(
    url: str,
    timeout: Any = (5, 55),
) -> Tuple[Dict[str, Any], bool]:
    """
    Extracts all article links from a given tbwriters webpage.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. Links
    are taken from the first ``<a>`` inside each ``div.post-header`` found in
    every ``div.post-container`` of the main wrapper.

    Args:
    url (str): The URL of the tbwriters webpage containing article links.
    timeout (Any): Forwarded unchanged to ``requests.get(timeout=...)``.
        Defaults to ``(5, 55)`` (connect, read) so a stalled server cannot
        block the call forever; pass ``None`` to wait indefinitely.

    Returns:
    Tuple[Dict[str, Any], bool]: ``(final_response, load_more)``.
        ``final_response`` has the keys ``"Links"`` (list of hrefs),
        ``"Message"``, ``"Response"`` (HTTP-style status code) and
        ``"source_url"``. ``load_more`` is True when an "older posts" anchor
        was found on the page. On every handled error ``load_more`` is
        returned as True and ``"Links"`` stays empty, so callers should check
        ``"Response"`` as well as the flag.

    Raises:
    Nothing is raised deliberately; fetch and parse failures are reported
    through ``"Message"`` / ``"Response"`` in the returned dictionary.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }
    load_more = False

    try:
        start_time = time.time()
        # An explicit timeout is required for the requests.Timeout handler
        # below to ever fire; without one requests waits indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        if time.time() - start_time > 50:
            print(f"This URL took more than 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')
        article_div = soup.find("div", class_="wrapper section medium-padding")
        if not article_div:
            raise ValueError("Could not find the main article container on the page.")

        all_articles = article_div.find_all("div", class_="post-container")
        if not all_articles:
            raise ValueError("Could not find the each article container on the page.")

        article_links = []
        for article in all_articles:
            header = article.find("div", class_="post-header")
            # A header without an anchor (or an anchor without href) is
            # skipped instead of aborting the whole page.
            anchor = header.find("a") if header is not None else None
            href = anchor.get("href") if anchor is not None else None
            if href:
                article_links.append(href)

        final_response["Links"] = article_links

        # NOTE(review): this matches a <section> whose *role* attribute equals
        # "archive-nav section-inner"; those look like CSS class names, so the
        # selector may never match — confirm against the live markup.
        load_more_span = soup.find("section", role="archive-nav section-inner")
        if load_more_span:
            older_link = load_more_span.find("a", class_="post-nav-older fleft")
            load_more = older_link is not None

        return final_response, load_more

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408
        return final_response, True
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None for connection-level failures; fall back to 500.
        final_response["Response"] = getattr(getattr(e, "response", None), 'status_code', 500)
        return final_response, True
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        # ValueError carries no HTTP response, so report a generic 500.
        final_response["Response"] = 500
        return final_response, True
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response, True






In [3]:
# Run the scraper once against the `?cat=4` listing page. The call is the
# cell's last expression, so its return value (a tuple whose first element is
# the result dictionary) is rendered as the cell output.
url = "https://www.tbwriters.com/?cat=4"
extract_all_tbwriters_article_links(url)

({'Links': ['https://www.tbwriters.com/?p=4671',
   'https://www.tbwriters.com/?p=4667',
   'https://www.tbwriters.com/?p=4623',
   'https://www.tbwriters.com/?p=4594',
   'https://www.tbwriters.com/?p=4514',
   'https://www.tbwriters.com/?p=4489',
   'https://www.tbwriters.com/?p=4481',
   'https://www.tbwriters.com/?p=4478',
   'https://www.tbwriters.com/?p=4473',
   'https://www.tbwriters.com/?p=4467',
   'https://www.tbwriters.com/?p=4464',
   'https://www.tbwriters.com/?p=4437',
   'https://www.tbwriters.com/?p=4376',
   'https://www.tbwriters.com/?p=4342',
   'https://www.tbwriters.com/?p=4226',
   'https://www.tbwriters.com/?p=4004',
   'https://www.tbwriters.com/?p=3800',
   'https://www.tbwriters.com/?p=3763',
   'https://www.tbwriters.com/?p=3522',
   'https://www.tbwriters.com/?p=3492',
   'https://www.tbwriters.com/?p=3466',
   'https://www.tbwriters.com/?p=3408',
   'https://www.tbwriters.com/?p=3382',
   'https://www.tbwriters.com/?p=2406',
   'https://www.tbwriters.com/?