In [1]:
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, List
import time
from tqdm import tqdm 
import json

In [5]:

def extract_all_tbwriters_article_links(url: str) -> Dict[str, Any]:
    """
    Extracts all article links from a given tbwriters webpage.

    Args:
    url (str): The URL of the tbwriters webpage containing article links.

    Returns:
    Dict[str, Any]: A dictionary containing article links and status details.
    """
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }
    load_more = False
    
    try:
        start_time = time.time()
        # response = requests.get(url, headers=headers, timeout=(5, 60-5))
        response = requests.get(url, headers=headers,)
        response.raise_for_status()
        end_time = time.time()
        if end_time - start_time > 50:
            print(f"This URL took more than 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')
        article_div = soup.find("div", class_="wrapper section medium-padding")
        if not article_div:
            raise ValueError("Could not find the main article container on the page.")
        
        all_articles = article_div.find_all("div", class_="post-container")
        if not all_articles:
            raise ValueError("Could not find the each article container on the page.")
        article_links = []
        for article in all_articles:
            # 
            title_Link = article.find("div", class_="post-header")
            if title_Link:
                links = title_Link.find("a")
                if links.get("href"):
                    article_links.append(links.get("href"))
        
        final_response["Links"] = article_links

        load_more_span = soup.find("section", role="archive-nav section-inner") # archive-nav section-inner | a = 
        if load_more_span:
            load_more = load_more_span.find("a", class_="post-nav-older fleft")
            if load_more:
                load_more = True
            else:
                load_more = False

        return final_response, load_more
    
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408
        return final_response, True
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response, True
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response, True
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response, True






In [6]:
url = "https://www.tbwriters.com/?cat=4"
extract_all_tbwriters_article_links(url)

KeyboardInterrupt: 

In [None]:
import requests
from requests.exceptions import RequestException, Timeout, TooManyRedirects, SSLError

url = "https://www.tbwriters.com/?cat=4"  # Replace with your target URL if different

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9,en-IN;q=0.8',
    'cache-control': 'max-age=0',
    'referer': 'https://74.208.45.221:48189/',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}

try:
    response = requests.get(url, headers=headers, timeout=10, allow_redirects=True, verify=True)
    response.raise_for_status()  # Raises an HTTPError for bad responses
    print(f"Status Code: {response.status_code}")
    print(f"Final URL after redirects: {response.url}")
    print(f"Response Headers: {response.headers}")
    # print(response.text[:500])  # Print first 500 characters of the response
except Timeout:
    print("The request timed out. The server is taking too long to respond.")
except TooManyRedirects:
    print("Too many redirects. The request exceeded the configured number of maximum redirections.")
except SSLError as e:
    print(f"SSL Error occurred: {e}")
except RequestException as e:
    print(f"An error occurred: {e}")

In [9]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Set up the WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    # Navigate to the website
    url = "https://www.tbwriters.com/?cat=4"
    driver.get(url)

    # Wait for the page to load (adjust the timeout as needed)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # Print the page title to confirm we've loaded the page
    print(f"Page title: {driver.title}")

    # Example: Find all article titles
    article_titles = driver.find_elements(By.CSS_SELECTOR, "h2.entry-title a")
    for title in article_titles:
        print(f"Article title: {title.text}")

    # Example: Find and print the date and author of the first article
    first_article = driver.find_element(By.CSS_SELECTOR, "article")
    date = first_article.find_element(By.CSS_SELECTOR, ".posted-on time").text
    author = first_article.find_element(By.CSS_SELECTOR, ".byline .author a").text
    print(f"First article date: {date}")
    print(f"First article author: {author}")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the browser
    driver.quit()

WebDriverException: Message: Service /root/.wdm/drivers/chromedriver/linux64/114.0.5735.90/chromedriver unexpectedly exited. Status code was: 127


In [11]:
!pip install requests-html

[0mCollecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting pyquery (from requests-html)
  Downloading pyquery-2.0.1-py3-none-any.whl.metadata (9.0 kB)
Collecting fake-useragent (from requests-html)
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Collecting parse (from requests-html)
  Downloading parse-1.20.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting bs4 (from requests-html)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting w3lib (from requests-html)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting pyppeteer>=0.0.14 (from requests-html)
  Downloading pyppeteer-2.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting appdirs<2.0.0,>=1.4.3 (from pyppeteer>=0.0.14->requests-html)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting importlib-metadata>=1.4 (from pyppeteer>=0.0.14->requests-html)
  Downloading importlib_metadata-8.5.0-py3-none-a

In [12]:
from requests_html import HTMLSession
import time

# Create an HTML Session
session = HTMLSession()

# URL to scrape
url = "https://www.tbwriters.com/?cat=4"

try:
    # Send a GET request to the URL
    response = session.get(url)

    # Render the page (this will execute JavaScript)
    response.html.render(timeout=20)

    # Print the page title
    print(f"Page title: {response.html.find('title', first=True).text}")

    # Find all article titles
    article_titles = response.html.find('h2.entry-title a')
    for title in article_titles:
        print(f"Article title: {title.text}")

    # Find and print the date and author of the first article
    first_article = response.html.find('article', first=True)
    if first_article:
        date = first_article.find('.posted-on time', first=True)
        author = first_article.find('.byline .author a', first=True)
        if date:
            print(f"First article date: {date.text}")
        if author:
            print(f"First article author: {author.text}")
    else:
        print("No articles found")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the session
    session.close()

ImportError: lxml.html.clean module is now a separate project lxml_html_clean.
Install lxml[html_clean] or lxml_html_clean directly.