In [1]:
import cloudscraper
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

# Listing page that scrape_dawn_business_homepage() fetches to discover article links.
BASE_URL = "https://www.dawn.com/business/business-finance"

def get_scraper():
    """Create a cloudscraper session configured with a Chrome / Windows / non-mobile browser profile.

    Returns:
        Whatever ``cloudscraper.create_scraper`` returns for that profile.
    """
    browser_profile = dict(browser="chrome", platform="windows", mobile=False)
    return cloudscraper.create_scraper(browser=browser_profile)

def scrape_dawn_business_homepage(scraper):
    """Fetch the listing page at ``BASE_URL`` and collect its story cards.

    Args:
        scraper: Session-like object exposing ``get(url, timeout=...)`` whose
            response has ``status_code`` and ``text`` attributes.

    Returns:
        list[dict]: One dict per usable ``article.story`` card, with keys
        ``id`` (the card's ``data-id`` attribute), ``title`` (stripped link
        text), ``url`` (link resolved against ``https://www.dawn.com``) and
        ``published_at`` (raw ``title`` attribute of the card's
        ``.timestamp--time`` element, or ``None`` when there is none).
        Cards without a title link, or whose link has no ``href``, are skipped.

    Raises:
        RuntimeError: If the listing page responds with a non-200 status.
    """
    print(f"Scraping homepage: {BASE_URL}")
    response = scraper.get(BASE_URL, timeout=20)

    if response.status_code != 200:
        raise RuntimeError(f"HTTP {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")

    articles = []

    for article in soup.select("article.story"):
        title_tag = article.select_one("h2.story__title a")
        if title_tag is None:
            continue

        # An anchor without an href cannot be followed. Skip the card instead
        # of letting a KeyError abort the whole listing scrape.
        href = title_tag.get("href")
        if not href:
            continue

        time_el = article.select_one(".timestamp--time")

        articles.append({
            "id": article.get("data-id"),
            "title": title_tag.get_text(strip=True),
            "url": urljoin("https://www.dawn.com", href),
            "published_at": time_el.get("title") if time_el else None
        })

    print(f"Found {len(articles)} articles.")
    return articles

In [2]:
def _parse_timestamp(raw, strptime_format, allow_iso=False):
    """Parse a scraped timestamp string into a ``datetime``, or return ``None``.

    ``raw`` is whitespace-normalised and tried against ``strptime_format``.
    When that fails and ``allow_iso`` is true, ``datetime.fromisoformat`` is
    tried as a fallback. Non-string input and unparseable text yield ``None``.
    """
    if not isinstance(raw, str):
        return None

    # Collapse newlines / runs of spaces so multi-line attribute values parse.
    text = " ".join(raw.split())

    try:
        return datetime.strptime(text, strptime_format)
    except ValueError:
        pass

    if allow_iso:
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            pass

    return None


def scrape_dawn_article(scraper, url):
    """Fetch one article page and extract its author, dates and body text.

    Args:
        scraper: Session-like object exposing ``get(url, timeout=...)`` whose
            response has ``status_code`` and ``text`` attributes.
        url: URL of the article page to fetch.

    Returns:
        dict | None: On success, a dict with keys ``author`` (str or ``None``),
        ``published_date`` and ``updated_at`` (``datetime`` or ``None`` when the
        element is missing or its value cannot be parsed) and ``content`` (the
        non-empty body paragraphs joined by blank lines). ``None`` is returned,
        after printing a message, on a non-200 response, when no
        ``article.story[data-id]`` container is found, or on any exception.

    Note:
        ``updated_at`` parsed through the ISO-8601 fallback keeps whatever UTC
        offset the string carries, so it may be timezone-aware while
        ``published_date`` is always naive.
    """
    print(f"Scraping article: {url}")
    try:
        response = scraper.get(url, timeout=20)

        if response.status_code != 200:
            print(f"⚠️ Failed {url} (HTTP {response.status_code})")
            return None

        soup = BeautifulSoup(response.text, "html.parser")

        article = soup.select_one("article.story[data-id]")
        if not article:
            print(f"⚠️ No article container {url}")
            return None

        # ---------- AUTHOR ----------
        author_el = article.select_one(".story__byline__link")
        author = author_el.get_text(strip=True) if author_el else None

        # ---------- PUBLISHED DATE ----------
        pub_el = article.select_one(".timestamp--published .timestamp--date")
        published_date = None
        if pub_el:
            pub_text = pub_el.get_text(strip=True)
            published_date = _parse_timestamp(pub_text, "%B %d, %Y")
            if published_date is None:
                # Surface format drift instead of silently dropping the value.
                print(f"⚠️ Unparsed published date {pub_text!r} in {url}")

        # ---------- UPDATED TIME ----------
        upd_el = article.select_one(".timestamp--updated .timestamp--time")
        updated_at = None
        if upd_el and upd_el.has_attr("title"):
            upd_text = upd_el["title"]
            # The original "Month D, YYYY H:MMam" format is tried first.
            # NOTE(review): the title attribute may instead hold an ISO-8601
            # value on live pages -- assumption, confirm against real markup.
            updated_at = _parse_timestamp(
                upd_text, "%B %d, %Y %I:%M%p", allow_iso=True
            )
            if updated_at is None:
                print(f"⚠️ Unparsed updated time {upd_text!r} in {url}")

        # ---------- CONTENT ----------
        paragraphs = []
        for p in article.select("div.story__content > p"):
            text = p.get_text(" ", strip=True)

            # Drop empty paragraphs and the "Follow Dawn ..." promo line.
            if not text or text.lower().startswith("follow dawn"):
                continue

            paragraphs.append(text)

        content = "\n\n".join(paragraphs)

        return {
            "author": author,
            "published_date": published_date,
            "updated_at": updated_at,
            "content": content,
        }
    except Exception as e:
        # Deliberately broad: one failing article must not stop a batch run.
        print(f"Error scraping {url}: {e}")
        return None

In [4]:

# Pause between consecutive article requests, in seconds, to avoid rate limiting.
REQUEST_DELAY_SECONDS = 1.5

# 1. Initialize Scraper
scraper = get_scraper()

# 2. Get Articles from Homepage
homepage_articles = scrape_dawn_business_homepage(scraper)

# 3. Get Body for each article
full_articles = []
last_index = len(homepage_articles) - 1

for index, article in enumerate(homepage_articles):
    # Scrape details; None means the article page could not be processed.
    details = scrape_dawn_article(scraper, article["url"])

    if details:
        # Listing fields first, so article-page fields win on any key clash.
        full_article = {**article, **details}
        full_articles.append(full_article)

    # Throttle between requests; there is nothing to wait for after the last one.
    if index < last_index:
        time.sleep(REQUEST_DELAY_SECONDS)

# 4. Result
print(f"\nSuccessfully scraped {len(full_articles)} articles with details.")

# Display sample
if full_articles:
    print("-" * 40)
    print("Sample Article Data:")
    sample = full_articles[0]
    print(f"Title: {sample.get('title')}")
    print(f"Date: {sample.get('published_at')}")
    print(f"Author: {sample.get('author')}")
    print(f"Body Preview: {sample.get('content', '')[:200]}...")
    print("-" * 40)
    

Scraping homepage: https://www.dawn.com/business/business-finance
Found 59 articles.
Scraping article: https://www.dawn.com/news/1967697/pakistans-exchange-rate-illusion
Scraping article: https://www.dawn.com/news/1967696/the-us-dollar-in-a-turbulent-world
Scraping article: https://www.dawn.com/news/1967695/investing-in-silver
Scraping article: https://www.dawn.com/news/1967694/finance-a-hard-time-for-employment
Scraping article: https://www.dawn.com/news/1967693/agriculture-a-dysfunctional-vegetable-handling-system
Scraping article: https://www.dawn.com/news/1967692/boosting-pakistans-agriculture
Scraping article: https://www.dawn.com/news/1967691/kse-100-share-index-fluctuations
Scraping article: https://www.dawn.com/news/1967690/weekly-rupee-dollar-parity
Scraping article: https://www.dawn.com/news/1967666/the-ups-and-downs-of-tokenisation
Scraping article: https://www.dawn.com/news/1967665/company-news
Scraping article: https://www.dawn.com/news/1967664/struggle-for-self-reliance
S