In [15]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import os
import json
import pandas as pd
import random

In [16]:
# Directory holding a persistent Chrome profile; setup_driver() hands it to
# Chrome via --user-data-dir (presumably so cookies/session state survive
# between runs -- TODO confirm).
SESSION_DIR = os.path.join(os.getcwd(), "chrome_session")
# Create the profile directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(SESSION_DIR, exist_ok=True)

In [17]:
# JSON file read by load_checkpoint() and written by save_checkpoint().
CHECKPOINT_FILE = "scraper_checkpoint_full.json"
# CSV read by load_existing_data() and (re)written by save_data().
CSV_FILE = "businessday_full_articles.csv"
# Last listing page (inclusive) that main() iterates to.
# NOTE(review): 819 presumably matched the tag archive's page count when this
# was written -- confirm before a fresh run.
MAX_PAGES = 819

In [18]:
def setup_driver():
    """Launch an undetected-chromedriver Chrome instance and return it.

    The browser is started with the profile directory SESSION_DIR, a fixed
    1920x1080 window, a hard-coded desktop user-agent string and a 30 second
    page-load timeout.

    Returns:
        The driver object created by ``uc.Chrome``.
    """
    opts = uc.ChromeOptions()
    chrome_flags = (
        f"--user-data-dir={SESSION_DIR}",
        "--disable-blink-features=AutomationControlled",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-infobars",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    )
    # Flags are applied in the order listed above.
    for flag in chrome_flags:
        opts.add_argument(flag)

    browser = uc.Chrome(options=opts)
    browser.set_page_load_timeout(30)
    return browser

In [19]:
def random_sleep(min_sec=2, max_sec=5):
    """Pause for a random duration drawn uniformly between the two bounds.

    Args:
        min_sec: One bound of the delay, in seconds.
        max_sec: The other bound of the delay, in seconds.
    """
    delay = random.uniform(min_sec, max_sec)
    time.sleep(delay)

def load_checkpoint():
    """Load the resume checkpoint from CHECKPOINT_FILE.

    Returns:
        dict: The stored checkpoint, always containing at least the keys
        ``"page"`` and ``"visited_urls"``. When the file is missing,
        unreadable as JSON (e.g. truncated by an interrupted write), or does
        not hold a JSON object, the default ``{"page": 1, "visited_urls": []}``
        is returned instead of raising.
    """
    default = {"page": 1, "visited_urls": []}
    if not os.path.exists(CHECKPOINT_FILE):
        return default

    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            data = json.load(f)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        # A damaged checkpoint should restart the crawl, not abort it.
        print(f"Checkpoint file {CHECKPOINT_FILE} is unreadable ({e}); starting from page 1.")
        return default

    if not isinstance(data, dict):
        print(f"Checkpoint file {CHECKPOINT_FILE} has an unexpected format; starting from page 1.")
        return default

    # Fill in any key an older or partial checkpoint lacks, so callers can
    # index "page" and "visited_urls" unconditionally.
    return {**default, **data}

def save_checkpoint(page, visited_urls):
    """Persist the crawl position to CHECKPOINT_FILE.

    The JSON is written to a sibling temporary file and then moved over the
    real checkpoint with ``os.replace``, so an interruption mid-write cannot
    leave a truncated checkpoint behind.

    Args:
        page: Listing page number to record.
        visited_urls: Iterable of article URLs already scraped. Any iterable
            (list, set, ...) is accepted; it is stored as a JSON array.
    """
    urls = list(visited_urls)  # sets are not JSON-serialisable
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump({"page": page, "visited_urls": urls}, f)
        # The previous checkpoint stays intact until the new one is complete.
        os.replace(tmp_path, CHECKPOINT_FILE)
    finally:
        # Only present here if the write or the replace failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Checkpoint saved: Page {page} with {len(urls)} articles visited.")


def load_existing_data():
    """Load previously scraped articles from CSV_FILE.

    Returns:
        tuple: ``(rows, urls)`` where ``rows`` is a list of
        ``[Title, Author, Date, Content, URL]`` lists and ``urls`` is the set
        of values in the URL column. ``([], set())`` is returned when the file
        does not exist or is completely empty.
    """
    columns = ["Title", "Author", "Date", "Content", "URL"]
    if not os.path.exists(CSV_FILE):
        return [], set()

    try:
        try:
            df = pd.read_csv(CSV_FILE, encoding='utf-8-sig', usecols=columns)
        except UnicodeDecodeError:
            # Fall back for files that are not valid UTF-8.
            df = pd.read_csv(CSV_FILE, encoding='latin1', usecols=columns)
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. left by an interrupted write) has nothing to resume from.
        print(f"{CSV_FILE} is empty; starting with no existing articles.")
        return [], set()

    # usecols only selects columns; the frame keeps the file's column order.
    # Reorder explicitly so every row matches the layout the rest of the
    # scraper appends and saves.
    df = df[columns]
    return df.values.tolist(), set(df["URL"].tolist())


def scrape_listing_page(driver):
    """Collect article links from the listing page currently loaded in ``driver``.

    Waits up to 15 seconds for a ``div.news`` element, then gathers the
    ``href`` of every ``div.post-info h2.post-title a`` anchor.

    Args:
        driver: WebDriver instance already navigated to a listing page.

    Returns:
        list: Non-empty href values in document order. An empty list is
        returned (and the error printed) if anything goes wrong, including
        the wait timing out.
    """
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.news"))
        )
        anchors = driver.find_elements(By.CSS_SELECTOR, "div.post-info h2.post-title a")
        # Each get_attribute() call is a round trip to the browser, so read
        # the href once per anchor instead of once to test and once to keep.
        hrefs = (anchor.get_attribute("href") for anchor in anchors)
        return [href for href in hrefs if href]
    except Exception as e:
        print(f"Error scraping listing page: {e}")
        return []


In [20]:
def scrape_article(driver, url):
    """Open one article page and extract its fields.

    Navigates ``driver`` to ``url``, waits up to 15 seconds for an
    ``article`` element, then reads the title, author, date and body text.
    Newlines in the body are replaced with spaces.

    Args:
        driver: WebDriver instance used for navigation and lookups.
        url: Address of the article to scrape.

    Returns:
        list | None: ``[title, author, date, content, url]`` on success, or
        ``None`` (after printing the error) if any step raises.
    """
    def text_of(selector):
        # Text of the first element matching the CSS selector.
        return driver.find_element(By.CSS_SELECTOR, selector).text

    try:
        driver.get(url)
        waiter = WebDriverWait(driver, 15)
        waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, "article")))

        headline = text_of("h1.post-title")
        byline = text_of("p.author-name")
        published = text_of("p.post-date")
        body_text = text_of("article div.post-content")

        flattened = body_text.strip().replace('\n', ' ')
        return [headline, byline, published, flattened, url]
    except Exception as err:
        print(f"Error scraping article at {url}: {err}")
        return None
    

def save_data(data):
    """Write all scraped rows to CSV_FILE, replacing its previous contents.

    The CSV is written to a sibling temporary file and then moved over the
    real file with ``os.replace``, so an interruption mid-write cannot
    truncate the articles already on disk.

    Args:
        data: List of ``[Title, Author, Date, Content, URL]`` rows.
    """
    df = pd.DataFrame(data, columns=['Title', 'Author', 'Date', 'Content', 'URL'])
    tmp_path = f"{CSV_FILE}.tmp"
    try:
        df.to_csv(tmp_path, index=False)
        # The existing CSV stays intact until the new one is fully written.
        os.replace(tmp_path, CSV_FILE)
    finally:
        # Only present here if the write or the replace failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Saved {len(data)} articles to {CSV_FILE}")

In [None]:
def main():
    """Crawl the BusinessDay ``bdlead`` tag listing and scrape every article.

    Resumes from the page stored in the checkpoint and skips any article
    whose URL is already present in the CSV. After each listing page the
    full dataset and the checkpoint are written to disk; the same is done
    when the run is interrupted or fails. The browser is always closed on
    the way out.
    """
    print("Starting full article scraper for BusinessDay...")

    # Load persisted state BEFORE launching Chrome: if the checkpoint or CSV
    # cannot be read, there is no browser process left running.
    checkpoint = load_checkpoint()
    scraped_data, visited_urls = load_existing_data()
    visited_urls = set(visited_urls)

    start_page = checkpoint['page']
    # Bound before the try block so the except handlers can always refer to
    # it, even if the failure happens before the first loop iteration.
    page = start_page

    driver = setup_driver()
    try:
        for page in range(start_page, MAX_PAGES + 1):
            url = f"https://businessday.ng/tag/bdlead/page/{page}/?amp"
            print(f"\nScraping listing page {page}: {url}")
            driver.get(url)
            random_sleep(3, 6)

            article_links = scrape_listing_page(driver)
            print(f"Found {len(article_links)} article links")

            for article_url in article_links:
                if article_url in visited_urls:
                    print(f"Skipping already visited article: {article_url}")
                    continue

                result = scrape_article(driver, article_url)
                if result:
                    scraped_data.append(result)
                    visited_urls.add(article_url)
                    print(f"Scraped: {result[0][:60]}...")
                    random_sleep(2, 4)
                else:
                    # Not marked as visited, so a later run will retry it.
                    print(f"Skipped article due to error: {article_url}")

            # Persist progress once per listing page.
            save_data(scraped_data)
            save_checkpoint(page, list(visited_urls))
            random_sleep(5, 8)

    except KeyboardInterrupt:
        print("Interrupted by user. Saving checkpoint...")
        save_data(scraped_data)
        save_checkpoint(page, list(visited_urls))

    except Exception as e:
        print(f"Unexpected error: {e}")
        save_data(scraped_data)
        save_checkpoint(page, list(visited_urls))

    finally:
        driver.quit()
        print("Scraping complete.")

# Entry-point guard. NOTE(review): inside a Jupyter kernel __name__ is
# "__main__", so executing this cell starts the full scrape immediately.
if __name__ == "__main__":
    main()