# In [None]:

import requests
from bs4 import BeautifulSoup
from docx import Document
import time
from tqdm.notebook import tqdm  # For Jupyter-friendly progress bar
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# Site root of the novel host (placeholder -- replace before running), e.g.
# "https://example.com". Give it without a trailing slash: the code below
# appends "/" itself when building page URLs.
BASE_URL = "https://"

# Name of the novel's page without the ".html" suffix (placeholder -- replace
# before running), e.g. "novel-title" for "https://example.com/novel-title.html".
NOVEL_SLUG = "__"

# Address of the novel's index page, built from the two values above.
NOVEL_URL = "/".join((BASE_URL, NOVEL_SLUG)) + ".html"

# Browser-like User-Agent header sent with the plain HTTP requests.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/114.0.0.0 Safari/537.36'
    ),
}

# Retry policy: up to 5 attempts for GET requests answered with one of the
# listed 5xx statuses, using a backoff factor of 1. With raise_on_status=False
# the last response is handed back instead of raising once retries run out.
retries = Retry(
    total=5,
    backoff_factor=1,
    allowed_methods=["GET"],
    status_forcelist=[500, 502, 503, 504],
    raise_on_status=False,
)

# One adapter carrying that policy, shared by both URL schemes of the session.
adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()
session.mount("http://", adapter)
session.mount("https://", adapter)

# Function to get chapter content
def get_chapter_content(url):
    """Load a chapter page in headless Chrome and return its paragraph text.

    A fresh Chrome driver is started for every call and is always quit before
    the function returns. The chapter body is looked up as ``div.chapter-c``
    first; if that element does not appear within the 15-second wait, the
    lookup falls back to ``div.chapter-content``.

    Args:
        url: Address of the chapter page to open.

    Returns:
        The non-empty ``<p>`` texts found inside the content div, joined with
        newlines; ``"[Content not found]"`` when there are no non-empty
        paragraphs; or ``"[Error fetching content: ...]"`` when loading the
        page or locating the content raised an exception.

    Note:
        Failures while installing or starting the Chrome driver happen before
        the ``try`` block and therefore propagate to the caller.
    """
    # Imported here because the file-level imports do not provide this name.
    from selenium.common.exceptions import TimeoutException

    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)

        # Each lookup below waits up to 15 seconds for its element.
        wait = WebDriverWait(driver, 15)

        try:
            content_div = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.chapter-c"))
            )
        except TimeoutException:
            # Only a timeout means "this layout is not used here"; anything
            # else (including Ctrl+C) must not be swallowed by the fallback.
            content_div = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.chapter-content"))
            )

        # Read each paragraph's text once instead of twice per element.
        lines = []
        for paragraph in content_div.find_elements(By.TAG_NAME, "p"):
            stripped = paragraph.text.strip()
            if stripped:
                lines.append(stripped)

        text = "\n".join(lines)
        return text if text else "[Content not found]"
    except Exception as e:
        # Deliberate best-effort: one bad chapter must not abort the whole run.
        return f"[Error fetching content: {e}]"
    finally:
        driver.quit()

#  Function to get all chapter links (with max page limit and repeat detection)
def get_chapter_links(max_pages=100, timeout=30):
    """Collect the chapter links listed on the novel's paginated index.

    Pages ``?page=1``, ``?page=2``, ... of the novel's ``.html`` page are
    fetched through the shared retrying ``session`` and every
    ``ul.list-chapter li a`` anchor is recorded once, in the order found.

    Fetching stops when a page lists no chapters, when three pages in a row
    add nothing new, or after ``max_pages`` pages.

    Args:
        max_pages: Upper bound on the number of index pages to request.
        timeout: Seconds passed to ``session.get`` as the request timeout, so
            that a stalled connection cannot hang the scraper indefinitely.

    Returns:
        A list of ``(title, url)`` tuples, one per distinct chapter URL.

    Raises:
        requests.exceptions.RequestException: If a page request fails
            (connection error, timeout, ...).
    """
    from urllib.parse import urljoin

    chapter_links = []
    seen_urls = set()
    repeat_pages = 0

    for page in range(1, max_pages + 1):
        url = f"{BASE_URL}/{NOVEL_SLUG}.html?page={page}"
        print(f"➡️ Fetching page {page}...")
        response = session.get(url, headers=HEADERS, timeout=timeout)
        if not response.ok:
            # The session does not raise on bad statuses, so make a failed
            # page visible instead of silently parsing an error document.
            print(f"⚠️ Page {page} returned HTTP {response.status_code}.")

        soup = BeautifulSoup(response.content, 'html.parser')
        chapters = soup.select('ul.list-chapter li a')

        if not chapters:
            print("✅ No chapters found on this page. Ending fetch.")
            break

        new_links = 0
        for chapter in chapters:
            href = chapter.get('href')
            if not href:
                # Anchor without a target: nothing to scrape.
                continue
            # urljoin handles root-relative, page-relative and absolute hrefs;
            # plain concatenation only worked for root-relative ones.
            link = urljoin(url, href)
            if link not in seen_urls:
                seen_urls.add(link)
                chapter_links.append((chapter.text.strip(), link))
                new_links += 1

        print(f"📄 Found {new_links} new chapters on page {page}")

        # Count consecutive pages that contributed nothing new; some sites
        # keep serving the last page for any page number past the end.
        repeat_pages = repeat_pages + 1 if new_links == 0 else 0

        if repeat_pages >= 3:
            print(" Detected 3 repeated pages in a row. Stopping to avoid infinite loop.")
            break

    return chapter_links

# Save to DOCX file with progress bar
def save_to_docx(
    chapters,
    start,
    end,
    title="A Stay-at-home Dad’s Restaurant In An Alternate World",
    delay=1,
):
    """Scrape chapters ``start``..``end`` and write them to one DOCX file.

    Each chapter is fetched with ``get_chapter_content`` and added to the
    document as a level-1 heading followed by a paragraph holding its text.
    The result is saved as ``Chapters_{start}_to_{end}.docx`` in the current
    working directory.

    Args:
        chapters: Sequence of ``(title, url)`` tuples.
        start: 1-based number of the first chapter to include.
        end: 1-based number of the last chapter to include (inclusive).
        title: Text of the document's top-level heading.
        delay: Seconds to sleep after each chapter, to go easy on the server.

    Raises:
        IndexError: If ``start`` is below 1 or ``end`` exceeds the number of
            chapters. Checked before any scraping so that no work is wasted
            and negative indices cannot silently select the wrong chapters.
    """
    if start < 1 or end > len(chapters):
        raise IndexError(
            f"chapter range {start}-{end} is outside 1-{len(chapters)}"
        )

    doc = Document()
    doc.add_heading(title, 0)

    selected = chapters[start - 1:end]
    for chapter_title, url in tqdm(selected, desc="Scraping chapters", unit="chapter"):
        print(f"📘 Fetching: {chapter_title}")
        content = get_chapter_content(url)

        doc.add_heading(f"{chapter_title}", level=1)
        doc.add_paragraph(content)
        time.sleep(delay)  # Be respectful

    filename = f"Chapters_{start}_to_{end}.docx"
    doc.save(filename)
    print(f"\n✅ Done! Saved to {filename}")

#  Run scraper with input range
# Discover the chapters, ask the user for a 1-based inclusive range, and
# export that range to a DOCX file.
print("Fetching chapter links...")
chapters = get_chapter_links(max_pages=100)
total_chapters = len(chapters)
print(f"📚 Total chapters found: {total_chapters}")

if total_chapters == 0:
    # Without chapters no range can be valid, so do not prompt for one.
    print("❌ Error: no chapters found; check BASE_URL and NOVEL_SLUG.")
else:
    try:
        start = int(input(f"Enter starting chapter number (1 to {total_chapters}): "))
        end = int(input(f"Enter ending chapter number ({start} to {total_chapters}): "))

        if start < 1 or end > total_chapters or start > end:
            raise ValueError("Invalid range.")

        save_to_docx(chapters, start, end)
    except ValueError as e:
        # Covers both non-numeric input (raised by int()) and a bad range.
        print("❌ Error:", e)
