In [1]:
import logging
import os
import queue
import re
import threading
import tkinter as tk
from concurrent.futures import ThreadPoolExecutor, as_completed
from tkinter import filedialog, messagebox, ttk

import requests
from bs4 import BeautifulSoup
from docx import Document

# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with a timestamp and its severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class ScraperApp:
    """Tkinter front end that scrapes paginated thread pages into .docx files.

    Threading model: every Tk call (widget reads/writes, dialogs) happens on
    the thread that created the application. Background threads never touch
    widgets directly; they enqueue callbacks that the UI thread drains from
    ``_poll_ui_queue``. Pages are downloaded concurrently, but their articles
    are appended to the documents by a single thread in page order, so the
    shared ``Document`` objects are never mutated concurrently.
    """

    # Seconds to wait on the server before a page request is abandoned.
    REQUEST_TIMEOUT = 30
    # Interval (ms) at which the UI thread drains callbacks queued by workers.
    UI_POLL_MS = 100
    # Headers sent with every page request.
    REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Class-attribute pattern that marks an <article> as threadmarked.
    THREADMARK_CLASS_RE = re.compile(r".*\bhasThreadmark\b.*")
    # Option key -> name of the Tk variable attribute backing its checkbox.
    _OPTION_VARS = {
        "threadmarked_only": "threadmarked_only_var",
        "include_threadmark": "include_threadmark_var",
        "include_author": "include_author_var",
        "include_separator": "include_separator_var",
    }

    def __init__(self, root):
        """Build the UI on ``root`` and start the UI-callback polling loop."""
        self.root = root
        self.root.title("Web Scraper")
        # The thread that owns Tk; all widget access is funnelled to it.
        self._ui_thread = threading.current_thread()
        self._ui_queue = queue.Queue()
        # Checkbox values captured on the UI thread for use by worker threads.
        self._options = {}
        self.create_widgets()  # Call the method to create UI elements
        self._poll_ui_queue()

    # ------------------------------------------------------------------ UI

    def create_widgets(self):
        """Create the input fields, option checkboxes, button, progress bar and log box."""
        self.base_url_entry = self._add_entry(0, "Base URL:", width=50)
        self.page_start_entry = self._add_entry(1, "Page Range Start:")
        self.page_end_entry = self._add_entry(2, "Page Range End:")
        self.author_filter_entry = self._add_entry(3, "Author Filter (optional):")
        self.relevant_names_entry = self._add_entry(4, "Relevant Names (comma separated, optional):")
        self.word_limit_entry = self._add_entry(5, "Word Limit Per Document (optional):")

        self.threadmarked_only_var, self.threadmarked_only_check = self._add_check(
            6, "Threadmarked Articles Only")
        self.include_threadmark_var, self.include_threadmark_check = self._add_check(
            7, "Include Threadmark Labels")
        self.include_author_var, self.include_author_check = self._add_check(
            8, "Include Author Tags")
        self.include_separator_var, self.include_separator_check = self._add_check(
            9, "Include Separator Lines")

        # Button to start the scraper; kept as an attribute so it can be
        # disabled while a job is running.
        self.run_button = tk.Button(self.root, text="Run Scraper", command=self.start_scraping_thread)
        self.run_button.grid(row=10, columnspan=2, pady=10)

        # Progress bar to show the progress of the scraping process
        self.progress = ttk.Progressbar(self.root, orient="horizontal", length=400, mode="determinate")
        self.progress.grid(row=11, columnspan=2, pady=5)

        # Text box to display logs and messages to the user
        self.log_text = tk.Text(self.root, height=10, state='disabled', wrap='word')
        self.log_text.grid(row=12, columnspan=2, padx=10, pady=5)

    def _add_entry(self, row, label, **entry_options):
        """Add a label/entry pair on ``row`` and return the entry widget."""
        tk.Label(self.root, text=label).grid(row=row, column=0, padx=10, pady=5)
        entry = tk.Entry(self.root, **entry_options)
        entry.grid(row=row, column=1, padx=10, pady=5)
        return entry

    def _add_check(self, row, text):
        """Add a checkbox on ``row``; return ``(variable, widget)``."""
        var = tk.IntVar(master=self.root)
        check = tk.Checkbutton(self.root, text=text, variable=var)
        check.grid(row=row, columnspan=2, pady=5)
        return var, check

    def _call_in_ui(self, callback, *args):
        """Run ``callback(*args)`` on the UI thread.

        Called on the UI thread it runs immediately; from any other thread the
        call is queued and executed by the next ``_poll_ui_queue`` tick.
        """
        if threading.current_thread() is self._ui_thread:
            callback(*args)
        else:
            self._ui_queue.put((callback, args))

    def _poll_ui_queue(self):
        """Drain queued UI callbacks, then reschedule itself via ``root.after``."""
        while True:
            try:
                callback, args = self._ui_queue.get_nowait()
            except queue.Empty:
                break
            try:
                callback(*args)
            except Exception:
                # One failing callback must not stop the polling loop.
                logging.exception("UI callback failed")
        self.root.after(self.UI_POLL_MS, self._poll_ui_queue)

    def log_message(self, message):
        """Append ``message`` to the log text box. Safe to call from any thread."""
        self._call_in_ui(self._append_log, message)

    def _append_log(self, message):
        """Write one line into the (normally read-only) log box. UI thread only."""
        self.log_text.config(state='normal')
        self.log_text.insert(tk.END, message + '\n')
        self.log_text.see(tk.END)
        self.log_text.config(state='disabled')

    def _advance_progress(self):
        """Advance the progress bar by one page. UI thread only."""
        self.progress["value"] += 1
        self.root.update_idletasks()

    def _finish_job(self, level, text):
        """Re-enable the Run button and report the job outcome. UI thread only."""
        self.run_button.config(state='normal')
        if level == "info":
            messagebox.showinfo("Success", text)
        elif level == "warning":
            messagebox.showwarning("Completed with errors", text)
        else:
            messagebox.showerror("Error", text)

    # ------------------------------------------------------- input handling

    @staticmethod
    def _parse_relevant_names(raw):
        """Split a comma-separated string into unique, non-empty names.

        Blank items are dropped (an empty name would match every article) and
        duplicates are removed case-insensitively, keeping the first spelling.
        """
        names = []
        seen = set()
        for piece in raw.split(","):
            name = piece.strip()
            key = name.lower()
            if name and key not in seen:
                seen.add(key)
                names.append(name)
        return names

    @staticmethod
    def _safe_filename(name):
        """Return ``name`` with characters that are unsafe in file names replaced."""
        cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name).strip(" .")
        return cleaned or "unnamed"

    def _snapshot_options(self):
        """Copy the checkbox states into ``self._options``. UI thread only."""
        self._options = {
            key: bool(getattr(self, attr).get())
            for key, attr in self._OPTION_VARS.items()
        }

    def _option(self, key):
        """Return a checkbox option, preferring the UI-thread snapshot.

        Falls back to reading the Tk variable when no snapshot has been taken
        (i.e. a scraping method was called directly rather than via a job).
        """
        if key in self._options:
            return self._options[key]
        return bool(getattr(self, self._OPTION_VARS[key]).get())

    def _collect_settings(self):
        """Read and validate all user input. UI thread only.

        Returns:
            A dict with keys ``base_url``, ``page_start``, ``page_end``,
            ``output_dir``, ``relevant_names``, ``author_filter`` and
            ``word_limit``; or ``None`` when the input is invalid or the
            directory dialog was cancelled (an error box has then been shown
            where appropriate).
        """
        # Get the base URL and remove any trailing slashes
        base_url = self.base_url_entry.get().strip().rstrip('/')
        if not base_url.lower().startswith(("http://", "https://")):
            messagebox.showerror("Invalid input", "Base URL must be a valid URL starting with http or https.")
            return None

        try:
            page_start = int(self.page_start_entry.get())
            page_end = int(self.page_end_entry.get())
        except ValueError:
            messagebox.showerror("Invalid input", "Page range must be integers.")
            return None
        if page_end < page_start:
            messagebox.showerror("Invalid input", "Page range end must not be smaller than page range start.")
            return None

        # Validate the word limit before opening the directory dialog so the
        # user is not asked for a folder only to be rejected afterwards.
        raw_limit = self.word_limit_entry.get().strip()
        try:
            word_limit = int(raw_limit) if raw_limit else None
        except ValueError:
            messagebox.showerror("Invalid input", "Word limit must be an integer.")
            return None
        if word_limit is not None and word_limit < 0:
            messagebox.showerror("Invalid input", "Word limit must not be negative.")
            return None

        # Ask the user to choose a directory to save the output files
        output_dir = filedialog.askdirectory()
        if not output_dir:
            return None

        return {
            "base_url": base_url,
            "page_start": page_start,
            "page_end": page_end,
            "output_dir": output_dir,
            "relevant_names": self._parse_relevant_names(self.relevant_names_entry.get()),
            "author_filter": self.author_filter_entry.get().strip() or None,
            "word_limit": word_limit or None,  # 0 means "no limit"
        }

    def _prepare_job(self):
        """Collect settings and put the UI into its "running" state. UI thread only."""
        settings = self._collect_settings()
        if settings is None:
            return None
        self._snapshot_options()
        total_pages = settings["page_end"] - settings["page_start"] + 1
        self.progress.configure(maximum=total_pages, value=0)
        # Prevent a second click from starting an overlapping job.
        self.run_button.config(state='disabled')
        return settings

    # --------------------------------------------------------------- jobs

    def start_scraping_thread(self):
        """Button handler: validate input here, then scrape on a background thread.

        All widget reads and dialogs happen in this method (on the UI thread);
        only network, parsing and file work is handed to the worker thread.
        """
        settings = self._prepare_job()
        if settings is None:
            return
        thread = threading.Thread(target=self._run_job, args=(settings,))
        thread.start()

    def run_scraper(self):
        """Validate input and run the whole scraping job synchronously.

        Must be called on the UI thread because it reads widgets and may open
        dialogs; it blocks until the job is finished. Use
        ``start_scraping_thread`` to keep the window responsive.
        """
        settings = self._prepare_job()
        if settings is not None:
            self._run_job(settings)

    def _run_job(self, settings):
        """Scrape all pages, save the documents and report the outcome."""
        try:
            names = settings["relevant_names"]
            if names:
                documents = {name: [Document()] for name in names}
            else:
                documents = {"default": [Document()]}

            # Ensure the output directory exists
            os.makedirs(settings["output_dir"], exist_ok=True)

            failed_pages = self._scrape_pages(settings, documents)
            self._save_documents(documents, settings["output_dir"], names)
        except Exception as exc:
            # Without this the worker thread would die silently and the user
            # would never learn that nothing (or only part) was saved.
            logging.exception("Scraping job failed")
            self._call_in_ui(self._finish_job, "error", f"Scraping failed: {exc}")
            return

        if failed_pages:
            page_list = ", ".join(str(page) for page in failed_pages)
            self._call_in_ui(
                self._finish_job,
                "warning",
                f"Scraping finished, but {len(failed_pages)} page(s) failed: {page_list}. "
                "See the log for details.",
            )
        else:
            self._call_in_ui(self._finish_job, "info", "Scraping completed successfully!")

    def _scrape_pages(self, settings, documents):
        """Download pages concurrently and add their articles in page order.

        Returns:
            The list of page numbers that could not be fetched or processed.
        """
        base_url = settings["base_url"]
        names_lower = [name.lower() for name in settings["relevant_names"]]
        pages = range(settings["page_start"], settings["page_end"] + 1)
        failed_pages = []

        with ThreadPoolExecutor() as executor:
            # ``map`` yields results in submission order, so articles end up in
            # the documents in page order even though downloads overlap.
            results = executor.map(lambda num: self._fetch_page(base_url, num), pages)
            for page_num, articles, error in results:
                if error is not None:
                    self.log_message(f"Failed to fetch page {page_num}: {error}")
                else:
                    try:
                        self._add_articles(
                            articles, settings["author_filter"], names_lower,
                            documents, settings["word_limit"],
                        )
                    except Exception as exc:
                        error = exc
                        logging.exception("Error processing page %s", page_num)
                        self.log_message(f"Error processing page {page_num}: {exc}")
                if error is not None:
                    failed_pages.append(page_num)
                self._call_in_ui(self._advance_progress)

        return failed_pages

    def _save_documents(self, documents, output_dir, relevant_names):
        """Write every document (including word-limit rollovers) to ``output_dir``."""
        if relevant_names:
            used_stems = set()
            for name, doc_list in documents.items():
                base_stem = self._safe_filename(name)
                # Two different names can sanitise to the same stem; keep the
                # files apart instead of overwriting one with the other.
                stem, suffix = base_stem, 2
                while stem.lower() in used_stems:
                    stem = f"{base_stem}-{suffix}"
                    suffix += 1
                used_stems.add(stem.lower())
                for index, doc in enumerate(doc_list, start=1):
                    self._save_document(doc, os.path.join(output_dir, f"{stem}_{index}.docx"))
        else:
            # The first part keeps the historical file name; further parts
            # created by the word limit get a numeric suffix.
            for index, doc in enumerate(documents["default"], start=1):
                filename = "scraped_content.docx" if index == 1 else f"scraped_content_{index}.docx"
                self._save_document(doc, os.path.join(output_dir, filename))

    def _save_document(self, doc, save_path):
        """Save one document and log where it went."""
        doc.save(save_path)
        self.log_message(f"Saved document as '{save_path}'.")

    # ----------------------------------------------------------- scraping

    def _fetch_articles(self, url):
        """Download ``url`` and return its ``<article>`` elements.

        Raises:
            requests.RequestException: on network errors, timeouts or HTTP
                error statuses.
        """
        response = requests.get(url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')

        # Filter for threadmarked articles if the option is selected
        if self._option("threadmarked_only"):
            return soup.find_all("article", attrs={"class": self.THREADMARK_CLASS_RE})
        return soup.find_all("article")

    def _fetch_page(self, base_url, page_num):
        """Pool task: fetch one page; never raises.

        Returns:
            ``(page_num, articles, error)`` where ``error`` is ``None`` on
            success and the caught exception otherwise.
        """
        url = f"{base_url}/page-{page_num}"  # Construct the URL for the current page
        self.log_message(f"Fetching page {page_num}...")
        try:
            return page_num, self._fetch_articles(url), None
        except Exception as exc:
            return page_num, [], exc

    @staticmethod
    def _author_matches(article, article_text, needle):
        """Return True when the lower-cased ``needle`` matches the article's author.

        The ``data-author`` attribute is used when the element has one, so a
        post that merely mentions or quotes the name is not selected; elements
        without the attribute fall back to a search of the lower-cased text.
        """
        data_author = article.get("data-author")
        if data_author:
            return needle in data_author.lower()
        return needle in article_text

    def _add_articles(self, articles, author_filter, relevant_names_lower, documents, word_limit):
        """Filter ``articles`` and append the selected ones to ``documents``.

        Not thread-safe with respect to ``documents``; call from one thread.
        """
        author_needle = author_filter.lower() if author_filter else None
        # Derive the search terms from the document keys themselves so a name
        # can never be paired with another name's document.
        name_pairs = [(name, name.lower()) for name in documents] if relevant_names_lower else []
        seen_wrappers = set()

        for article in articles:
            wrapper = article.find("div", class_="bbWrapper")
            if wrapper is None:
                continue
            # If the markup nests <article> elements, the outer and inner one
            # resolve to the same content div; add that content only once.
            if id(wrapper) in seen_wrappers:
                continue
            seen_wrappers.add(id(wrapper))

            article_text = article.get_text().lower()

            # Check author filter
            if author_needle and not self._author_matches(article, article_text, author_needle):
                continue

            if name_pairs:
                # The first listed name found in the text claims the article.
                for name, needle in name_pairs:
                    if needle in article_text:
                        self.process_article(article, name, documents, word_limit)
                        break
            else:
                self.process_article(article, "default", documents, word_limit)

    def scrape_page(self, base_url, page_num, author_filter, relevant_names_lower, documents, word_limit):
        """Fetch one page and immediately add its matching articles to ``documents``.

        Kept for callers that scrape a single page. ``documents`` is mutated
        without locking, so do not run this concurrently against the same
        documents. Request failures are logged, not raised.

        Args:
            base_url: Thread URL without a trailing slash.
            page_num: Page number appended as ``/page-<n>``.
            author_filter: Optional author name to require, or ``None``.
            relevant_names_lower: Non-empty when articles should be routed by
                name; the names themselves are taken from ``documents`` keys.
            documents: Mapping of name (or ``"default"``) to a list of documents.
            word_limit: Optional word limit per document, or ``None``.
        """
        url = f"{base_url}/page-{page_num}"
        self.log_message(f"Fetching page {page_num}...")
        try:
            articles = self._fetch_articles(url)
        except requests.RequestException as e:
            self.log_message(f"Failed to fetch {url}: {e}")
            return
        self._add_articles(articles, author_filter, relevant_names_lower, documents, word_limit)

    def process_article(self, article, name, documents, word_limit):
        """Append one article's content to the current document for ``name``.

        Articles without a ``div.bbWrapper`` are ignored. When ``word_limit``
        is set and adding the article would exceed it, a new document is
        started first -- unless the current document is still empty, in which
        case the over-long article goes into it rather than leaving a blank
        file behind.
        """
        bb_wrapper = article.find("div", class_="bbWrapper")
        if bb_wrapper is None:
            return

        bb_wrapper_text = bb_wrapper.get_text()
        current_doc = documents[name][-1]

        if word_limit:
            current_word_count = sum(len(p.text.split()) for p in current_doc.paragraphs)
            exceeds = current_word_count + len(bb_wrapper_text.split()) > word_limit
            if current_word_count and exceeds:
                current_doc = Document()
                documents[name].append(current_doc)

        # Include author tag if selected
        if self._option("include_author"):
            data_author = article.get("data-author", "Unknown Author")
            current_doc.add_paragraph(f"Author: {data_author}")
            self.log_message(f"Added author: {data_author}")

        # Include threadmark label if selected
        if self._option("include_threadmark"):
            threadmark_label = article.find("span", class_="threadmarkLabel")
            if threadmark_label is not None:
                threadmark_text = threadmark_label.get_text()
                current_doc.add_paragraph(f"Threadmark: {threadmark_text}")
                self.log_message(f"Added threadmark label: {threadmark_text}")

        # Add the article's main content
        current_doc.add_paragraph(bb_wrapper_text)

        # Include separator line if selected
        if self._option("include_separator"):
            current_doc.add_paragraph("--------------------")
            self.log_message("Added separator line.")

if __name__ == "__main__":
    # Create the scraper UI on a fresh Tk root window, then hand control to
    # the Tk event loop; this call blocks until the window is closed.
    app = ScraperApp(tk.Tk())
    root = app.root
    root.mainloop()


In [6]:
pip install aiohttp


Collecting aiohttp
  Downloading aiohttp-3.10.2-cp39-cp39-win_amd64.whl (378 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.4.1-cp39-cp39-win_amd64.whl (50 kB)
Collecting async-timeout<5.0,>=4.0
  Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.9.4-cp39-cp39-win_amd64.whl (76 kB)
Collecting aiohappyeyeballs>=2.3.0
  Downloading aiohappyeyeballs-2.3.5-py3-none-any.whl (12 kB)
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.5-cp39-cp39-win_amd64.whl (28 kB)
Installing collected packages: multidict, frozenlist, yarl, async-timeout, aiosignal, aiohappyeyeballs, aiohttp
Successfully installed aiohappyeyeballs-2.3.5 aiohttp-3.10.2 aiosignal-1.3.1 async-timeout-4.0.3 frozenlist-1.4.1 multidict-6.0.5 yarl-1.9.4

