In [1]:
from docx import Document
import os

def split_docx_by_word_count(file_path, output_dir, chunk_size=25000):
    # Load the document
    doc = Document(file_path)
    full_text = []
    chunk_text = []
    chunk_counter = 1
    current_word_count = 0

    # Collect all paragraphs in the document
    for para in doc.paragraphs:
        full_text.append(para.text)
    
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Function to save a chunk to a new .docx file
    def save_chunk(chunk, counter):
        new_doc = Document()
        for para in chunk:
            new_doc.add_paragraph(para)
        new_doc.save(os.path.join(output_dir, f"Konoha_{counter}.docx"))
    
    # Iterate over the paragraphs to create chunks
    for para in full_text:
        words = para.split()
        chunk_text.append(para)
        current_word_count += len(words)
        
        # Check if the current paragraph is a title and the word count exceeds the chunk size
        if current_word_count > chunk_size: #and para.startswith("Xiao Yingzi"):
            save_chunk(chunk_text, chunk_counter)
            chunk_text = []
            chunk_counter += 1
            current_word_count = 0

    # Save any remaining text as the last chunk
    if chunk_text:
        save_chunk(chunk_text, chunk_counter)
        
split_docx_by_word_count(
    r"C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Konoha-Sect.docx", 
    r"C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Konoha-Sect"
)


In [1]:
from docx import Document
import os

def delete_content_before_phrase(file_path, phrase, save_path):
    # Load the document
    doc = Document(file_path)

    # Find the index of the paragraph containing the phrase
    start_index = None
    for i, paragraph in enumerate(doc.paragraphs):
        if phrase in paragraph.text:
            start_index = i
            break

    if start_index is None:
        print(f'Phrase "{phrase}" not found in the document.')
        return

    # Delete all content before the paragraph containing the phrase
    for i in range(start_index):
        doc.paragraphs[0].clear()  # Remove content from each paragraph
        p = doc.paragraphs[0]._element
        p.getparent().remove(p)  # Remove the paragraph element

    # Ensure the save path exists
    os.makedirs(save_path, exist_ok=True)
    
    # Save the modified document to the new path
    new_file_path = os.path.join(save_path, 'Yingzi Master Doc.docx')
    doc.save(new_file_path)
    print(f'Content before "{phrase}" has been deleted. Document saved as "{new_file_path}".')

# Example usage
file_path = r'C:\Users\Lenovo\Downloads\Mad_With_Power2.docx'
phrase = 'Xiao Yingzi - Good Seed Background'
save_path = r'C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Yingzi'

delete_content_before_phrase(file_path, phrase, save_path)


Content before "Xiao Yingzi - Good Seed Background" has been deleted. Document saved as "C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Yingzi\Yingzi Master Doc.docx".


In [2]:
import requests
from bs4 import BeautifulSoup
from docx import Document
import re
import os

# List of relevant names for threadmark labels
relevant_names = [
    "Minato"
]

# Convert relevant names to lowercase for case-insensitive comparison
relevant_names_lower = [name.lower() for name in relevant_names]

# Create a dictionary to hold lists of Document objects for each relevant name
documents = {name: [Document()] for name in relevant_names}
print("Initialized Word documents for each relevant name.")

# Define the base URL for the forum thread
base_url = 'https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-'

# Define the directory to save the documents
save_directory = r"C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Konoha-Sect"

# Ensure the directory exists
os.makedirs(save_directory, exist_ok=True)

# Define the word limit per document
word_limit = 25000

# Function to get the total word count of a document
def get_word_count(doc):
    return sum(len(p.text.split()) for p in doc.paragraphs)

# Loop through the page numbers from 1 to the specified upper limit
for page_num in range(1, 202):  # Adjust the upper limit as needed
    print(f"Processing page {page_num}...")

    # URL of the page to scrape
    url = f'{base_url}{page_num}'
    print(f"URL: {url}")

    # Send a GET request to the URL
    response = requests.get(url)
    print(f"Sent GET request to {url} (Status code: {response.status_code})")

    # Parse the HTML content of the response using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    print("Parsed HTML content.")

    # Find all the articles on the page
    articles = soup.find_all("article")
    print(f"Found {len(articles)} articles.")

    # Loop through each article and check the content
    for idx, article in enumerate(articles):
        print(f"Processing article {idx + 1}/{len(articles)} on page {page_num}.")

        # Check if the article contains any relevant name (case-insensitive)
        article_text = article.get_text().lower()
        for name, name_lower in zip(relevant_names, relevant_names_lower):
            if name_lower in article_text:
                print(f"Article contains relevant name: {name}")

                # Find the bbWrapper div in the article
                bb_wrapper = article.find("div", class_="bbWrapper")
                if bb_wrapper:
                    print("Found bbWrapper div.")
                    
                    # Add the text in the bbWrapper div to the corresponding document
                    bb_wrapper_text = bb_wrapper.get_text()
                    
                    # Get the current document and word count
                    current_doc = documents[name][-1]
                    current_word_count = get_word_count(current_doc)
                    
                    # Check if adding this article exceeds the word limit
                    article_word_count = len(bb_wrapper_text.split()) + 1  # Adding 1 for the separator
                    
                    if current_word_count + article_word_count > word_limit:
                        # Create a new document and add it to the list
                        new_doc = Document()
                        documents[name].append(new_doc)
                        current_doc = new_doc
                        print(f"Created new document for {name} due to word limit.")

                    # Add content to the current document
                    current_doc.add_paragraph(bb_wrapper_text)
                    current_doc.add_paragraph("--------------------")
                    print(f"Added content to document for {name}.")
    
    print(f"Finished processing page {page_num}.\n")

# Save each document as a separate Word file in the specified directory
for name, doc_list in documents.items():
    for i, doc in enumerate(doc_list):
        save_path = os.path.join(save_directory, f"{name}_{i+1}.docx")
        doc.save(save_path)
        print(f"Saved document as '{save_path}'.")


Initialized Word documents for each relevant name.
Processing page 1...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-1
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-1 (Status code: 200)
Parsed HTML content.
Found 21 articles.
Processing article 1/21 on page 1.
Processing article 2/21 on page 1.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 3/21 on page 1.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 4/21 on page 1.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 5/21 on page 1.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 6/21 on page 1.
Article contains relevant name: Minato
F

Added content to document for Minato.
Processing article 9/20 on page 4.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 10/20 on page 4.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 11/20 on page 4.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 12/20 on page 4.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 13/20 on page 4.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 14/20 on page 4.
Ar

Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 5/20 on page 7.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 6/20 on page 7.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 7/20 on page 7.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 8/20 on page 7.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 9/20 on page 7.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 10/20 on page 7.
Article contains relevant name: Minato
Found bbWrapper di

Added content to document for Minato.
Processing article 12/20 on page 9.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 13/20 on page 9.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 14/20 on page 9.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 15/20 on page 9.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 16/20 on page 9.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 17/20 on page 9.
Article contains relevant name: Minato
Found bbWrapp

Added content to document for Minato.
Finished processing page 12.

Processing page 13...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-13
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-13 (Status code: 200)
Parsed HTML content.
Found 14 articles.
Processing article 1/14 on page 13.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 2/14 on page 13.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 3/14 on page 13.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 4/14 on page 13.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 5/14 on page 13.

Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 14/14 on page 16.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Finished processing page 16.

Processing page 17...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-17
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-17 (Status code: 200)
Parsed HTML content.
Found 14 articles.
Processing article 1/14 on page 17.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 2/14 on page 17.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 3/14 on page 17.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to docu

Added content to document for Minato.
Processing article 12/14 on page 20.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 13/14 on page 20.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Processing article 14/14 on page 20.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Finished processing page 20.

Processing page 21...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-21
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-21 (Status code: 200)
Parsed HTML content.
Found 14 articles.
Processing article 1/14 on page 21.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to do

Added content to document for Minato.
Processing article 14/14 on page 24.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Finished processing page 24.

Processing page 25...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-25
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-25 (Status code: 200)
Parsed HTML content.
Found 14 articles.
Processing article 1/14 on page 25.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 2/14 on page 25.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 3/14 on page 25.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 4/14 on page 25

Added content to document for Minato.
Processing article 14/14 on page 28.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Finished processing page 28.

Processing page 29...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-29
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-29 (Status code: 200)
Parsed HTML content.
Found 14 articles.
Processing article 1/14 on page 29.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 2/14 on page 29.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 3/14 on page 29.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 4/14 on page 29

Added content to document for Minato.
Processing article 14/14 on page 32.
Article contains relevant name: Minato
Found bbWrapper div.
Created new document for Minato due to word limit.
Added content to document for Minato.
Finished processing page 32.

Processing page 33...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-33
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-33 (Status code: 200)
Parsed HTML content.
Found 14 articles.
Processing article 1/14 on page 33.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 2/14 on page 33.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 3/14 on page 33.
Article contains relevant name: Minato
Found bbWrapper div.
Added content to document for Minato.
Processing article 4/14 on page 33

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [3]:
from docx import Document
import os
import shutil

def split_docx_by_word_count(file_path, output_base_dir, chunk_size=25000):
    # Load the document
    doc = Document(file_path)
    full_text = []
    chunk_text = []
    chunk_counter = 1
    current_word_count = 0

    # Collect all paragraphs in the document
    for para in doc.paragraphs:
        full_text.append(para.text)
    
    # Create the output directory
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    output_dir = os.path.join(output_base_dir, file_name)
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    # Copy the original document to the new directory
    shutil.copy(file_path, os.path.join(output_dir, os.path.basename(file_path)))

    # Function to save a chunk to a new .docx file
    def save_chunk(chunk, counter):
        new_doc = Document()
        for para in chunk:
            new_doc.add_paragraph(para)
        new_doc.save(os.path.join(output_dir, f"{file_name}_{counter}.docx"))
    
    # Iterate over the paragraphs to create chunks
    for para in full_text:
        words = para.split()
        chunk_text.append(para)
        current_word_count += len(words)
        
        # Check if the current paragraph is a title and the word count exceeds the chunk size
        if current_word_count > chunk_size and para.startswith(file_name):
            save_chunk(chunk_text, chunk_counter)
            chunk_text = []
            chunk_counter += 1
            current_word_count = 0

    # Save any remaining text as the last chunk
    if chunk_text:
        save_chunk(chunk_text, chunk_counter)

# List of document names
doc_names = [
    'Sun Ji.docx',
    'Konstantinos Papadopoulos.docx',
    'Mildgyð Galene.docx',
    'David Pupillus.docx',
    'Simon Euaerizo.docx',
    'Aliki.docx',
    'Achille.docx'
]

# Directory where the script is stored
script_dir = os.getcwd()

# Directory where the split documents will be stored
output_base_dir = os.path.join(script_dir, 'Output')

# Ensure the output base directory exists
if not os.path.exists(output_base_dir):
    os.makedirs(output_base_dir)

# Loop through the documents and run the split function
for doc_name in doc_names:
    file_path = os.path.join(script_dir, doc_name)
    split_docx_by_word_count(file_path, output_base_dir)
    print(f"Processed and saved document: {doc_name}")


Processed and saved document: Sun Ji.docx
Processed and saved document: Konstantinos Papadopoulos.docx
Processed and saved document: Mildgyð Galene.docx
Processed and saved document: David Pupillus.docx
Processed and saved document: Simon Euaerizo.docx
Processed and saved document: Aliki.docx
Processed and saved document: Achille.docx


In [1]:
import requests
from bs4 import BeautifulSoup
from docx import Document
import re
import os

def scrape_and_save_threadmarked_articles(output_file_path):
    # Create a new Word document
    document = Document()
    print("Initialized new Word document.")

    # Loop through the page numbers from 1 to 6
    for page_num in range(1, 7):
        print(f"Processing page {page_num}...")

        # URL of the page to scrape
        url = f'https://forums.sufficientvelocity.com/threads/magic-research-and-development-quest.71592/page-{page_num}'
        print(f"URL: {url}")

        # Send a GET request to the URL
        response = requests.get(url)
        print(f"Sent GET request to {url} (Status code: {response.status_code})")

        # Parse the HTML content of the response using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        print("Parsed HTML content.")

        # Find all the articles on the page with threadmarks
        articles = soup.find_all("article", attrs={"class": re.compile(r".*\bhasThreadmark\b.*")})
        print(f"Found {len(articles)} threadmarked articles.")

        # Loop through each article and extract the text
        for idx, article in enumerate(articles):
            print(f"Processing article {idx + 1}/{len(articles)} on page {page_num}.")

            # Get the data-author attribute
            data_author = article.get("data-author", "Unknown Author")
            document.add_paragraph(f"Author: {data_author}")
            print(f"Noted data author: {data_author}")

            # Find the bbWrapper div in the article
            bb_wrapper = article.find("div", class_="bbWrapper")
            if bb_wrapper:
                print("Found bbWrapper div.")
                
                # Find the text in the span with class "threadmarkLabel"
                span = article.find("span", class_="threadmarkLabel")
                if span:
                    span_text = span.get_text()
                    document.add_paragraph(f"Threadmark: {span_text}")
                    print(f"Added threadmark label text: {span_text}")

                # Add the text in the bbWrapper div to the document
                bb_wrapper_text = bb_wrapper.get_text()
                document.add_paragraph(bb_wrapper_text)
                print("Added main content text from bbWrapper div.")

                document.add_paragraph("--------------------")
                print("Added separator line.")
        
        print(f"Finished processing page {page_num}.\n")

    # Save the document as a Word file
    document.save(output_file_path)
    print(f"Saved document as '{output_file_path}'.")

# Example usage
output_file_path = r"C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Magic_Research_Dev.docx"
scrape_and_save_threadmarked_articles(output_file_path)


Initialized new Word document.
Processing page 1...
URL: https://forums.sufficientvelocity.com/threads/magic-research-and-development-quest.71592/page-1
Sent GET request to https://forums.sufficientvelocity.com/threads/magic-research-and-development-quest.71592/page-1 (Status code: 200)
Parsed HTML content.
Found 6 threadmarked articles.
Processing article 1/6 on page 1.
Noted data author: Quest
Found bbWrapper div.
Added threadmark label text: Character Creation:
Added main content text from bbWrapper div.
Added separator line.
Processing article 2/6 on page 1.
Noted data author: Quest
Found bbWrapper div.
Added threadmark label text: Character Sheet
Added main content text from bbWrapper div.
Added separator line.
Processing article 3/6 on page 1.
Noted data author: Quest
Found bbWrapper div.
Added threadmark label text: Mechanics
Added main content text from bbWrapper div.
Added separator line.
Processing article 4/6 on page 1.
Noted data author: Quest
Found bbWrapper div.
Added thr

In [1]:
import requests
from bs4 import BeautifulSoup
from docx import Document
import os
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_and_save_all_articles(base_url, page_range, output_file_path, author_filter=None):
    # Create a new Word document
    document = Document()
    logging.info("Initialized new Word document.")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Loop through the specified range of pages
    for page_num in page_range:
        logging.info(f"Processing page {page_num}...")

        # Construct the URL of the page to scrape
        url = f"{base_url}/page-{page_num}"
        logging.info(f"URL: {url}")

        try:
            # Send a GET request to the URL
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Check if the request was successful
            logging.info(f"Sent GET request to {url} (Status code: {response.status_code})")

            # Parse the HTML content of the response using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            logging.info("Parsed HTML content.")

            # Find all the articles on the page
            articles = soup.find_all("article")
            logging.info(f"Found {len(articles)} articles.")

            # Loop through each article and extract the text
            for idx, article in enumerate(articles):
                logging.info(f"Processing article {idx + 1}/{len(articles)} on page {page_num}.")

                # Get the data-author attribute
                data_author = article.get("data-author", "Unknown Author")
                if author_filter and data_author != author_filter:
                    logging.info(f"Skipping article by {data_author}")
                    continue

                document.add_paragraph(f"Author: {data_author}")
                logging.info(f"Noted data author: {data_author}")

                # Find the bbWrapper div in the article
                bb_wrapper = article.find("div", class_="bbWrapper")
                if bb_wrapper:
                    logging.info("Found bbWrapper div.")

                    # Add the text in the bbWrapper div to the document
                    bb_wrapper_text = bb_wrapper.get_text()
                    document.add_paragraph(bb_wrapper_text)
                    logging.info("Added main content text from bbWrapper div.")

                    document.add_paragraph("--------------------")
                    logging.info("Added separator line.")
                
            logging.info(f"Finished processing page {page_num}.\n")

        except requests.RequestException as e:
            logging.error(f"Failed to fetch {url}: {e}")
        except Exception as e:
            logging.error(f"An error occurred while processing the page: {e}")

    # Save the document as a Word file
    document.save(output_file_path)
    logging.info(f"Saved document as '{output_file_path}'.")

# Example usage
base_url = 'https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader'
page_range = range(1, 11)
output_file_path = r"C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Konoha-Sect.docx"
author_filter = None  # Set to a specific author name to filter by author, or None to include all authors

scrape_and_save_all_articles(base_url, page_range, output_file_path, author_filter)


Initialized new Word document.
Processing page 1...
URL: https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-1
Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-1 (Status code: 200)
Parsed HTML content.
Found 21 articles.
Processing article 1/21 on page 1.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 2/21 on page 1.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 3/21 on page 1.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 4/21 on page 1.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 5/21 on page 1.
Noted dat

Added main content text from bbWrapper div.
Added separator line.
Processing article 13/20 on page 3.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 14/20 on page 3.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 15/20 on page 3.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 16/20 on page 3.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 17/20 on page 3.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 18/20 on page 3.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
P

Sent GET request to https://forums.sufficientvelocity.com/threads/konoha-sect-of-chakra-cultivation.103917/reader/page-6 (Status code: 200)
Parsed HTML content.
Found 20 articles.
Processing article 1/20 on page 6.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 2/20 on page 6.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 3/20 on page 6.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 4/20 on page 6.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 5/20 on page 6.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 6/20 on page 6.
Noted data aut

Added main content text from bbWrapper div.
Added separator line.
Processing article 9/20 on page 8.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 10/20 on page 8.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 11/20 on page 8.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 12/20 on page 8.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 13/20 on page 8.
Noted data author: StarJaunter
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Processing article 14/20 on page 8.
Noted data author: Unknown Author
Found bbWrapper div.
Added main content text from bbWrapper div.
Added separator line.
Pr

Saved document as 'C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Konoha-Sect.docx'.


In [1]:
import os
from docx import Document

def split_doc(file_path, output_folder, max_size=20000):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Load the document
    doc = Document(file_path)

    # Initialize variables
    current_doc = Document()
    doc_num = 1
    current_size = 0

    # Iterate through paragraphs and split into smaller documents
    for para in doc.paragraphs:
        # Add paragraph to current document
        current_doc.add_paragraph(para.text)

        # Estimate the size of the current document in bytes
        current_size += len(para.text.encode('utf-8'))

        # If the current document size exceeds the max size, save and start a new document
        if current_size >= max_size:
            # Save the current document
            output_path = os.path.join(output_folder, f'Konoha Sect Split {doc_num}.docx')
            current_doc.save(output_path)

            # Increment document number and reset current document and size
            doc_num += 1
            current_doc = Document()
            current_size = 0

    # Save any remaining paragraphs in the last document
    if len(current_doc.paragraphs) > 0:
        output_path = os.path.join(output_folder, f'Konoha Sect Split {doc_num}.docx')
        current_doc.save(output_path)

if __name__ == "__main__":
    file_location = r'C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Konoha-Sect.docx'
    output_folder = r'C:\Users\Lenovo\OneDrive\Documents\Books\Good Seeds\Konoha Sect'

    split_doc(file_location, output_folder)
