In [1]:
import re
import os
import networkx as nx
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen
import networkx as nx
from bs4 import BeautifulSoup
import os
import requests
from urllib.parse import urljoin

## Get the characters' names and wiki links

In [None]:
# Fetch the canon character list. The context manager closes the HTTP response
# even if reading it fails.
with urlopen("https://onepiece.fandom.com/wiki/List_of_Canon_Characters") as raw_response:
    response = raw_response.read()
soup = BeautifulSoup(response, "html.parser")
table = soup.find("table", class_="fandom-table sortable")

# Maps the link text found in the second column of each row to that link's href.
# Created before the table check so that later cells can always iterate over it,
# even when the table was not found.
names = {}
if table:
    # html.parser does not insert a <tbody> that is absent from the markup,
    # so fall back to searching the table itself.
    tbody = table.find("tbody") or table
    rows = tbody.find_all("tr")
    for row in rows:
        cols = row.find_all("td")
        if len(cols) > 1:
            name_link = cols[1].find("a")
            # An anchor without an href gives nothing to download, so skip it.
            if name_link and name_link.get("href"):
                names[name_link.text] = name_link.get("href")
else:
    print("Character table not found; no names were collected.")

## Download and save each character's wiki page

In [None]:
base_url = "https://onepiece.fandom.com"

output_folder = "../onepiece"
os.makedirs(output_folder, exist_ok=True)
timeout_links = {}  # name -> URL of every page whose request timed out

# A single Session reuses the connection to the wiki host across all requests.
with requests.Session() as session:
    for name, relative_link in names.items():
        full_url = urljoin(base_url, relative_link)
        # A path separator inside a name would be interpreted as a directory
        # and make open() fail, so replace it in the file name only.
        safe_name = name.replace("/", "_").replace("\\", "_")
        file_path = os.path.join(output_folder, f"{safe_name}.txt")
        try:
            response = session.get(full_url, timeout=2)
            response.raise_for_status()
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(response.text)

            print(f"Saved {name} page to {file_path}")

        except requests.exceptions.Timeout:
            # Remember the page so it can be inspected or retried later.
            timeout_links[name] = full_url

        except requests.exceptions.RequestException as e:
            print(f"Failed to download {name}: {e}")

        # Must come after the requests handlers: RequestException is itself an
        # OSError subclass, and this branch is meant for file-system errors.
        except OSError as e:
            print(f"Failed to save {name}: {e}")

if timeout_links:
    print(f"{len(timeout_links)} page(s) timed out; see timeout_links")

In [None]:
# Function to read all file contents into a dictionary
def load_files_to_memory(file_paths):
    """
    Read every given file as UTF-8 text and index it by its base name.

    Args:
        file_paths (iterable of str): Paths of the files to read.

    Returns:
        dict: Maps each file's name without directory and extension to the
        full text of that file. If two paths share the same stem, the later
        one wins.
    """
    def _stem(path):
        # "dir/Some Name.txt" -> "Some Name"
        return os.path.splitext(os.path.basename(path))[0]

    def _read_text(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    return {_stem(path): _read_text(path) for path in file_paths}

# Function to process one file and find relationships
def process_file(person_name, file_content, all_names):
    """
    List the other names that occur in one person's file content.

    Args:
        person_name (str): The name the content belongs to.
        file_content (str): The text to search.
        all_names (iterable of str): Candidate names to look for.

    Returns:
        list: One (person_name, other_name) tuple for every candidate that
        differs from person_name and is a substring of file_content, in the
        order the candidates were given.
    """
    return [
        (person_name, candidate)
        for candidate in all_names
        if candidate != person_name and candidate in file_content
    ]

# Get file paths and names
# Only regular files are kept: os.listdir also returns sub-directories, and
# passing one of those to open() would raise. Sorting makes the order
# reproducible, since os.listdir returns entries in arbitrary order.
pages_dir = r'./../onepiece'
file_paths = sorted(
    path
    for path in (os.path.join(pages_dir, entry) for entry in os.listdir(pages_dir))
    if os.path.isfile(path)
)
file_names = [os.path.splitext(os.path.basename(f))[0] for f in file_paths]

# Load all file contents into memory
file_contents = load_files_to_memory(file_paths)

## Functions to clean and save the HTML content

In [5]:
def extract_clean_content(html_content):
    """
    Extracts and cleans the main text content from the OnePiece Wiki HTML page.

    The text is taken from the first <div class="mw-parser-output">. Scripts,
    styles, tables, asides, the table of contents and a fixed list of unwanted
    sections are removed, bracketed numeric references such as [1] or [6.0]
    are stripped, and blank lines are dropped.

    Args:
        html_content (str): The raw HTML content of the page.

    Returns:
        str: The cleaned and filtered text content, one text fragment per
        line, or the literal string "Main content not found." when the page
        has no main content container.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Locate the main content container
    content_div = soup.find('div', class_='mw-parser-output')
    if not content_div:
        return "Main content not found."

    # Remove unwanted elements: scripts, styles, tables and asides
    for unwanted in content_div(['script', 'style', 'table', 'aside']):
        unwanted.extract()

    # Remove the "Contents" section (table of contents) explicitly
    toc_div = soup.find('div', {'id': 'toc', 'class': 'toc'})
    if toc_div:
        toc_div.extract()

    # Remove specific unwanted sections (e.g., Gallery, Trivia, References, etc.)
    unwanted_sections = ["Video Games", "Playable Appearance", "Gallery", "Merchandise","Translation and Dub Issues", "Trivia", "References", "Site Navigation"]
    heading_tags = ['h2', 'h3', 'h4']  # Headers indicate sections
    for header in content_div.find_all(heading_tags):
        if header.parent is None:
            # Already detached together with an enclosing unwanted section
            continue
        if not any(section in header.get_text() for section in unwanted_sections):
            continue

        # A section ends at the next heading of the same or a higher level.
        # Deeper headings are sub-sections of the unwanted section, so they
        # are removed along with it instead of ending the removal early.
        header_level = int(header.name[1])
        for sibling in header.find_next_siblings():
            if sibling.name in heading_tags and int(sibling.name[1]) <= header_level:
                break
            sibling.extract()
        header.extract()

    # Get cleaned text content, one text fragment per line
    text_content = content_div.get_text(separator='\n', strip=True)

    # Remove bracketed references like [1], [6.0], [28], etc.
    text_content = re.sub(r'\[\d+(\.\d+)?\]', '', text_content)

    # Remove empty lines and normalize spacing
    text_content = "\n".join(line.strip() for line in text_content.splitlines() if line.strip())

    return text_content

In [None]:
def save_cleaned_content(html_contents, output_folder):
    """
    Write one cleaned text file per entry of a dictionary of HTML documents.

    The output folder is created if it does not exist yet. Each value is
    passed through extract_clean_content and the result is written, UTF-8
    encoded, to "<output_folder>/<key>.txt", overwriting any existing file.

    Args:
        html_contents (dict): A dictionary with filenames as keys and HTML content as values.
        output_folder (str): Path to the folder where cleaned files will be saved.
    """
    os.makedirs(output_folder, exist_ok=True)

    for file_name, raw_html in html_contents.items():
        cleaned_text = extract_clean_content(raw_html)
        destination = os.path.join(output_folder, f"{file_name}.txt")
        with open(destination, "w", encoding="utf-8") as out_file:
            out_file.write(cleaned_text)

# Clean every loaded HTML page and write the results next to the raw pages.
output_folder = r"./../onepiece_cleaned"
save_cleaned_content(html_contents=file_contents, output_folder=output_folder)