In [1]:
from urllib.request import urlopen
import matplotlib.pyplot as plt
import networkx as nx
from bs4 import BeautifulSoup

In [2]:
raw_response=urlopen("https://onepiece.fandom.com/wiki/List_of_Canon_Characters")
response=raw_response.read()

In [3]:
soup=BeautifulSoup(response,"html.parser")
table = soup.find("table", class_="fandom-table sortable")

if table:
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    names={}
    for row in rows:
        cols = row.find_all("td")
        if len(cols) > 1: 
            name_link = cols[1].find("a")
            if name_link:
                names[name_link.text] = name_link.get("href")

In [None]:

arc_list = [
    "Amazon Lily Arc", "Arabasta Arc", "Arabasta Arc", "Arlong Park Arc", "Arlong Park Arc",
    "Baratie Arc", "Baratie Arc", "Dressrosa Arc", "Dressrosa Arc", "Drum Island Arc", 
    "Drum Island Arc", "Egghead Arc", "Egghead Arc", "Elbaph Arc", "Elbaph Arc", 
    "Enies Lobby Arc", "Enies Lobby Arc", "Fish-Man Island Arc", "Fish-Man Island Arc",
    "Impel Down Arc", "Impel Down Arc", "Jaya Arc", "Jaya Arc", "Levely Arc", "Levely Arc", 
    "Little Garden Arc", "Little Garden Arc", "Loguetown Arc", "Loguetown Arc", 
    "Long Ring Long Land Arc", "Long Ring Long Land Arc", "Marineford Arc", "Marineford Arc", 
    "Orange Town Arc", "Orange Town Arc", "Post-Enies Lobby Arc", "Post-Enies Lobby Arc", 
    "Post-War Arc", "Post-War Arc", "Punk Hazard Arc", "Punk Hazard Arc", 
    "Return to Sabaody Arc", "Return to Sabaody Arc", "Reverse Mountain Arc", 
    "Reverse Mountain Arc", "Romance Dawn Arc", "Romance Dawn Arc", 
    "Sabaody Archipelago Arc", "Sabaody Archipelago Arc", "Skypiea Arc", "Skypiea Arc", 
    "Syrup Village Arc", "Syrup Village Arc", "Thriller Bark Arc", "Thriller Bark Arc", 
    "Wano Country Arc", "Wano Country Arc", "Water 7 Arc", "Water 7 Arc", 
    "Whisky Peak Arc", "Whisky Peak Arc", "Whole Cake Island Arc", "Whole Cake Island Arc", 
    "Zou Arc", "Zou Arc"
]


unique_arcs = sorted(set(arc_list)) 
arc_dict = {i + 1: arc for i, arc in enumerate(unique_arcs)}


In [None]:
from bs4 import BeautifulSoup
import re
import networkx as nx
import os

# Initialize the dictionary of characters
person_dict = names  # Example characters

# Generate possible name variants (e.g., first name, last name) for each character
def generate_name_variants(person_dict):
    name_variants = {}
    for full_name in person_dict.keys():
        parts = full_name.split()  # Assuming the name is space-separated
        name_variants[full_name] = parts  # Store the full name and its split parts
    return name_variants

name_variants = generate_name_variants(person_dict)

# Define the folder containing HTML files
html_folder = "C:\\Users\\17675\\Desktop\\02805\\onepiece"

# Locate the specific <h2> tag whose child <span> contains "History"
def find_history_section(soup):
    for h2_tag in soup.find_all("h2"):  # Loop through all <h2> tags
        span = h2_tag.find("span", class_="mw-headline")  # Find the <span> tag
        if span and span.text.strip() == "History":  # Check if the span text is "History"
            return h2_tag  # Return the <h2> tag containing "History"
    return None  # Return None if the target <h2> tag is not found

# Prepare a dictionary to store arc relationship graphs
arc_graph_map = {}
arc_pattern = re.compile(r".*Arc$")  # Regular expression to match titles ending with "Arc"

# Create a regular expression pattern for a full name (to avoid incorrect matches of middle parts)
def create_full_name_pattern(full_name):
    return re.compile(rf'\b{re.escape(full_name)}\b', re.IGNORECASE)

# Create regular expression patterns for name variants
def create_name_variant_patterns(variants):
    return [re.compile(rf'\b{re.escape(variant)}\b', re.IGNORECASE) for variant in variants]

empty_person = []  # List to track persons with no matching history section

# Iterate through the HTML files
for html_file in os.listdir(html_folder):
    if html_file.endswith(".txt"):  # Only process text files
        person_name = os.path.splitext(html_file)[0]  # The file name corresponds to the person's name
        file_path = os.path.join(html_folder, html_file)
        
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()  # Read the file content
        
        # Parse the HTML content
        soup = BeautifulSoup(content, "html.parser")
        history_section = find_history_section(soup)

        if not history_section:  # If no "History" section is found, skip the person
            print(f"No 'History' section for {person_name}")
            empty_person.append(person_name)
            continue

        # Get content from the "History" section until the next <h2>
        current = history_section.find_next_sibling()
        history_content = []

        while current:
            if current.name == "h2":  # Stop when another <h2> is found
                break
            history_content.append(current)
            current = current.find_next_sibling()

        # Track which names have been processed (to determine first appearances)
        processed_names = set() 

        # Track fully matched full names
        matched_full_names = set()  

        for element in history_content:
            if element.name in ["h3", "h4"]:  # Process only <h3> or <h4> tags
                headline = element.find("span", class_="mw-headline")
                if headline and arc_pattern.match(headline.text):  # Check if it matches an "Arc"
                    current_arc = headline.text.strip()

                    # Initialize a graph for the arc if not already present
                    if current_arc not in arc_graph_map:
                        arc_graph_map[current_arc] = nx.Graph()

                    # Extract the arc content
                    arc_content = []
                    sibling = element.find_next_sibling()

                    while sibling and not (sibling.name in ["h3", "h4", "h2"] and sibling.find("span", class_="mw-headline")):
                        arc_content.append(sibling.text if sibling else "")
                        sibling = sibling.find_next_sibling()

                    # Join the content of the arc into a single string
                    arc_text = " ".join(arc_content)

                    # Find all links in the HTML
                    all_links = soup.find_all("a", href=True)
                    link_names = {link.get_text().strip() for link in all_links}

                    for full_name, variants in name_variants.items():
                        full_name_pattern = create_full_name_pattern(full_name)

                        # Skip the person if their full name has not been processed and no link is found
                        if full_name not in processed_names:
                            if full_name in link_names:  # If the full name appears with a link
                                processed_names.add(full_name)
                            else:
                                continue

                        # Strict full name match
                        if full_name_pattern.search(arc_text) and full_name != person_name and full_name in person_dict:
                            matched_full_names.add(full_name)  # Mark the full name as matched
                            arc_graph_map[current_arc].add_edge(person_name, full_name)

                        # Match name variants if the full name has already been matched
                        elif full_name in matched_full_names:
                            for variant in variants:
                                variant_pattern = create_name_variant_patterns([variant])[0]
                                if variant_pattern.search(arc_text) and full_name != person_name:
                                    arc_graph_map[current_arc].add_edge(person_name, full_name)
                                    break

        print(person_name)  # Print the current person's name


In [None]:
import requests  # Import the requests library for making HTTP requests
# Prepare to store new arc relationship graphs
arc_pattern = re.compile(r".*Arc$")  # Regular expression to match titles ending with "Arc"
sage_pattern = re.compile(r".*Saga$")  # Regular expression to match titles ending with "Saga"

# Function to generate possible name variants (e.g., first name, last name) for each person
def generate_name_variants(person_dict):
    name_variants = {}
    for full_name in person_dict.keys():
        parts = full_name.split()  # Assume that names are split by spaces
        name_variants[full_name] = parts  # Store full names and their component parts
    return name_variants

name_variants = generate_name_variants(names)  # names is your existing dictionary

# Set to store names that have been processed to avoid duplication
processed_names = set()

# Function to check if a text represents a person's name, considering links and first-time occurrences
def is_person_name(text, link_tag):
    # If it's the first occurrence and no link is found, it's not considered a name
    if text not in processed_names:
        processed_names.add(text)
        # Skip if no link exists
        if not link_tag:
            return False
    return True

# Function to generate a regular expression pattern for matching full names (to avoid partial matches)
def create_full_name_pattern(full_name):
    return re.compile(rf'\b{re.escape(full_name)}\b', re.IGNORECASE)

# Function to create regular expression patterns for name variants
def create_name_variant_patterns(variants):
    return [re.compile(rf'\b{re.escape(variant)}\b', re.IGNORECASE) for variant in variants]

# Loop through the names that are not found previously (empty_person)
for person_name in empty_person:
    print(person_name)
    if person_name not in names:
        print(f"URL for {person_name} not found in names dictionary.")
        continue

    # Get the new URL for the person's history section
    original_url = names[person_name]
    history_url = f"https://onepiece.fandom.com{original_url}/History"

    try:
        # Make a request to fetch the page content
        response = requests.get(history_url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all h2 tags, and filter those with titles ending in "Saga"
        h2_tags = soup.find_all("h2")
        sage_h2s = [h2 for h2 in h2_tags if h2.find("span", class_="mw-headline") and sage_pattern.match(h2.find("span", class_="mw-headline").text.strip())]

        # Skip if no matching "Saga" section is found
        if not sage_h2s:
            print(f"No 'Saga' section found for {person_name} at {history_url}.")
            continue
        
        # Iterate through the found "Saga" titles
        for arc_start in sage_h2s:

            current = arc_start.find_next_sibling()
            in_arc_section = True  # Enter a valid "Arc" section

            while current:
                if current.name == "h2":  # When a new h2 title is encountered
                    next_span = current.find("span", class_="mw-headline")
                    if next_span and not sage_pattern.match(next_span.text.strip()):
                        in_arc_section = False  # Leave the "Arc" section
                        break

                elif current.name == "h3" and in_arc_section:  # Check if it's an "Arc" title
                    headline = current.find("span", class_="mw-headline")
                    if headline and arc_pattern.match(headline.text.strip()):
                        current_arc = headline.text.strip()

                        # Initialize the graph for the current arc if it doesn't exist
                        if current_arc not in arc_graph_map:
                            arc_graph_map[current_arc] = nx.Graph()

                        # Extract the content of the arc
                        arc_text = []
                        sibling = current.find_next_sibling()
                        
                        # Iterate through content until the next "Saga" or "Arc" title is found
                        while sibling and not (
                            (sibling.name == "h2" and sibling.find("span", class_="mw-headline") and sage_pattern.match(
                                sibling.find("span", class_="mw-headline").text.strip()))  # New 'Saga' section found
                            or (sibling.name == "h3" and sibling.find("span", class_="mw-headline") and arc_pattern.match(
                                sibling.find("span", class_="mw-headline").text.strip()))  # New "Arc" section found
                            or (sibling.name == "h2" and sibling.find("span", class_="mw-headline") and not sage_pattern.match(
                                sibling.find("span", class_="mw-headline").text.strip()))  # Non-"Saga" h2 title found
                        ):
                            arc_text.append(sibling.text if sibling else "")
                            sibling = sibling.find_next_sibling()

                        # Join arc content into a single string
                        arc_text = " ".join(arc_text)

                        # Find all links and extract their names
                        all_links = soup.find_all("a", href=True)
                        link_names = {link.get_text().strip() for link in all_links}

                        # Find all matching person names
                        matched_full_names = set()  # Track fully matched names

                        for full_name, variants in name_variants.items():
                            full_name_pattern = create_full_name_pattern(full_name)

                            # Skip if the person's name is first encountered without a link
                            if full_name not in processed_names:
                                if full_name in link_names:  # If the name appears with a link
                                    processed_names.add(full_name)
                                else:
                                    continue

                            # Strict full name match
                            if full_name_pattern.search(arc_text) and full_name != person_name and full_name in names:
                                matched_full_names.add(full_name)  # Mark as matched
                                arc_graph_map[current_arc].add_edge(person_name, full_name)

                            # Match variants only if the full name has already matched
                            elif full_name in matched_full_names:
                                for variant in variants:
                                    variant_pattern = create_name_variant_patterns([variant])[0]
                                    if variant_pattern.search(arc_text) and full_name != person_name:
                                        arc_graph_map[current_arc].add_edge(person_name, full_name)
                                        break

                current = current.find_next_sibling()

    except requests.RequestException as e:
        print(f"Failed to fetch {history_url} for {person_name}: {e}")


Bartholomew Kuma
Boa Hancock
No 'Saga' section found for Boa Hancock at https://onepiece.fandom.com/wiki/Boa_Hancock/History.
Brook
Buggy
Caesar Clown
Charlotte Linlin
Crocodile
Donquixote Doflamingo
Edward Newgate
Franky
Gaikotsu Yukichi
No 'Saga' section found for Gaikotsu Yukichi at https://onepiece.fandom.com/wiki/Belly#Overview/History.
Jack-in-the-Box
No 'Saga' section found for Jack-in-the-Box at https://onepiece.fandom.com/wiki/Zombie#Jack-in-the-Box/History.
Jew Wall
Failed to fetch https://onepiece.fandom.com/wiki/Jew_Wall/History for Jew Wall: 404 Client Error: Not Found for url: https://onepiece.fandom.com/wiki/Jew_Wall/History
Jinbe
Kaidou
Kakunoshin
Failed to fetch https://onepiece.fandom.com/wiki/Kakunoshin/History for Kakunoshin: 404 Client Error: Not Found for url: https://onepiece.fandom.com/wiki/Kakunoshin/History
Kin'emon
No 'Saga' section found for Kin'emon at https://onepiece.fandom.com/wiki/Kin%27emon/History.
Kumaguchi Ichiro
No 'Saga' section found for Kumaguch

In [None]:
import re
import networkx as nx

# Standard arc names list
standard_arc_names = [
    "Amazon Lily Arc", "Arabasta Arc", "Arlong Park Arc", "Baratie Arc", "Dressrosa Arc", 
    "Drum Island Arc", "Egghead Arc", "Elbaph Arc", "Enies Lobby Arc", "Fish-Man Island Arc",
    "Impel Down Arc", "Jaya Arc", "Levely Arc", "Little Garden Arc", "Loguetown Arc", 
    "Long Ring Long Land Arc", "Marineford Arc", "Orange Town Arc", "Post-Enies Lobby Arc", 
    "Post-War Arc", "Punk Hazard Arc", "Return to Sabaody Arc", "Reverse Mountain Arc", 
    "Romance Dawn Arc", "Sabaody Archipelago Arc", "Skypiea Arc", "Syrup Village Arc", 
    "Thriller Bark Arc", "Wano Country Arc", "Water 7 Arc", "Whisky Peak Arc", "Whole Cake Island Arc", "Zou Arc"
]

# Create a dictionary mapping normalized names to the standard names
normalized_name_map = {name.replace(" ", "").replace("-", "").lower(): name for name in standard_arc_names}

# Function to normalize arc names (remove spaces and hyphens, and convert to lowercase)
def normalize_arc_name(name):
    return name.replace(" ", "").replace("-", "").lower()

# Merge similar arc graphs based on the normalized arc names
def merge_similar_arcs(arc_graph_map, normalized_name_map):
    to_delete = []  # List to store arcs that need to be deleted
    merged_graphs = {}  # Dictionary to store merged arc graphs

    for arc_name in arc_graph_map:
        # Normalize the arc name
        normalized_name = normalize_arc_name(arc_name)

        # Find the reference name corresponding to the normalized name
        reference_name = normalized_name_map.get(normalized_name, None)

        if reference_name:
            # If the normalized name matches a standard name, merge the graph
            if reference_name in merged_graphs:
                # Merge the current graph into the existing graph
                merged_graphs[reference_name] = nx.compose(merged_graphs[reference_name], arc_graph_map[arc_name])
                to_delete.append(arc_name)
            else:
                # If it's the first time encountering the standard name, add the graph
                merged_graphs[reference_name] = arc_graph_map[arc_name]
        else:
            # If no match is found for the standard name, delete the arc graph
            to_delete.append(arc_name)

    # Delete the unnecessary arc graphs
    for arc_name in to_delete:
        del arc_graph_map[arc_name]

    # Add the merged graphs to the arc_graph_map
    arc_graph_map.update(merged_graphs)

# Call the function to merge similar arcs
merge_similar_arcs(arc_graph_map, normalized_name_map)

# Print the merged results
for arc_name, graph in arc_graph_map.items():
    print(f"{arc_name} with {len(graph.nodes)} nodes and {len(graph.edges)} edges")


Marineford Arc with 165 nodes and 623 edges
Post-War Arc with 166 nodes and 261 edges
Zou Arc with 127 nodes and 419 edges
Dressrosa Arc with 188 nodes and 1185 edges
Thriller Bark Arc with 78 nodes and 350 edges
Wano Country Arc with 439 nodes and 2279 edges
Fish-Man Island Arc with 94 nodes and 433 edges
Levely Arc with 131 nodes and 352 edges
Egghead Arc with 311 nodes and 1057 edges
Skypiea Arc with 65 nodes and 261 edges
Impel Down Arc with 76 nodes and 241 edges
Little Garden Arc with 37 nodes and 102 edges
Drum Island Arc with 34 nodes and 103 edges
Arabasta Arc with 82 nodes and 347 edges
Whole Cake Island Arc with 162 nodes and 869 edges
Return to Sabaody Arc with 50 nodes and 116 edges
Punk Hazard Arc with 73 nodes and 285 edges
Romance Dawn Arc with 32 nodes and 65 edges
Loguetown Arc with 55 nodes and 121 edges
Jaya Arc with 75 nodes and 210 edges
Amazon Lily Arc with 58 nodes and 138 edges
Enies Lobby Arc with 85 nodes and 412 edges
Sabaody Archipelago Arc with 99 nodes an