In [None]:
import os
from urllib.parse import urljoin
from urllib.request import urlopen

import matplotlib.pyplot as plt
import networkx as nx
import requests
from bs4 import BeautifulSoup

### 1. Crawl all character names and page links from the character list page

In [None]:
# Download the character list page.
# The context manager closes the HTTP connection even if reading fails, and
# the timeout keeps the cell from hanging forever on a stalled connection.
with urlopen(
    "https://onepiece.fandom.com/wiki/List_of_Canon_Characters", timeout=30
) as raw_response:
    response = raw_response.read()
soup = BeautifulSoup(response, "html.parser")

# Find the first table whose class attribute is exactly "fandom-table sortable"
table = soup.find("table", class_="fandom-table sortable")

# Maps character name -> relative URL of the character's page.
# Created before the table check so that later cells can always rely on
# `names` existing, even if the page layout changed and no table was found.
names = {}

if table is None:
    print("Character table not found on the list page; `names` is empty.")
else:
    # html.parser does not insert a <tbody> that is absent from the markup,
    # so fall back to searching the table itself for rows.
    tbody = table.find("tbody") or table
    rows = tbody.find_all("tr")

    for row in rows:
        # Header rows use <th>, so they yield no <td> cells and are skipped
        # by the length check below.
        cols = row.find_all("td")
        if len(cols) <= 1:
            continue

        # The character link lives in the second column. Requiring an href
        # skips anchors that carry no target; a missing href would otherwise
        # be stored as None and later resolve to the wiki's base URL.
        name_link = cols[1].find("a", href=True)
        if name_link is None:
            continue

        # Strip surrounding whitespace because the name is later used as a
        # file name; skip links that have no visible text at all.
        character_name = name_link.get_text(strip=True)
        if character_name:
            names[character_name] = name_link["href"]

### 2. Save the main page of each character to a text file

In [None]:
# Base URL of the One Piece Wiki
base_url = "https://onepiece.fandom.com"

# Folder that receives the downloaded pages. Built with os.path.join so the
# separator matches the current platform (a hard-coded ".\\onepiece" would
# create a directory literally named ".\onepiece" on Linux/macOS).
output_folder = os.path.join(".", "onepiece")
os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't already exist

# Character name -> full URL for every request that timed out
timeout_links = {}

# `names` maps character names to relative links (built by the crawl cell above)
for name, relative_link in names.items():
    # Construct the full URL by combining the base URL and the relative link
    full_url = urljoin(base_url, relative_link)
    file_path = os.path.join(output_folder, f"{name}.txt")

    # --- Download -----------------------------------------------------
    try:
        # Fetch the page with a 2-second timeout; raise on HTTP error codes
        response = requests.get(full_url, timeout=2)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        # Record the timed-out link in `timeout_links`
        timeout_links[name] = full_url
        continue
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {name}: {e}")
        continue

    # --- Save ---------------------------------------------------------
    # Handled separately from the download so that one unwritable file name
    # (e.g. a character name containing a path separator) does not abort the
    # whole crawl with an uncaught OSError.
    try:
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(response.text)
    except OSError as e:
        print(f"Failed to save {name} to {file_path}: {e}")
        continue

    print(f"Saved {name} page to {file_path}")

# Make timeouts visible instead of only recording them silently
if timeout_links:
    print(f"{len(timeout_links)} page(s) timed out; see `timeout_links`.")
