In [20]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [25]:
def format_character_name(character_name):
    """Turn a character's display name into a URL slug.

    The name is trimmed and lower-cased, accented letters are folded to
    their ASCII base letter, punctuation is dropped, and every run of
    whitespace and/or hyphens becomes a single dash.

    Args:
        character_name (str): Display name, e.g. ``'IG-12 & Grogu'``.

    Returns:
        str: The slug, e.g. ``'ig-12-grogu'``. An empty or all-punctuation
        name yields ``''``.
    """
    # Local import keeps this cell runnable on its own.
    import unicodedata

    formatted_name = character_name.strip().lower()

    # Explicit overrides, checked first. The generic rules below already
    # produce these values; the table is kept as the place to pin any slug
    # that turns out not to follow the generic rules.
    name_mappings = {
        'clone sergeant - phase i': 'clone-sergeant-phase-i',
        'ig-12 & grogu': 'ig-12-grogu',
        'threepio & chewie': 'threepio-chewie'
    }

    if formatted_name in name_mappings:
        return name_mappings[formatted_name]

    # Fold accented letters to plain ASCII ('é' -> 'e'); characters with no
    # ASCII decomposition are dropped.
    # NOTE(review): assumes the site's slugs are ASCII-only — confirm.
    formatted_name = (
        unicodedata.normalize('NFKD', formatted_name)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )

    # Drop everything that is not a word character, whitespace, or a hyphen.
    formatted_name = re.sub(r'[^\w\s-]', '', formatted_name)

    # Collapse each run of whitespace/hyphens into ONE dash, so that " - ",
    # " & " (once the '&' is gone) and double spaces do not become "---".
    formatted_name = re.sub(r'[\s-]+', '-', formatted_name)

    # Removed punctuation at either end can leave a dangling dash.
    return formatted_name.strip('-')

In [22]:
def extract_character_info(character_name, timeout=30):
    """Fetch a character's gear-list page and tally two gear types.

    Builds the gear-list URL from the formatted character name, downloads
    the page, and sums the listed quantities of gear entries whose name
    contains 'Shock Prod' or 'Battle Computer'.

    Args:
        character_name (str): Display name of the character.
        timeout (float): Seconds to wait for the HTTP response before
            giving up. Defaults to 30.

    Returns:
        dict | None: A dict with keys 'Character Name', 'Alignment',
        'Shock Prods Count' and 'Battle Computers Count', or None when the
        page could not be retrieved (a message is printed in that case).
    """
    formatted_name = format_character_name(character_name)
    character_url = f'https://swgoh.gg/characters/{formatted_name}/gear-list/'

    # Without a timeout a stalled connection would hang the whole scrape;
    # network errors are reported the same way as a bad status code.
    try:
        response = requests.get(character_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from {character_url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data from {character_url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Text of the first anchor carrying this exact class string; None when
    # the page has no such anchor.
    # NOTE(review): presumably this is the Light Side / Dark Side button — confirm.
    alignment_tag = soup.find('a', class_='btn btn-sm btn-default-outline')
    alignment = alignment_tag.text.strip() if alignment_tag else None

    shock_prods = 0
    battle_computers = 0

    for item in soup.find_all('li', class_='media list-group-item p-0 character'):
        name_tag = item.find('h5')
        count_tag = item.find('p')
        if name_tag is None or count_tag is None:
            # Entry without a name or a quantity: nothing to tally.
            continue

        # Keep only the digits of the quantity text (e.g. 'x12' -> '12') so
        # stray whitespace or separators cannot break int().
        digits = re.sub(r'\D', '', count_tag.text)
        if not digits:
            continue

        gear_name = name_tag.text.strip()
        gear_count = int(digits)

        if 'Shock Prod' in gear_name:
            shock_prods += gear_count
        elif 'Battle Computer' in gear_name:
            battle_computers += gear_count

    return {
        'Character Name': character_name,
        'Alignment': alignment,
        'Shock Prods Count': shock_prods,
        'Battle Computers Count': battle_computers
    }


In [23]:
def scrape_all_characters(timeout=30):
    """Collect gear information for every name found on the swgoh.gg home page.

    Every <h5> element on the page is treated as a character name and
    passed to extract_character_info(); names whose page cannot be
    retrieved are skipped.

    NOTE(review): <h5> elements are not necessarily character names — the
    recorded run shows section headings being requested and failing with
    404. Confirm whether a more specific selector (or a dedicated
    character-list page) should be used.

    Args:
        timeout (float): Seconds to wait for the home-page response before
            giving up. Defaults to 30.

    Returns:
        list[dict] | None: One info dict per successfully scraped name, in
        page order, or None when the home page itself could not be
        retrieved (a message is printed in that case).
    """
    try:
        response = requests.get('https://swgoh.gg/', timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from swgoh.gg. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data from swgoh.gg. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # De-duplicate while keeping page order: a name that appears twice would
    # otherwise cost a second request and add a duplicate row.
    unique_names = dict.fromkeys(tag.text.strip() for tag in soup.find_all('h5'))
    # An empty heading cannot map to a character page.
    unique_names.pop('', None)

    characters_info = []
    for character_name in unique_names:
        character_info = extract_character_info(character_name)
        if character_info:
            characters_info.append(character_info)

    return characters_info

In [26]:
# Run the scraper; the result is a list of per-character dicts, or a falsy
# value (None / empty list) when nothing could be collected.
all_characters_info = scrape_all_characters()

if not all_characters_info:
    print("No character information retrieved.")
else:
    # Destination workbook, written relative to the working directory.
    excel_file_path = 'swgoh_characters_info.xlsx'

    # One row per character, one column per dict key.
    df = pd.DataFrame(all_characters_info)

    # index=False keeps the DataFrame's row index out of the sheet.
    df.to_excel(excel_file_path, index=False)

    print(f"Character information exported to '{excel_file_path}'.")

Failed to retrieve data from https://swgoh.gg/characters/latest-characters/gear-list/. Status code: 404
Failed to retrieve data from https://swgoh.gg/characters/latest-ships/gear-list/. Status code: 404
Character information exported to 'swgoh_characters_info.xlsx'.
