In [None]:
import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any


def get_golgg_game_stats(
    game_id: int, timeout: float = 10.0
) -> Optional["BeautifulSoup"]:
    """
    Download and parse the gol.gg statistics page for a single game.

    Args:
        game_id: The unique identifier for the game on gol.gg
                (e.g., 64750 from https://gol.gg/game/stats/64750/page-game/)
        timeout: Seconds to wait for the server before abandoning the request

    Returns:
        The parsed page as a BeautifulSoup object, or None if the request failed
        (connection error, timeout, or a 4XX/5XX response)

    Example:
        soup = get_golgg_game_stats(64750)
    """
    url = f"https://gol.gg/game/stats/{game_id}/page-game/"

    # Send a browser-style User-Agent header with the request
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
    headers = {"User-Agent": user_agent}

    try:
        # requests has no default timeout; without one a stalled connection
        # would block the caller forever
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise exception for 4XX/5XX responses
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too
        print(f"Error fetching data: {e}")
        return None

    # Parse the HTML content
    return BeautifulSoup(response.content, "html.parser")

In [None]:
# Fetch the example game page (ID 64750) and dump the parsed HTML;
# this prints None if the request failed. Later cells reuse `soup`.
soup = get_golgg_game_stats(64750)
print(soup)

In [None]:
def get_game_duration(soup: BeautifulSoup) -> Optional[str]:
    """
    Read the game length displayed on a parsed gol.gg game page.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Game duration as a string (e.g., "44:59") or None if not found
    """
    # Locate the text node that reads exactly "Game Time"
    label = soup.find(string="Game Time")
    if not label:
        return None

    # The duration is the text of the first <h1> that follows the label
    duration_heading = label.find_next("h1")
    if not duration_heading:
        return None

    return duration_heading.text.strip()


def parse_game_duration(duration: str) -> int:
    """
    Parse game duration string into total seconds.

    Accepts "MM:SS" (e.g., "44:59") and "H:MM:SS" (e.g., "1:02:03").

    Args:
        duration: Game duration string; None or "" is treated as unknown

    Returns:
        Duration in seconds (e.g., 2699 for "44:59"), or 0 when the input is
        empty or does not have two or three colon-separated fields

    Raises:
        ValueError: If a field is not an integer (e.g., "ab:cd")
    """
    if not duration:
        return 0

    time_parts = duration.strip().split(":")
    if len(time_parts) not in (2, 3):
        return 0

    # Fold the fields left to right in base 60, so the same loop handles
    # both MM:SS and H:MM:SS
    total_seconds = 0
    for part in time_parts:
        total_seconds = total_seconds * 60 + int(part)

    return total_seconds


def get_game_patch(soup: BeautifulSoup) -> Optional[str]:
    """
    Extract the game patch version from the parsed HTML.

    The patch is looked up relative to the "Game Time" label: the function
    climbs three enclosing <div> elements from the label and then searches
    that container for the div whose class attribute is "col-3 text-right".

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Game patch as a string (e.g., "v15.4") or None if the label, one of
        the enclosing divs, or the patch div is missing
    """
    label = soup.find(string="Game Time")
    if not label:
        return None

    # Climb one <div> at a time; find_parent returns None when there is no
    # such ancestor, so check each step instead of chaining the calls
    container = label
    for _ in range(3):
        container = container.find_parent("div")
        if container is None:
            return None

    # Find the div with class "col-3 text-right" which contains the patch
    patch_div = container.find("div", class_="col-3 text-right")
    if not patch_div:
        return None

    return patch_div.text.strip()


def parse_game_version(patch: str) -> tuple[int, int]:
    """
    Split a patch label into its numeric major and minor parts.

    Args:
        patch: Game patch version string (e.g., "v15.4")

    Returns:
        Tuple containing (major_patch, minor_patch); (0, 0) when the input is
        empty or does not start with "v", and minor is 0 when absent
    """
    # Only labels of the form "v<major>[.<minor>...]" are recognised
    if not patch or not patch.startswith("v"):
        return 0, 0

    pieces = patch[1:].split(".")
    major = int(pieces[0])
    minor = int(pieces[1]) if len(pieces) > 1 else 0

    return major, minor


# Run both extractors on the module-level `soup`, show the raw strings,
# then show what the parsers make of them.
patch = get_game_patch(soup)
duration = get_game_duration(soup)

print(patch, duration)

print(parse_game_version(patch))
print(parse_game_duration(duration))

In [None]:
def get_team_info(soup: BeautifulSoup) -> Dict[str, Any]:
    """
    Collect both team names and the blue-side result from a parsed game page.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Dictionary containing:
        - blue_team: Name of the blue team (None if not found)
        - red_team: Name of the red team (None if not found)
        - blue_won: 1 if the blue header text contains "- WIN", 0 if it does
          not, None if the blue header is missing
    """
    blue_header = soup.find("div", class_="blue-line-header")
    red_header = soup.find("div", class_="red-line-header")

    def team_name(header):
        # The team name is the text of the first link inside the header
        link = header.find("a") if header else None
        return link.text.strip() if link else None

    # The result is only read from the blue header
    blue_won = None
    if blue_header:
        blue_won = int("- WIN" in blue_header.text.strip())

    return {
        "blue_team": team_name(blue_header),
        "red_team": team_name(red_header),
        "blue_won": blue_won,
    }


# Extract team names and the blue-side result from the module-level `soup`
team_info = get_team_info(soup)

print(team_info)

In [None]:
def get_champion_names(soup: BeautifulSoup) -> list[str]:
    """
    List the champions picked in a game, read from the player tables.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Champion names in page order: every "playersInfosLine" table in turn,
        and within each table every row after the first that has a champion icon
    """
    icon_class = "champion_icon rounded-circle"
    names = []

    for team_table in soup.find_all("table", class_="playersInfosLine"):
        # The first row of each table is the header, so drop it
        player_rows = team_table.find_all("tr")[1:]
        icons = (player_row.find("img", class_=icon_class) for player_row in player_rows)
        # The champion name is carried by the icon's alt attribute
        names.extend(icon.get("alt") for icon in icons if icon)

    return names


# List the champions found on the module-level `soup`; the next cell maps them to IDs
champion_names = get_champion_names(soup)

print(champion_names)

In [None]:
from typing import List, Optional, Dict, Any
from utils.rl.champions import Champion
import difflib  # For fuzzy string matching


def map_champion_names_to_ids(champion_names: List[str]) -> List[int]:
    """
    Map champion names to their corresponding IDs using the Champion enum with fuzzy matching.

    Each name is resolved by the first strategy that succeeds:
    1. known alias (e.g., "Nunu" -> "Nunu & Willump"),
    2. exact display-name match,
    3. case-insensitive match, with or without apostrophes,
    4. closest difflib match with similarity >= 0.6 (a warning is printed).

    Args:
        champion_names: List of champion names

    Returns:
        List of champion IDs in the same order

    Raises:
        ValueError: If an entry is not a string (e.g., None) or a champion
            name cannot be mapped to an ID
    """
    # Reject missing/non-string names up front so callers that catch
    # ValueError see one failure type, rather than an AttributeError from
    # calling .lower() on None
    for name in champion_names:
        if not isinstance(name, str):
            raise ValueError(f"Could not map champion name '{name}' to an ID.")

    # Create a mapping of display names to champion IDs for quick lookup
    name_to_id_map = {champion.display_name: champion.id for champion in Champion}
    all_champion_names = list(name_to_id_map)

    # Normalized name -> display name; both the lowercase form and the
    # lowercase form without apostrophes are stored
    normalized_name_map = {}
    for display_name in all_champion_names:
        lowered_display = display_name.lower()
        normalized_name_map[lowered_display] = display_name
        normalized_name_map[lowered_display.replace("'", "")] = display_name

    # Special case handling for known mismatches
    special_cases = {"nunu": "Nunu & Willump"}

    champion_ids = []
    for name in champion_names:
        # Replace known aliases before any lookup
        name = special_cases.get(name.lower(), name)
        lowered = name.lower()

        matched_display_name = None
        if name in name_to_id_map:
            matched_display_name = name
        else:
            # Handles case differences and missing apostrophes
            for key in (lowered, lowered.replace("'", "")):
                if key in normalized_name_map:
                    matched_display_name = normalized_name_map[key]
                    break

        # If still not found, try fuzzy matching
        if matched_display_name is None:
            closest_matches = difflib.get_close_matches(
                name, all_champion_names, n=1, cutoff=0.6
            )
            if not closest_matches:
                raise ValueError(f"Could not map champion name '{name}' to an ID.")

            closest_match = closest_matches[0]
            champion_id = name_to_id_map[closest_match]
            # Fuzzy matches can be wrong, so make them visible in the output
            print(
                f"Warning: Using fuzzy match for '{name}' -> '{closest_match}' (ID: {champion_id})"
            )
            matched_display_name = closest_match

        champion_ids.append(name_to_id_map[matched_display_name])

    return champion_ids


# Convert the scraped champion names to Champion enum IDs and show the result
champion_ids = map_champion_names_to_ids(champion_names)

print(champion_ids)

In [None]:
def get_tournament_name(soup: BeautifulSoup) -> Optional[str]:
    """
    Read the tournament name from the tournament-stats link on a game page.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Tournament name as a string or None if not found
    """

    def links_to_tournament(href):
        # Anchors without an href are passed in as None
        return bool(href) and "../tournament/tournament-stats" in href

    # The first anchor pointing at a tournament-stats page carries the name
    anchor = soup.find("a", href=links_to_tournament)

    return anchor.text.strip() if anchor else None


# Example usage: read the tournament name from the module-level `soup`
tournament_name = get_tournament_name(soup)
print(tournament_name)

# Full scraping


In [None]:
import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Any, Optional, Tuple
import random
from tqdm import tqdm  # For progress bar


def scrape_golgg_games(
    start_game_id: int,
    output_file_path: str,
    min_major_version: int = 14,
    request_delay: Tuple[float, float] = (1.0, 3.0),
    limit: Optional[int] = None,
) -> pd.DataFrame:
    """
    Scrape game data from gol.gg starting from a specific game ID and working backwards.
    Save the results to a parquet file.

    Scraping stops at the first of: a game ID already present in the existing
    parquet file, a game whose major patch is below ``min_major_version``,
    game ID 0, ``limit`` new games collected, or a keyboard interrupt.
    Progress is written to ``output_file_path`` every 100 new games and once
    more on exit. Pages that cannot be fetched or parsed are skipped.

    Args:
        start_game_id: The game ID to start scraping from
        output_file_path: Path where the parquet file will be saved
        min_major_version: Minimum game major version to scrape (default: 14)
        request_delay: Tuple of (min, max) seconds to delay between requests
        limit: Maximum number of new games to collect in this run;
            None (default) means no limit

    Returns:
        DataFrame containing all scraped game data (existing rows plus new rows)
    """
    # Check if the parquet file already exists and load it
    if os.path.exists(output_file_path):
        existing_df = pd.read_parquet(output_file_path)
        print(f"Loaded existing data with {len(existing_df)} games")

        # Get the set of game IDs that we already have
        existing_game_ids = set(existing_df["golgg_id"].tolist())
    else:
        existing_df = pd.DataFrame(
            columns=[
                "golgg_id",
                "champion_ids",
                "gameVersionMajorPatch",
                "gameVersionMinorPatch",
                "gameDuration",
                "blueTeamName",
                "redTeamName",
                "tournamentName",
                "team_100_win",
            ]
        )
        existing_game_ids = set()
        print("No existing data found, creating new dataset")

    # Create a list to store new game data
    new_games_data = []

    # Initialize the current game ID
    current_id = start_game_id

    # Returned as-is when nothing new is collected
    result_df = existing_df

    try:
        # Main scraping loop
        with tqdm(desc="Scraping games") as pbar:
            while current_id > 0:
                # Honour the optional cap on newly collected games
                if limit is not None and len(new_games_data) >= limit:
                    print(f"Reached limit of {limit} new games. Stopping.")
                    break

                # An already-stored ID means we have caught up with a previous run
                if current_id in existing_game_ids:
                    print(f"Game ID {current_id} already exists in dataset. Stopping.")
                    break

                # Add random delay to avoid overloading the server
                time.sleep(random.uniform(request_delay[0], request_delay[1]))

                # Get game data
                soup = get_golgg_game_stats(current_id)

                # None means the request failed (HTTP error or network problem)
                if soup is None:
                    print(f"Game ID {current_id} not found. Skipping.")
                    current_id -= 1
                    continue

                try:
                    # Extract patch version
                    patch = get_game_patch(soup)
                    if patch:
                        major_patch, minor_patch = parse_game_version(patch)

                        # Check if we've reached a version below our minimum
                        if major_patch < min_major_version:
                            print(
                                f"Reached game with version {patch} below minimum {min_major_version}. Stopping."
                            )
                            break
                    else:
                        # If patch info not available, skip
                        print(f"No patch info for game ID {current_id}. Skipping.")
                        current_id -= 1
                        continue

                    # Extract other game data
                    duration_str = get_game_duration(soup)
                    duration_seconds = (
                        parse_game_duration(duration_str) if duration_str else 0
                    )

                    team_info = get_team_info(soup)
                    champion_names = get_champion_names(soup)

                    # Map champion names to IDs
                    try:
                        champion_ids = map_champion_names_to_ids(champion_names)
                    except ValueError as e:
                        print(
                            f"Error mapping champion names for game ID {current_id}: {e}. Skipping."
                        )
                        current_id -= 1
                        continue

                    tournament_name = get_tournament_name(soup)

                    # Create a record for this game
                    game_data = {
                        "golgg_id": current_id,
                        "champion_ids": champion_ids,
                        "gameVersionMajorPatch": major_patch,
                        "gameVersionMinorPatch": minor_patch,
                        "gameDuration": duration_seconds,
                        "blueTeamName": team_info.get("blue_team", ""),
                        "redTeamName": team_info.get("red_team", ""),
                        "tournamentName": tournament_name or "",
                        "team_100_win": team_info.get("blue_won", None),
                    }

                    # Append to our list of new games
                    new_games_data.append(game_data)

                    # Every 100 games, save progress
                    if len(new_games_data) % 100 == 0:
                        # Combine existing data with new data
                        combined_df = pd.concat(
                            [existing_df, pd.DataFrame(new_games_data)],
                            ignore_index=True,
                        )

                        # Save to parquet
                        combined_df.to_parquet(output_file_path, index=False)
                        print(f"Saved progress: {len(combined_df)} total games")

                    # Update progress bar
                    pbar.update(1)
                    pbar.set_postfix(
                        {
                            "game_id": current_id,
                            "version": f"v{major_patch}.{minor_patch}",
                        }
                    )

                except Exception as e:
                    # Best effort: one unparsable page must not end the whole run
                    print(f"Error processing game ID {current_id}: {e}. Skipping.")

                # Move to the previous game ID
                current_id -= 1

    except KeyboardInterrupt:
        print("\nScraping interrupted by user")

    finally:
        # Persist whatever was collected, even if an unexpected error is propagating
        if new_games_data:
            # Combine existing data with new data
            result_df = pd.concat(
                [existing_df, pd.DataFrame(new_games_data)], ignore_index=True
            )

            # Save to parquet
            result_df.to_parquet(output_file_path, index=False)
            print(f"Final dataset saved with {len(result_df)} games")
        else:
            print("No new data collected")

    # Return after the try statement: a `return` inside `finally` would
    # silently discard any exception still in flight
    return result_df

In [None]:
from utils.match_prediction import RAW_PRO_GAMES_DIR

# Define starting game ID (latest game from website); the scraper walks
# downwards from this ID
latest_game_id = 64545  # Replace with the latest game ID you find

# Parquet file that accumulates scraped games; scrape_golgg_games loads it
# first if it already exists
output_file_path = os.path.join(RAW_PRO_GAMES_DIR, "pro_games.parquet")

# Run the scraper
pro_games_df = scrape_golgg_games(
    start_game_id=latest_game_id,
    output_file_path=output_file_path,
    min_major_version=14,
    request_delay=(
        1.0,
        3.0,
    ),  # Random delay between 1-3 seconds to be respectful to the server
)

# Display summary of collected data
print(f"Total games collected: {len(pro_games_df)}")
print(f"Unique tournaments: {pro_games_df['tournamentName'].nunique()}")
print(f"Games by major version:")
print(pro_games_df.groupby("gameVersionMajorPatch").size())

In [None]:
# Preview the first rows of the scraped dataset
pro_games_df.head()

# Fix Nunu imports: retry games that were skipped because 'Nunu' could not be mapped to a champion ID

In [None]:
#!/usr/bin/env python3
import re
import sys


def extract_failed_nunu_ids(log_file_path):
    """
    Collect the game IDs whose champion mapping failed on the name 'Nunu'.

    Args:
        log_file_path: Path to a text file of saved scraper output

    Returns:
        List of game IDs (ints), one per matching line, in file order
    """
    # Matches the scraper's mapping-error line and captures the game ID
    nunu_failure = re.compile(
        r"Error mapping champion names for game ID (\d+): Could not map champion name 'Nunu' to an ID\."
    )

    with open(log_file_path, "r") as log_file:
        hits = (nunu_failure.search(entry) for entry in log_file)
        # Consume the lazy search while the file is still open
        return [int(hit.group(1)) for hit in hits if hit]



# NOTE(review): machine-specific absolute path; point this at your own saved scraper output
log_file_path = "/Users/loyd/nunu.txt"

# Extract the failed IDs
failed_ids = extract_failed_nunu_ids(log_file_path)

In [None]:
# Show the game IDs recovered from the log before re-scraping them
print(failed_ids)

In [None]:
def add_specific_game_ids(
    game_ids: List[int],
    output_file_path: str,
    request_delay: Tuple[float, float] = (1.0, 3.0),
) -> pd.DataFrame:
    """
    Add specific game IDs to the dataset that may have previously failed.

    IDs already present in the parquet file are skipped, and repeated IDs in
    ``game_ids`` are processed only once. Unlike the backwards scraper, no
    minimum-version check is applied. Results are written to
    ``output_file_path`` once, after all IDs have been processed or the run
    has been interrupted.

    Args:
        game_ids: List of game IDs to process and add
        output_file_path: Path to the parquet file to update
        request_delay: Tuple of (min, max) seconds to delay between requests

    Returns:
        Updated DataFrame containing all game data (existing rows plus new rows)
    """
    # Load existing data if it exists
    if os.path.exists(output_file_path):
        existing_df = pd.read_parquet(output_file_path)
        print(f"Loaded existing data with {len(existing_df)} games")

        # Get the set of game IDs that we already have
        existing_game_ids = set(existing_df["golgg_id"].tolist())
    else:
        existing_df = pd.DataFrame(
            columns=[
                "golgg_id",
                "champion_ids",
                "gameVersionMajorPatch",
                "gameVersionMinorPatch",
                "gameDuration",
                "blueTeamName",
                "redTeamName",
                "tournamentName",
                "team_100_win",
            ]
        )
        existing_game_ids = set()
        print("No existing data found, creating new dataset")

    # Create a list to store new game data
    new_games_data = []

    # Drop IDs we already have; dict.fromkeys removes repeats while keeping
    # the caller's order, so a duplicated ID cannot create duplicate rows
    ids_to_process = [
        game_id
        for game_id in dict.fromkeys(game_ids)
        if game_id not in existing_game_ids
    ]

    if not ids_to_process:
        print("All specified game IDs already exist in the dataset. Nothing to add.")
        return existing_df

    print(f"Processing {len(ids_to_process)} new game IDs...")

    # Returned as-is when nothing new is collected
    result_df = existing_df

    try:
        # Process each game ID
        with tqdm(total=len(ids_to_process), desc="Adding games") as pbar:
            for current_id in ids_to_process:
                # Add random delay to avoid overloading the server
                time.sleep(random.uniform(request_delay[0], request_delay[1]))

                # Get game data
                soup = get_golgg_game_stats(current_id)

                # None means the request failed (HTTP error or network problem)
                if soup is None:
                    print(f"Game ID {current_id} not found. Skipping.")
                    pbar.update(1)
                    continue

                try:
                    # Extract patch version
                    patch = get_game_patch(soup)
                    if patch:
                        major_patch, minor_patch = parse_game_version(patch)
                    else:
                        # If patch info not available, skip
                        print(f"No patch info for game ID {current_id}. Skipping.")
                        pbar.update(1)
                        continue

                    # Extract other game data
                    duration_str = get_game_duration(soup)
                    duration_seconds = (
                        parse_game_duration(duration_str) if duration_str else 0
                    )

                    team_info = get_team_info(soup)
                    champion_names = get_champion_names(soup)

                    # Map champion names to IDs
                    try:
                        champion_ids = map_champion_names_to_ids(champion_names)
                    except ValueError as e:
                        print(
                            f"Error mapping champion names for game ID {current_id}: {e}. Skipping."
                        )
                        pbar.update(1)
                        continue

                    tournament_name = get_tournament_name(soup)

                    # Create a record for this game
                    game_data = {
                        "golgg_id": current_id,
                        "champion_ids": champion_ids,
                        "gameVersionMajorPatch": major_patch,
                        "gameVersionMinorPatch": minor_patch,
                        "gameDuration": duration_seconds,
                        "blueTeamName": team_info.get("blue_team", ""),
                        "redTeamName": team_info.get("red_team", ""),
                        "tournamentName": tournament_name or "",
                        "team_100_win": team_info.get("blue_won", None),
                    }

                    # Append to our list of new games
                    new_games_data.append(game_data)
                    pbar.update(1)

                except Exception as e:
                    # Best effort: one unparsable page must not end the whole run
                    print(f"Error processing game ID {current_id}: {e}. Skipping.")
                    pbar.update(1)

    except KeyboardInterrupt:
        print("\nProcess interrupted by user")

    finally:
        # Persist whatever was collected, even if an unexpected error is propagating
        if new_games_data:
            # Combine existing data with new data
            result_df = pd.concat(
                [existing_df, pd.DataFrame(new_games_data)], ignore_index=True
            )

            # Save to parquet
            result_df.to_parquet(output_file_path, index=False)
            print(f"Final dataset saved with {len(result_df)} games")
        else:
            print("No new data collected")

    # Return after the try statement: a `return` inside `finally` would
    # silently discard any exception still in flight
    return result_df

In [None]:
from utils.match_prediction import RAW_PRO_GAMES_DIR
import os

# Path to your dataset (the same parquet file the main scraper writes to)
output_file_path = os.path.join(RAW_PRO_GAMES_DIR, "pro_games.parquet")


# Process the failed game IDs and add them to the dataset;
# IDs already present in the file are skipped by add_specific_game_ids
updated_df = add_specific_game_ids(
    game_ids=failed_ids,
    output_file_path=output_file_path,
    request_delay=(1.0, 3.0),  # Random delay between 1-3 seconds
)

# Display summary of the updated dataset
print(f"Total games in updated dataset: {len(updated_df)}")
print(f"Unique tournaments: {updated_df['tournamentName'].nunique()}")
print(f"Games by major version:")
print(updated_df.groupby("gameVersionMajorPatch").size())