There is no official API for pro games, but the data is available in the lolesports website. However gol.gg has a better format.

In [1]:
import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any


def get_golgg_game_stats(game_id: int) -> Optional[Dict[str, Any]]:
    """
    Scrape game statistics from gol.gg for a specific game ID.

    Args:
        game_id: The unique identifier for the game on gol.gg
                (e.g., 64750 from https://gol.gg/game/stats/64750/page-game/)

    Returns:
        A dictionary containing the parsed game data, or None if the request failed

    Example:
        game_data = get_golgg_game_stats(64750)
    """
    # Construct the URL with the provided game ID
    url = f"https://gol.gg/game/stats/{game_id}/page-game/"

    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
    headers = {"User-Agent": user_agent}

    # Send HTTP request
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise exception for 4XX/5XX responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    return soup

In [2]:
soup = get_golgg_game_stats(64750)

In [3]:
def get_game_duration(soup: BeautifulSoup) -> Optional[str]:
    """
    Extract the game duration from the parsed HTML.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Game duration as a string (e.g., "44:59") or None if not found
    """
    # Find the div containing "Game Time" text
    game_time_div = soup.find(string="Game Time")

    if game_time_div:
        # Navigate to the h1 element containing the duration
        h1_element = game_time_div.find_next("h1")
        if h1_element:
            return h1_element.text.strip()

    return None


def parse_game_duration(duration: str) -> int:
    """
    Parse game duration string into total seconds.

    Args:
        duration: Game duration string (e.g., "44:59")

    Returns:
        Duration in seconds (e.g., 2699)
    """
    total_seconds = 0

    if duration:
        time_parts = duration.split(":")
        if len(time_parts) == 2:
            minutes = int(time_parts[0])
            seconds = int(time_parts[1])
            total_seconds = minutes * 60 + seconds

    return total_seconds


def get_game_patch(soup: BeautifulSoup) -> Optional[str]:
    """
    Extract the game patch version from the parsed HTML.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Game patch as a string (e.g., "v15.4") or None if not found
    """
    # Find the div containing "Game Time" text
    game_time_div = soup.find(string="Game Time")

    if game_time_div:
        # Navigate to the parent elements to find the patch div
        row_div = game_time_div.find_parent("div").find_parent("div").find_parent("div")
        if row_div:
            # Find the div with class "col-3 text-right" which contains the patch
            patch_div = row_div.find("div", class_="col-3 text-right")
            if patch_div:
                return patch_div.text.strip()

    return None


def parse_game_version(patch: str) -> tuple[int, int]:
    """
    Parse game patch version into major and minor components.

    Args:
        patch: Game patch version string (e.g., "v15.4")

    Returns:
        Tuple containing (major_patch, minor_patch)
    """
    major_patch = 0
    minor_patch = 0

    if patch and patch.startswith("v"):
        version_parts = patch[1:].split(".")
        if len(version_parts) >= 1:
            major_patch = int(version_parts[0])
        if len(version_parts) >= 2:
            minor_patch = int(version_parts[1])

    return major_patch, minor_patch


patch = get_game_patch(soup)
duration = get_game_duration(soup)

print(patch, duration)

print(parse_game_version(patch))
print(parse_game_duration(duration))

v15.4 44:59
(15, 4)
2699


In [4]:
def get_team_info(soup: BeautifulSoup) -> Dict[str, Any]:
    """
    Extract team information from the parsed HTML, including team names and winner.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Dictionary containing:
        - blue_team: Name of the blue team
        - red_team: Name of the red team
        - blue_won: 1 if blue team won, 0 if lost
    """
    team_info = {"blue_team": None, "red_team": None, "blue_won": None}

    # Find the blue team div
    blue_div = soup.find("div", class_="blue-line-header")
    if blue_div:
        # Extract team name from the anchor tag
        blue_team_anchor = blue_div.find("a")
        if blue_team_anchor:
            team_info["blue_team"] = blue_team_anchor.text.strip()

        # Check if blue team won
        blue_result = blue_div.text.strip()
        team_info["blue_won"] = 1 if "- WIN" in blue_result else 0

    # Find the red team div
    red_div = soup.find("div", class_="red-line-header")
    if red_div:
        # Extract team name from the anchor tag
        red_team_anchor = red_div.find("a")
        if red_team_anchor:
            team_info["red_team"] = red_team_anchor.text.strip()

    return team_info


team_info = get_team_info(soup)

print(team_info)

{'blue_team': 'NORD Esports', 'red_team': 'Los Ratones', 'blue_won': 0}


In [5]:
def get_champion_names(soup: BeautifulSoup) -> list[str]:
    """
    Extract the champion names from the parsed HTML.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        List of 10 champion names in order: blue team (top to support) followed by red team (top to support)
    """
    champion_names = []

    # Find the tables with class "playersInfosLine"
    player_tables = soup.find_all("table", class_="playersInfosLine")

    # Process each table (blue team and red team)
    for table in player_tables:
        # Find all rows in the table (skip the header row)
        rows = table.find_all("tr")
        # Skip the header row (first row)
        for row in rows[1:]:  # Start from index 1 to skip header
            # Find the champion image element
            champion_img = row.find("img", class_="champion_icon rounded-circle")
            if champion_img:
                # Get the champion name from the alt attribute
                champion_name = champion_img.get("alt")
                champion_names.append(champion_name)

    return champion_names


champion_names = get_champion_names(soup)

print(champion_names)

['KSante', 'Sejuani', 'VelKoz', 'Ashe', 'Pantheon', 'Jax', 'Wukong', 'Orianna', 'Xayah', 'Janna']


In [6]:
from typing import List, Optional, Dict, Any
from utils.match_prediction.champions import Champion
import difflib  # For fuzzy string matching


def map_champion_names_to_ids(champion_names: List[str]) -> List[int]:
    """
    Map champion names to their corresponding IDs using the Champion enum with fuzzy matching.

    Args:
        champion_names: List of champion names

    Returns:
        List of champion IDs in the same order

    Raises:
        ValueError: If a champion name cannot be mapped to an ID
    """
    champion_ids = []

    # Create a mapping of display names to champion IDs for quick lookup
    name_to_id_map = {champion.display_name: champion.id for champion in Champion}
    all_champion_names = list(name_to_id_map.keys())

    # Create a reverse mapping for easier lookup (normalized name -> display name)
    normalized_name_map = {}
    for display_name in all_champion_names:
        # Store both the lowercase version and a version with apostrophes removed
        normalized_name_map[display_name.lower()] = display_name
        normalized_name_map[display_name.lower().replace("'", "")] = display_name

    # Special case handling for known mismatches
    special_cases = {"nunu": "Nunu & Willump"}

    for name in champion_names:
        champion_id = None

        # Handle special cases
        if name.lower() in special_cases:
            name = special_cases[name.lower()]

        # Try direct lookup first
        if name in name_to_id_map:
            champion_id = name_to_id_map[name]

        # Try normalized lookup (handles case differences and missing apostrophes)
        elif name.lower() in normalized_name_map:
            display_name = normalized_name_map[name.lower()]
            champion_id = name_to_id_map[display_name]

        # Try without apostrophes
        elif name.lower().replace("'", "") in normalized_name_map:
            display_name = normalized_name_map[name.lower().replace("'", "")]
            champion_id = name_to_id_map[display_name]

        # If still not found, try fuzzy matching
        if champion_id is None:
            # Get the closest match using difflib
            closest_matches = difflib.get_close_matches(
                name, all_champion_names, n=1, cutoff=0.6
            )

            if closest_matches:
                closest_match = closest_matches[0]
                champion_id = name_to_id_map[closest_match]
                print(
                    f"Warning: Using fuzzy match for '{name}' -> '{closest_match}' (ID: {champion_id})"
                )
            else:
                # If no match found, raise an error
                raise ValueError(f"Could not map champion name '{name}' to an ID.")

        champion_ids.append(champion_id)

    return champion_ids


champion_ids = map_champion_names_to_ids(champion_names)

print(champion_ids)

[897, 113, 161, 22, 80, 24, 62, 61, 498, 40]


In [7]:
def get_tournament_name(soup: BeautifulSoup) -> Optional[str]:
    """
    Extract the tournament name from the parsed HTML.

    Args:
        soup: BeautifulSoup object containing the parsed HTML

    Returns:
        Tournament name as a string or None if not found
    """
    # Find the anchor tag with the specific href pattern
    tournament_anchor = soup.find(
        "a", href=lambda href: href and "../tournament/tournament-stats" in href
    )

    if tournament_anchor:
        return tournament_anchor.text.strip()

    return None


# Example usage:
tournament_name = get_tournament_name(soup)
print(tournament_name)

NLC 2025 Winter Playoffs


## timeline scraping


In [8]:
from typing import Tuple, List, Dict
import requests
from bs4 import BeautifulSoup
import re
import json


def get_timeline_soup(game_id: str) -> BeautifulSoup:
    """
    Fetches the timeline page for a specific game and returns its BeautifulSoup object.

    Args:
        game_id: The ID of the game to fetch

    Returns:
        BeautifulSoup object of the timeline page
    """
    url = f"https://gol.gg/game/stats/{game_id}/page-timeline/"
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
    headers = {"User-Agent": user_agent}

    # Send HTTP request
    # Send HTTP request
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise exception for 4XX/5XX responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    return soup


def js_to_json(js_str: str) -> str:
    """
    Convert JavaScript object notation to valid JSON.

    Args:
        js_str: JavaScript object string

    Returns:
        Valid JSON string
    """
    # Add quotes to property names
    js_str = re.sub(r"([{,])\s*(\w+):", r'\1"\2":', js_str)

    # Replace single quotes with double quotes, but only for strings
    # First, temporarily replace any escaped single quotes
    js_str = js_str.replace("\\'", "___ESCAPED_QUOTE___")

    # Then handle the actual quotes
    in_string = False
    result = []
    for char in js_str:
        if char == "'" and not in_string:
            result.append('"')
            in_string = True
        elif char == "'" and in_string:
            result.append('"')
            in_string = False
        else:
            result.append(char)
    js_str = "".join(result)

    # Restore escaped quotes
    js_str = js_str.replace("___ESCAPED_QUOTE___", "\\'")

    # Remove trailing commas in arrays and objects
    js_str = re.sub(r",(\s*[}\]])", r"\1", js_str)

    return js_str


def create_empty_timeline_data() -> Dict[str, Optional[float]]:
    """Helper function to create a dictionary with None values for all timeline metrics"""
    result = {}
    timestamps = ["900000", "1200000"]
    for team_id in ["100", "200"]:
        for position in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]:
            for timestamp in timestamps:
                result[f"team_{team_id}_{position}_totalGold_at_{timestamp}"] = None
                result[f"team_{team_id}_{position}_creepScore_at_{timestamp}"] = None
    return result


def extract_timeline_data(soup: BeautifulSoup) -> Dict[str, Optional[float]]:
    """
    Extracts gold and CS data from the timeline page.
    Returns None values for missing data instead of raising exceptions.
    """
    try:
        # Find the script containing the data
        script_content = soup.find("script", string=lambda t: t and "golddatas" in t)
        if not script_content:
            print("Timeline data not found in page")
            return create_empty_timeline_data()

        # Extract the data objects using regex
        gold_match = re.search(
            r"var\s+golddatas\s*=\s*({.*?});", script_content.string, re.DOTALL
        )
        cs_match = re.search(
            r"var\s+csdatas\s*=\s*({.*?});", script_content.string, re.DOTALL
        )

        if not gold_match or not cs_match:
            print("Could not parse gold or CS data")
            return create_empty_timeline_data()

        # Parse the JSON-like strings
        # Replace single quotes with double quotes for valid JSON
        # Convert JavaScript objects to valid JSON
        json_gold = js_to_json(gold_match.group(1))
        json_cs = js_to_json(cs_match.group(1))
        gold_data = json.loads(json_gold)
        cs_data = json.loads(json_cs)

        # Initialize result dictionary
        result = {}

        # Define positions mapping (gol.gg uses different names)
        position_mapping = {
            "TOP": "TOP",
            "JGL": "JUNGLE",
            "MID": "MIDDLE",
            "BOT": "BOTTOM",
            "SPT": "UTILITY",
        }

        # Extract data for minutes 15 (index 15) and 20 (index 20)
        timestamps = ["900000", "1200000"]  # 15 min and 20 min
        time_indices = [15, 20]

        # First 5 datasets are team 100, last 5 are team 200
        for team_idx, team_id in enumerate(["100", "200"]):
            base_offset = team_idx * 5
            for dataset_idx, dataset in enumerate(
                gold_data["datasets"][base_offset : base_offset + 5]
            ):
                position = position_mapping[dataset["label"]]

                # Get gold data
                for time_idx, timestamp in zip(time_indices, timestamps):
                    gold_value = dataset["data"][time_idx]
                    result[f"team_{team_id}_{position}_totalGold_at_{timestamp}"] = (
                        gold_value
                    )

                # Get CS data
                cs_dataset = cs_data["datasets"][base_offset + dataset_idx]
                for time_idx, timestamp in zip(time_indices, timestamps):
                    cs_value = cs_dataset["data"][time_idx]
                    result[f"team_{team_id}_{position}_creepScore_at_{timestamp}"] = (
                        cs_value
                    )

        return result
    except Exception as e:
        print(f"Error extracting timeline data: {e}")
        return create_empty_timeline_data()

In [9]:
soup = get_timeline_soup(64750)
timeline_data = extract_timeline_data(soup)
print(timeline_data)

{'team_100_TOP_totalGold_at_900000': 4613, 'team_100_TOP_totalGold_at_1200000': 6870, 'team_100_TOP_creepScore_at_900000': 109, 'team_100_TOP_creepScore_at_1200000': 142, 'team_100_JUNGLE_totalGold_at_900000': 4891, 'team_100_JUNGLE_totalGold_at_1200000': 6235, 'team_100_JUNGLE_creepScore_at_900000': 103, 'team_100_JUNGLE_creepScore_at_1200000': 122, 'team_100_MIDDLE_totalGold_at_900000': 5976, 'team_100_MIDDLE_totalGold_at_1200000': 8042, 'team_100_MIDDLE_creepScore_at_900000': 147, 'team_100_MIDDLE_creepScore_at_1200000': 205, 'team_100_BOTTOM_totalGold_at_900000': 6027, 'team_100_BOTTOM_totalGold_at_1200000': 8126, 'team_100_BOTTOM_creepScore_at_900000': 134, 'team_100_BOTTOM_creepScore_at_1200000': 183, 'team_100_UTILITY_totalGold_at_900000': 3729, 'team_100_UTILITY_totalGold_at_1200000': 4974, 'team_100_UTILITY_creepScore_at_900000': 28, 'team_100_UTILITY_creepScore_at_1200000': 29, 'team_200_TOP_totalGold_at_900000': 4954, 'team_200_TOP_totalGold_at_1200000': 6733, 'team_200_TOP_

In [10]:
def create_empty_kda_data() -> Dict[str, Optional[int]]:
    """Helper function to create a dictionary with None values for all KDA metrics"""
    stats = {}
    for team_id in ["100", "200"]:
        for position in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]:
            for stat in ["kills", "deaths", "assists"]:
                for timestamp in ["900000", "1200000"]:
                    stats[f"team_{team_id}_{position}_{stat}_at_{timestamp}"] = None
    return stats


def parse_timeline_kda(
    soup: BeautifulSoup, champion_positions: Dict[str, str]
) -> Dict[str, int]:
    """
    Extract KDA stats from timeline events.

    Args:
        soup: BeautifulSoup object of the timeline page
        champion_positions: Dictionary mapping champion names to their positions
            e.g. {"MissFortune": "BOTTOM", "Maokai": "JUNGLE", ...}

    Returns:
        Dictionary containing KDA stats for each position and team at 15 and 20 minutes
        Format: {
            'team_100_TOP_kills_at_900000': value,
            'team_100_TOP_deaths_at_900000': value,
            'team_100_TOP_assists_at_900000': value,
            ...
        }
    """
    # Initialize KDA counters for each position and team
    stats = {}
    for team_id in ["100", "200"]:
        for position in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]:
            for stat in ["kills", "deaths", "assists"]:
                for timestamp in ["900000", "1200000"]:  # 15 min and 20 min
                    stats[f"team_{team_id}_{position}_{stat}_at_{timestamp}"] = 0

    try:
        events_table = soup.find("table", {"class": "timeline"})
        if not events_table:
            print("Could not find timeline events table")
            return create_empty_kda_data()

        # Current KDA counts
        current_kda = {
            "100": {
                pos: {"kills": 0, "deaths": 0, "assists": 0}
                for pos in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]
            },
            "200": {
                pos: {"kills": 0, "deaths": 0, "assists": 0}
                for pos in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]
            },
        }

        # Process each event row
        for row in events_table.find_all("tr")[1:]:  # Skip header row
            cols = row.find_all("td")
            if len(cols) < 5:
                continue

            # Parse timestamp
            time_str = cols[0].text.strip()
            if not time_str:
                continue

            minutes, seconds = map(int, time_str.split(":"))
            event_time = minutes * 60 + seconds

            # Check if it's a kill event
            kill_icon = cols[4].find("img", {"src": "../_img/kill-icon.png"})
            if not kill_icon:
                continue

            # Get killer team (blue = 100, red = 200)
            team_side = cols[1].find("img")["src"]
            team_id = "100" if "blueside" in team_side else "200"

            # Get killer champion and their position
            # Updated selector to match height:25px in style attribute
            killer_champ = cols[3].find(
                "img",
                {
                    "src": lambda x: x and "champions_icon" in x,
                    "style": lambda x: x and "height:25px" in x,
                },
            )
            if not killer_champ:
                continue

            killer_name = (
                killer_champ["src"].split("/")[-1].replace(".png", "")
            )  # Extract name from src
            # Handle special cases in champion names
            killer_name = (
                killer_name.replace("_", " ")
                .replace("KSante", "K'Sante")
                .replace("KaiSa", "Kai'Sa")
                .replace("KhaZix", "Kha'Zix")
                .replace("RekSai", "Rek'Sai")
                .replace("VelKoz", "Vel'Koz")
            )
            killer_position = champion_positions.get(killer_name)

            # Get assist champions and their positions
            assist_champs = cols[3].find_all(
                "img",
                {
                    "src": lambda x: x and "champions_icon" in x,
                    "style": lambda x: x and "height:18px" in x,
                },
            )
            assist_positions = []
            for champ in assist_champs:
                assist_name = champ["src"].split("/")[-1].replace(".png", "")
                # Handle special cases in champion names
                assist_name = (
                    assist_name.replace("_", " ")
                    .replace("KSante", "K'Sante")
                    .replace("KaiSa", "Kai'Sa")
                    .replace("KhaZix", "Kha'Zix")
                    .replace("RekSai", "Rek'Sai")
                    .replace("VelKoz", "Vel'Koz")
                )
                assist_position = champion_positions.get(assist_name)
                if assist_position:
                    assist_positions.append(assist_position)

            # Get target champion and position
            target_champ = cols[5].find(
                "img", {"src": lambda x: x and "champions_icon" in x}
            )
            if not target_champ:
                continue

            target_name = target_champ["src"].split("/")[-1].replace(".png", "")
            # Handle special cases in champion names
            target_name = (
                target_name.replace("_", " ")
                .replace("KSante", "K'Sante")
                .replace("KaiSa", "Kai'Sa")
                .replace("KhaZix", "Kha'Zix")
                .replace("RekSai", "Rek'Sai")
                .replace("VelKoz", "Vel'Koz")
            )
            target_position = champion_positions.get(target_name)
            target_team = "200" if team_id == "100" else "100"

            # Update KDA counts
            if killer_position:
                current_kda[team_id][killer_position]["kills"] += 1
            if target_position:
                current_kda[target_team][target_position]["deaths"] += 1
            for assist_pos in assist_positions:
                if assist_pos:
                    current_kda[team_id][assist_pos]["assists"] += 1

            # Update stats at 15 min (900000) if we just passed it
            if event_time <= 900 and (minutes == 15 or minutes == 14):
                for t_id in ["100", "200"]:
                    for pos in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]:
                        for stat in ["kills", "deaths", "assists"]:
                            stats[f"team_{t_id}_{pos}_{stat}_at_900000"] = current_kda[
                                t_id
                            ][pos][stat]

            # Update stats at 20 min (1200000) if we just passed it
            if event_time <= 1200 and (minutes == 20 or minutes == 19):
                for t_id in ["100", "200"]:
                    for pos in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]:
                        for stat in ["kills", "deaths", "assists"]:
                            stats[f"team_{t_id}_{pos}_{stat}_at_1200000"] = current_kda[
                                t_id
                            ][pos][stat]

        # If we never hit 15 or 20 minutes, use the final counts
        for timestamp in ["900000", "1200000"]:
            for t_id in ["100", "200"]:
                for pos in ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"]:
                    for stat in ["kills", "deaths", "assists"]:
                        key = f"team_{t_id}_{pos}_{stat}_at_{timestamp}"
                        if stats[key] == 0:
                            stats[key] = current_kda[t_id][pos][stat]

        return stats
    except Exception as e:
        print(f"Error parsing timeline KDA: {e}")
        return create_empty_kda_data()

In [11]:
champion_names

['KSante',
 'Sejuani',
 'VelKoz',
 'Ashe',
 'Pantheon',
 'Jax',
 'Wukong',
 'Orianna',
 'Xayah',
 'Janna']

In [12]:
champion_positions = {}
for idx, position in enumerate(["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"] * 2):
    champion_positions[champion_names[idx]] = position
print(champion_positions)

{'KSante': 'TOP', 'Sejuani': 'JUNGLE', 'VelKoz': 'MIDDLE', 'Ashe': 'BOTTOM', 'Pantheon': 'UTILITY', 'Jax': 'TOP', 'Wukong': 'JUNGLE', 'Orianna': 'MIDDLE', 'Xayah': 'BOTTOM', 'Janna': 'UTILITY'}


In [13]:
kda_stats = parse_timeline_kda(soup, champion_positions)
print(kda_stats)

{'team_100_TOP_kills_at_900000': 0, 'team_100_TOP_kills_at_1200000': 0, 'team_100_TOP_deaths_at_900000': 0, 'team_100_TOP_deaths_at_1200000': 0, 'team_100_TOP_assists_at_900000': 0, 'team_100_TOP_assists_at_1200000': 0, 'team_100_JUNGLE_kills_at_900000': 0, 'team_100_JUNGLE_kills_at_1200000': 0, 'team_100_JUNGLE_deaths_at_900000': 1, 'team_100_JUNGLE_deaths_at_1200000': 1, 'team_100_JUNGLE_assists_at_900000': 3, 'team_100_JUNGLE_assists_at_1200000': 6, 'team_100_MIDDLE_kills_at_900000': 0, 'team_100_MIDDLE_kills_at_1200000': 0, 'team_100_MIDDLE_deaths_at_900000': 0, 'team_100_MIDDLE_deaths_at_1200000': 0, 'team_100_MIDDLE_assists_at_900000': 0, 'team_100_MIDDLE_assists_at_1200000': 0, 'team_100_BOTTOM_kills_at_900000': 2, 'team_100_BOTTOM_kills_at_1200000': 2, 'team_100_BOTTOM_deaths_at_900000': 1, 'team_100_BOTTOM_deaths_at_1200000': 1, 'team_100_BOTTOM_assists_at_900000': 15, 'team_100_BOTTOM_assists_at_1200000': 3, 'team_100_UTILITY_kills_at_900000': 9, 'team_100_UTILITY_kills_at_12

# Full scraping


In [14]:
import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Any, Optional, Tuple
import random
from tqdm import tqdm  # For progress bar


def get_game_data(game_id: int) -> Optional[Dict[str, Any]]:
    """
    Get complete game data including timeline stats for a specific game ID.

    Args:
        game_id: The game ID to fetch data for

    Returns:
        Dictionary containing all game data or None if fetching failed
    """
    # Get main game page data
    main_soup = get_golgg_game_stats(game_id)
    if main_soup is None:
        return None

    try:
        # Extract basic game data
        patch = get_game_patch(main_soup)
        if not patch:
            return None

        major_patch, minor_patch = parse_game_version(patch)
        duration_str = get_game_duration(main_soup)
        duration_seconds = parse_game_duration(duration_str) if duration_str else 0
        team_info = get_team_info(main_soup)
        champion_names = get_champion_names(main_soup)
        tournament_name = get_tournament_name(main_soup)

        # Map champion names to IDs and positions
        champion_ids = map_champion_names_to_ids(champion_names)
        champion_positions = {}
        for idx, position in enumerate(
            ["TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"] * 2
        ):
            champion_positions[champion_names[idx]] = position

        # Get timeline data
        timeline_soup = get_timeline_soup(game_id)
        if timeline_soup is None:
            return None

        # Extract timeline stats
        timeline_gold_cs = extract_timeline_data(timeline_soup)
        timeline_kda = parse_timeline_kda(timeline_soup, champion_positions)

        # Combine all data
        game_data = {
            "golgg_id": game_id,
            "champion_ids": champion_ids,
            "gameVersionMajorPatch": major_patch,
            "gameVersionMinorPatch": minor_patch,
            "gameDuration": duration_seconds,
            "blueTeamName": team_info.get("blue_team", ""),
            "redTeamName": team_info.get("red_team", ""),
            "tournamentName": tournament_name or "",
            "team_100_win": team_info.get("blue_won", None),
        }

        # Add timeline stats
        game_data.update(timeline_gold_cs)
        game_data.update(timeline_kda)

        return game_data

    except Exception as e:
        print(f"Error processing game ID {game_id}: {e}")
        return None


def scrape_golgg_games(
    start_game_id: int,
    output_file_path: str,
    min_major_version: int = 14,
    request_delay: Tuple[float, float] = (1.0, 3.0),
    limit: int = None,
) -> pd.DataFrame:
    """
    Scrape game data from gol.gg starting from a specific game ID and working backwards.
    Save the results to a parquet file.

    Args:
        start_game_id: The game ID to start scraping from
        output_file_path: Path where the parquet file will be saved
        min_major_version: Minimum game major version to scrape (default: 14)
        request_delay: Tuple of (min, max) seconds to delay between requests

    Returns:
        DataFrame containing all scraped game data
    """
    # Check if the parquet file already exists and load it
    if os.path.exists(output_file_path):
        existing_df = pd.read_parquet(output_file_path)
        print(f"Loaded existing data with {len(existing_df)} games")

        # Get the set of game IDs that we already have
        existing_game_ids = set(existing_df["golgg_id"].tolist())
    else:
        existing_df = pd.DataFrame(
            columns=[
                "golgg_id",
                "champion_ids",
                "gameVersionMajorPatch",
                "gameVersionMinorPatch",
                "gameDuration",
                "blueTeamName",
                "redTeamName",
                "tournamentName",
                "team_100_win",
            ]
        )
        existing_game_ids = set()
        print("No existing data found, creating new dataset")

    # Create a list to store new game data
    new_games_data = []

    # Initialize the current game ID
    current_id = start_game_id

    try:
        with tqdm(desc="Scraping games") as pbar:
            while current_id > 0:
                if current_id in existing_game_ids:
                    print(f"Game ID {current_id} already exists in dataset. Stopping.")
                    break

                time.sleep(random.uniform(request_delay[0], request_delay[1]))

                # Use the new get_game_data function
                game_data = get_game_data(current_id)

                if game_data is None:
                    print(
                        f"Game ID {current_id} not found or error occurred. Skipping."
                    )
                    current_id -= 1
                    continue

                # Check version
                if game_data["gameVersionMajorPatch"] < min_major_version:
                    print(
                        f"Reached game with version below minimum {min_major_version}. Stopping."
                    )
                    break

                new_games_data.append(game_data)

                # Every 100 games, save progress
                if len(new_games_data) % 100 == 0:
                    # Combine existing data with new data
                    combined_df = pd.concat(
                        [existing_df, pd.DataFrame(new_games_data)],
                        ignore_index=True,
                    )

                    # Save to parquet
                    combined_df.to_parquet(output_file_path, index=False)
                    print(f"Saved progress: {len(combined_df)} total games")

                # Update progress bar
                pbar.update(1)
                pbar.set_postfix(
                    {
                        "game_id": current_id,
                        "version": f"v{game_data['gameVersionMajorPatch']}.{game_data["gameVersionMinorPatch"]}",
                    }
                )
                # Move to the previous game ID
                current_id -= 1

    except KeyboardInterrupt:
        print("\nScraping interrupted by user")

    finally:
        # Save final results if we have new data
        if new_games_data:
            # Combine existing data with new data
            final_df = pd.concat(
                [existing_df, pd.DataFrame(new_games_data)], ignore_index=True
            )

            # Save to parquet
            final_df.to_parquet(output_file_path, index=False)
            print(f"Final dataset saved with {len(final_df)} games")
            return final_df
        else:
            print("No new data collected")
            return existing_df

In [None]:
from utils.match_prediction import RAW_PRO_GAMES_DIR

# Define starting game ID (latest game from website)
latest_game_id = 68139  # Replace with the latest game ID you find

output_file_path = os.path.join(RAW_PRO_GAMES_DIR, "pro_games.parquet")

# Run the scraper
pro_games_df = scrape_golgg_games(
    start_game_id=latest_game_id,
    output_file_path=output_file_path,
    min_major_version=14,
    request_delay=(
        0.5,
        1.0,
    ),  # Random delay between 1-3 seconds to be respectful to the server
)

# Display summary of collected data
print(f"Total games collected: {len(pro_games_df)}")
print(f"Unique tournaments: {pro_games_df['tournamentName'].nunique()}")
print(f"Games by major version:")
print(pro_games_df.groupby("gameVersionMajorPatch").size())

In [None]:
pro_games_df.head()