In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
from functools import lru_cache
from urllib.parse import urljoin

# Site root; relative links scraped from gol.gg pages are resolved against it.
BASE_URL = "https://gol.gg"
# Match-list page of the LEC Winter 2025 tournament (spaces percent-encoded).
TOURNAMENT_URL = f"{BASE_URL}/tournament/tournament-matchlist/LEC%20Winter%202025/"

@lru_cache(maxsize=200)
def get_champion_name(champion_url: str) -> str:
    """
    Fetch a champion page and return the text of its first <h1>.

    Results are memoised per URL by ``lru_cache``, so a champion that appears
    in several games is only downloaded once. The "Unknown" fallback is a
    normal return value and is therefore cached as well.

    Args:
        champion_url: Absolute URL of the champion page to download.

    Returns:
        The whitespace-stripped <h1> text, or "Unknown" when the server
        answers with an error status or the page contains no <h1>.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    print(f"[DEBUG] Fetching champion page: {champion_url}")
    time.sleep(0.5)  # be polite
    # Without a timeout a stalled connection would block the scrape forever.
    resp = requests.get(
        champion_url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=10,
    )
    if not resp.ok:
        # An error page can still carry an <h1> (e.g. "404 Not Found");
        # never report that text as a champion name.
        print(f"[DEBUG] Champion page returned HTTP {resp.status_code}.")
        return "Unknown"

    soup = BeautifulSoup(resp.text, "html.parser")

    h1 = soup.find("h1")
    if h1 is None:
        print("[DEBUG] Could not find <h1> on champion page.")
        return "Unknown"

    champion_name = h1.get_text(strip=True)
    print(f"[DEBUG] Champion name found: {champion_name}")
    return champion_name

def get_picked_champions(side_div) -> list[str]:
    """
    Return the champion names picked by one side of a game.

    Looks inside `side_div` for the first <div class="row"> whose
    <div class="col-2"> label reads exactly "Picks", then resolves every
    champion-stats link found in that row's <div class="col-10"> to a name
    via `get_champion_name`.

    Returns an empty list (after printing a debug line) when either the
    "Picks" row or its col-10 container is missing.
    """
    # First row whose col-2 label is exactly "Picks", or None if there is none.
    picks_row = next(
        (
            candidate
            for candidate in side_div.find_all("div", class_="row")
            if (label := candidate.find("div", class_="col-2"))
            and label.get_text(strip=True) == "Picks"
        ),
        None,
    )
    if not picks_row:
        print("[DEBUG] No picks row found.")
        return []

    picks_container = picks_row.find("div", class_="col-10")
    if not picks_container:
        print("[DEBUG] No col-10 div next to 'Picks'.")
        return []

    # Match on the href alone: some anchors lack `class="black_link"`.
    champion_links = picks_container.find_all(
        "a", href=re.compile(r"/champion/champion-stats/")
    )

    # hrefs are relative ("../champion/..."); drop the ".." and anchor them
    # at the site root before fetching the champion page.
    return [
        get_champion_name(
            urljoin(BASE_URL, anchor.get("href", "").replace("..", ""))
        )
        for anchor in champion_links
    ]

def _parse_team_header(side_div, header_selector: str, default_name: str) -> tuple[str, bool]:
    """Return (team name, won flag) read from one side's header element."""
    header = side_div.select_one(header_selector)
    if not header:
        return default_name, False

    # Usually looks like: "Team Heretics - LOSS" or "... - WIN"
    header_text = header.get_text(strip=True)
    if "- WIN" in header_text:
        return header_text.replace("- WIN", "").strip(), True
    if "- LOSS" in header_text:
        return header_text.replace("- LOSS", "").strip(), False
    # Header present but carrying neither marker: keep the placeholder name.
    return default_name, False


def scrape_lec_winter_2025_test(limit: int = 1):
    """
    Scrape the first `limit` matches from the LEC Winter 2025 page on gol.gg.

    Each usable row of the match list is followed to its game page, where the
    team names, the blue-side result and both sides' champion picks are read.
    Rows with fewer than 7 cells, rows without a game link, and game pages
    lacking two side containers are skipped and do not count towards `limit`.

    Args:
        limit: Maximum number of matches to return.

    Returns:
        A list of dicts with the keys "patch", "blueTeamName", "redTeamName",
        "blueTeamWin", "blueChamps" and "redChamps".

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}

    print(f"[DEBUG] Fetching tournament match list: {TOURNAMENT_URL}")
    # Timeouts keep a stalled connection from hanging the whole scrape.
    resp = requests.get(TOURNAMENT_URL, headers=request_headers, timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")

    rows = soup.select("table.table_list tbody tr")
    print(f"[DEBUG] Found {len(rows)} rows in the match list.")

    scraped_data = []

    for row in rows:
        # Only successfully scraped games count towards the limit.
        if len(scraped_data) >= limit:
            break

        cells = row.find_all("td")
        if len(cells) < 7:
            continue

        patch = cells[5].get_text(strip=True)

        # Link to the detailed game page, e.g. "../game/stats/62978/page-game/".
        # A row without an anchor (or without an href) is skipped rather than
        # crashing on `None["href"]`.
        game_anchor = cells[0].find("a")
        game_link_rel = game_anchor.get("href") if game_anchor else None
        if not game_link_rel:
            print("[DEBUG] Row has no game link; skipping.")
            continue
        game_url = urljoin(BASE_URL, game_link_rel.replace("..", ""))

        print(f"\n[DEBUG] Scraping game page: {game_url}")
        time.sleep(0.5)  # be polite between game-page requests
        game_resp = requests.get(game_url, headers=request_headers, timeout=10)
        game_soup = BeautifulSoup(game_resp.text, "html.parser")

        sides = game_soup.select("div.col-12.col-sm-6")
        if len(sides) < 2:
            print("[DEBUG] Could not find both blue and red side divs.")
            continue

        # The first side container is treated as blue, the second as red.
        blue_team_name, blue_win = _parse_team_header(
            sides[0], ".blue-line-header", "UnknownBlue"
        )
        # Only the blue result is recorded; the red flag is not needed.
        red_team_name, _ = _parse_team_header(
            sides[1], ".red-line-header", "UnknownRed"
        )

        # -- Gather picks
        blue_champs = get_picked_champions(sides[0])
        red_champs = get_picked_champions(sides[1])

        print(f"[DEBUG] Blue champs: {blue_champs}")
        print(f"[DEBUG] Red champs: {red_champs}")

        scraped_data.append(
            {
                "patch": patch,
                "blueTeamName": blue_team_name,
                "redTeamName": red_team_name,
                "blueTeamWin": blue_win,
                "blueChamps": blue_champs,
                "redChamps": red_champs,
            }
        )

    return scraped_data

if __name__ == "__main__":
    # Scrape up to 100 matches and echo every record. `results` is bound at
    # module level, where the later top-level create_csv(...) call reads it.
    results = scrape_lec_winter_2025_test(limit=100)
    print("\n[DEBUG] Final scraped data:")
    for match_record in results:
        print(match_record)


In [None]:
import csv
import requests
import time
from typing import List, Dict
from utils.rl.champions import Champion  # your champion Enum
from fuzzywuzzy import process  # or from rapidfuzz import process

# Exact-match lookup table from normalized champion name to Champion member,
# e.g. {"ornn": Champion.ORNN, "gnar": Champion.GNAR, ...}.
# Keys are champion.display_name passed through normalize_name(), so they are
# lowercase with apostrophes and spaces removed ("K'Sante" -> "ksante").
lookup_dict: dict = {}


def normalize_name(name: str) -> str:
    """
    Reduce a champion name to a lowercase, alphanumeric-only comparison key.

    Every character that is not a letter or digit is dropped: apostrophes
    (straight or typographic), spaces, periods, ampersands and so on. This
    lets spelling variants of the same champion collapse to one key, e.g.
    "K'Sante" and "KSante" both become "ksante", and "Dr. Mundo" becomes
    "drmundo".

    Args:
        name: Champion name as displayed by any source.

    Returns:
        The normalized key; an empty string if `name` has no alphanumerics.
    """
    return "".join(ch for ch in name if ch.isalnum()).lower()


# Fill the lookup table once: one entry per Champion member, keyed by its
# normalized display name.
lookup_dict.update(
    (normalize_name(champ.display_name), champ) for champ in Champion
)


def map_champion_name_to_id(website_name: str, min_score: int = 50) -> int:
    """
    Map a champion name from Gol.GG (e.g. "KSante") to its id in `Champion`.

    An exact match on the normalized name is tried first; otherwise the
    closest key in `lookup_dict` is accepted if its fuzzy score is strictly
    greater than `min_score`.

    Args:
        website_name: Champion name as scraped from the website.
        min_score: Score a fuzzy match must exceed to be accepted
            (default 50, an arbitrary cutoff).

    Returns:
        The matching champion's `id`, or -1 when nothing matches.
    """
    normalized = normalize_name(website_name)
    if normalized in lookup_dict:
        return lookup_dict[normalized].id

    # Nothing to compare: an empty query cannot score above the cutoff and
    # an empty table has no candidates.
    if not normalized or not lookup_dict:
        return -1  # "UNKNOWN"

    # Otherwise, do a fuzzy match across known keys.
    # This helps if the site name is slightly off, e.g. a missing letter.
    match = process.extractOne(normalized, list(lookup_dict))
    if match is None:
        return -1  # "UNKNOWN"

    # fuzzywuzzy returns (choice, score); rapidfuzz returns
    # (choice, score, index). Indexing works with both.
    best_match, score = match[0], match[1]
    if score > min_score:
        return lookup_dict[best_match].id

    # If still no match, return the sentinel:
    return -1  # "UNKNOWN"


def predict_blue_win_probability(
    blue_champs: List[str],
    red_champs: List[str],
    patch: str,
    api_key: str = "YOUR_API_KEY",
) -> float:
    """
    Ask the /predict endpoint for the blue side's win probability.

    Champion names are translated to ids with `map_champion_name_to_id` and
    sent as one list, blue picks first and red picks after them, together
    with the patch and a fixed `numerical_elo` of 0.

    Returns the "win_probability" field of the JSON response. If anything
    goes wrong (request failure, error status, malformed response) the error
    is printed and 0.5 is returned instead.
    """
    # Single flat list: [blue1..blue5, red1..red5].
    blue_ids = [map_champion_name_to_id(name) for name in blue_champs]
    red_ids = [map_champion_name_to_id(name) for name in red_champs]

    payload = {
        "champion_ids": blue_ids + red_ids,
        "numerical_elo": 0,  # 0 for master +
        "patch": patch,
    }
    print(f"payload: {payload}")

    try:
        response = requests.post(
            "https://loldraftai.com/api/predict",
            json=payload,
            headers={"Content-Type": "application/json", "X-API-Key": api_key},
            timeout=10,
        )
        response.raise_for_status()
        # Response body is shaped like {"win_probability": 0.62, ...}
        return response.json()["win_probability"]
    except Exception as e:
        print(f"Error calling model API: {e}")
        return 0.5  # fallback: 50% if error


def create_csv(matches: List[Dict], output_csv: str = "matches_predictions.csv"):
    """
    Write one CSV row per match with the model's blue-side prediction.

    Each element of `matches` is a dict such as:
    {
      "patch": "15.2",
      "blueTeamName": "SK Gaming",
      "redTeamName": "Fnatic",
      "blueTeamWin": False,
      "blueChamps": ["Ornn", "Diana", "Corki", "Varus", "Poppy"],
      "redChamps": ["Gnar", "Zyra", "Yone", "Ezreal", "Rell"]
    }

    The output file is overwritten. It holds the patch, both team names,
    both pick lists joined with "|", the predicted blue win percentage and
    whether the prediction agreed with the actual result.
    """
    columns = [
        "patch",
        "blueTeamName",
        "redTeamName",
        "blueChamps",
        "redChamps",
        "predictedWinPct",  # model's percentage for Blue side
        "modelWasCorrect",
    ]

    with open(output_csv, mode="w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()

        for match in matches:
            patch = match["patch"]
            blue_team = match["blueTeamName"]
            red_team = match["redTeamName"]
            blue_won = match["blueTeamWin"]  # True or False
            blue_picks = match["blueChamps"]
            red_picks = match["redChamps"]

            win_prob = predict_blue_win_probability(blue_picks, red_picks, patch)
            print(f"win_prob: {win_prob}")

            # A probability of 0.5 or more counts as "blue predicted to win";
            # the model is correct when that agrees with the actual outcome.
            model_correct = (win_prob >= 0.5) == blue_won

            row = {
                "patch": patch,
                "blueTeamName": blue_team,
                "redTeamName": red_team,
                "blueChamps": "|".join(blue_picks),
                "redChamps": "|".join(red_picks),
                "predictedWinPct": round(win_prob * 100, 2),
                "modelWasCorrect": model_correct,
            }
            writer.writerow(row)

            # Short pause between calls to go easy on the prediction API.
            time.sleep(0.3)

    print(f"Done. Predictions written to {output_csv}")


# Score every scraped match and write the predictions file. `results` is the
# list assigned under the `__main__` guard of the scraping cell.
create_csv(matches=results, output_csv="matches_predictions.csv")