In [1]:
!pip install selenium



In [2]:
!pip install webdriver-manager



In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd

def get_role_data(role="TOP",
                  url="https://gol.gg/players/list/season-S15/split-ALL/tournament-LCK%20Spring%202024/",
                  wait_seconds=5):
    """Scrape the players table from a gol.gg player list page, filtered by role.

    Starts a Chrome session, loads ``url``, writes ``role`` into the page's
    hidden ``hiddenfieldrole`` input, submits the ``FilterForm`` form, sleeps
    for ``wait_seconds`` and then parses the players table out of the
    resulting page source.

    Args:
        role: Value written into the hidden role field before submitting the
            filter form (the notebook uses "TOP", "JUNGLE", "MID", "SUPPORT"
            and "ADC").
        url: Address of the player list page to load. Defaults to the page
            the notebook was written against.
        wait_seconds: Fixed delay, in seconds, between submitting the form
            and reading the page source.

    Returns:
        dict: Maps the first cell of every data row (e.g. the player name) to
        a dict of ``{column header: cell text}`` for the remaining columns.
        All values are the raw, stripped cell strings. The dict is empty when
        the table is not found or holds no data rows; a message is printed in
        either case.

    Raises:
        Any exception raised by Selenium while loading the page or locating
        the form elements propagates to the caller; the browser is closed
        first.
    """
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

    extracted_data = {}

    try:
        # 1) Load the page inside the try block so that a failed navigation
        #    still reaches driver.quit() and does not leak the browser.
        driver.get(url)

        # 2) Locate the hidden input and set it to 'role'
        hidden_role = driver.find_element(By.ID, "hiddenfieldrole")
        driver.execute_script("arguments[0].value = arguments[1];", hidden_role, role)

        # 3) Locate and submit the form
        form = driver.find_element(By.ID, "FilterForm")
        form.submit()

        # 4) Wait for the new page to load (fixed delay, not an explicit wait)
        time.sleep(wait_seconds)

        # 5) Parse the updated page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # 6) Locate the updated table
        table = soup.select_one("table.table_list.playerslist.tablesaw.trhover.tablesaw-swipe.tablesaw-sortable")
        if table:
            # Collect the stripped text of every row that has at least one cell.
            table_data = []
            for row in table.find_all('tr'):
                cell_texts = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
                if cell_texts:
                    table_data.append(cell_texts)

            # Build a dict-of-dicts from the table
            if len(table_data) > 1:
                headers = table_data[0]  # first row: column names
                for cells in table_data[1:]:
                    # The first cell is the row key (e.g. player name). zip()
                    # stops at the shorter sequence, so a row with fewer cells
                    # than headers simply yields fewer entries.
                    extracted_data[cells[0]] = dict(zip(headers[1:], cells[1:]))
            else:
                print("No valid data rows found for role:", role)
        else:
            print("Table not found for role:", role)

    finally:
        # 7) Close the browser
        driver.quit()

    return extracted_data

In [4]:
# Scrape each role separately; every call drives its own browser session.
# Each result is a dict keyed by the first cell of each table row.
top_data = get_role_data("TOP")
jg_data = get_role_data("JUNGLE")
mid_data = get_role_data("MID")
support_data = get_role_data("SUPPORT")
bot_data = get_role_data("ADC")

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Turn each per-role dictionary into a DataFrame (one row per dictionary key),
# tag it with a 'Position' column, then stack all roles into one frame.
roles = [
    ("TOP", top_data),
    ("JUNGLE", jg_data),
    ("MID", mid_data),
    ("SUPPORT", support_data),
    ("ADC", bot_data),
]
dfs = [
    pd.DataFrame.from_dict(data, orient='index').assign(Position=role)
    for role, data in roles
]

df = pd.concat(dfs)

# Convert percentage to float values
def percentage_to_float(x):
    """Convert a percentage string such as ``"55.5%"`` to a fraction (0.555).

    Args:
        x: Value to convert. Strings are stripped of surrounding whitespace
            and '%' characters, parsed as a float and divided by 100.

    Returns:
        The fraction as a float for parseable strings, ``np.nan`` for strings
        that cannot be parsed as a number (for example ``"-"`` or ``""``),
        and ``x`` itself, unchanged, for any non-string input.
    """
    if isinstance(x, str):
        try:
            return float(x.strip().strip('%')) / 100.0
        except ValueError:
            # Only a failed parse means "missing value"; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            return np.nan
    return x
# Turn the percentage-formatted columns into fractions; columns that are not
# in the frame are skipped.
for col in ['Win rate', 'KP%', 'FB %', 'FB Victim']:
    if col in df.columns:
        df[col] = df[col].apply(percentage_to_float)

# Data cleaning: drop NaN, missing value, country
expected_numeric_columns = ['Games', 'KDA', 'Avg kills', 'Avg deaths', 'Avg assists',
                            'CSM', 'GPM', 'DPM', 'VSPM', 'Avg WPM', 'Avg WCPM', 'Avg VWPM',
                            'GD@15', 'CSD@15', 'XPD@15', 'Penta Kills', 'Solo Kills']
missing_numeric_columns = [col for col in expected_numeric_columns if col not in df.columns]
if missing_numeric_columns:
    print("Numeric columns missing from the scraped data (skipped):", missing_numeric_columns)

# Keep only the columns that are actually present, so that dropna() below and
# any later df[numeric_columns]-style indexing do not raise KeyError for a
# column the conversion loop was already prepared to skip.
numeric_columns = [col for col in expected_numeric_columns if col in df.columns]
for col in numeric_columns:
    # Strip thousands separators and whitespace; unparseable values become NaN.
    df[col] = pd.to_numeric(
        df[col].astype(str).str.replace(',', '', regex=False).str.strip(),
        errors='coerce',
    )

# The target column is mandatory: fail with an explicit message instead of
# the bare KeyError that dropna() would raise.
if 'Win rate' not in df.columns:
    raise KeyError("'Win rate' column not found in the scraped data")

required_cols = numeric_columns + ['Win rate']
df = df.dropna(subset=required_cols)
if 'Country' in df.columns:
    df = df.drop(columns=['Country'])
# One-hot encode the position; drop_first removes one level to avoid a
# redundant dummy column.
df = pd.get_dummies(df, columns=['Position'], drop_first=True)

# Features are the numeric stat columns plus the one-hot position columns;
# the target is the win rate.
position_columns = [name for name in df.columns if name.startswith("Position_")]
features = numeric_columns + position_columns
X, y = df[features], df['Win rate']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares linear regression on the training rows.
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows (R^2 for LinearRegression).
print(model.score(X_test, y_test))

# Report the learned coefficient for each feature, in feature order.
print("Feature weights:")
for name, weight in zip(features, model.coef_):
    print(f"{name}: {weight:.4f}")

0.864520053090331
Feature weights:
Games: 0.0021
KDA: 0.0515
Avg kills: -0.0621
Avg deaths: 0.0177
Avg assists: 0.0441
CSM: -0.1622
GPM: 0.0075
DPM: 0.0001
VSPM: -0.1179
Avg WPM: 0.3255
Avg WCPM: 0.2103
Avg VWPM: -0.1152
GD@15: -0.0000
CSD@15: 0.0035
XPD@15: -0.0001
Penta Kills: 0.0066
Solo Kills: 0.0039
Position_JUNGLE: 0.2165
Position_MID: 0.1462
Position_SUPPORT: -0.2863
Position_TOP: 0.2690
