# Scratch Code
### Retrieve Site and Parse HTML

In [None]:
#%pip install selenium
from selenium import webdriver
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Configure a headless Chrome session with the automation switches disabled.
options = webdriver.ChromeOptions()

for flag in (
    '--headless',
    "--incognito",
    "--nogpu",
    "--disable-gpu",
    "--window-size=1280,1280",
    "--no-sandbox",
    "--enable-javascript",
):
    options.add_argument(flag)
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')

# Random user agent string supplied by fake_useragent
ua = UserAgent()
userAgent = ua.random

driver = webdriver.Chrome(options=options)
try:
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})

    driver.get('https://mykbostats.com/players/2001')

    soup = BeautifulSoup(driver.page_source,"html.parser")
finally:
    # Always shut the browser down, even when loading or parsing fails,
    # so no orphaned Chrome process is left behind.
    driver.quit()

### Player Name and Team

In [None]:
# Keep the part of the page title that precedes the first ' KBO'.
# partition() returns the whole title when the marker is absent, whereas
# slicing with find()'s -1 result would silently drop the last character.
t = soup.title.text
player = t.partition(' KBO')[0]
#role = [n['name'] for n in soup.find_all('tbody', class_='syncscroll', limit=2)]

# Short team label -> full team name
teams = {'Doosan': 'Doosan Bears',
         'Hanwha': 'Hanwha Eagles',
         'Kia': 'Kia Tigers',
         'Kiwoom': 'Kiwoom Heroes',
         'KT': 'KT Wiz',
         'LG': 'LG Twins',
         'Lotte': 'Lotte Giants',
         'NC': 'NC Dinos',
         'Samsung': 'Samsung Lions',
         'SSG': 'SSG Landers'}



### Table Headers

In [None]:
#Replace header with this
#might also be good for checking role of player

# Column titles per player role; a role stays None when the page has no such table.
header = {"Pitcher": None, "Batter": None}

# <thead name="..."> value -> key in `header`
role_for_table = {'pitching': 'Pitcher', 'batting': 'Batter'}

for thead in soup.find_all('thead'):
    role = role_for_table.get(thead['name'])
    if role is not None:
        # Whitespace-separated header text becomes the list of column titles
        header[role] = thead.get_text().split()


### Get Table Contents 
Write the contents to a dataframe, then write to CSV.

In [None]:
import pandas as pd
from unicodedata import numeric


#to handle unicode data in table
def uni_to_num(unicode):
    '''
    Convert the text of a table cell into a number where possible.

    - text containing '(' is returned unchanged
    - an empty string gives None
    - a single character is converted with unicodedata.numeric
      (covers plain digits and vulgar fractions such as '½')
    - text ending in a digit is parsed with float()
    - otherwise the last character is treated as a vulgar fraction and
      added to the float value of the preceding characters

    Text that cannot be parsed as a number (for example '-' or 'N/A') is
    returned unchanged instead of raising ValueError.
    '''
    if '(' in unicode:
        return unicode
    if not unicode:
        return None

    try:
        if len(unicode) == 1:
            return numeric(unicode)
        if unicode[-1].isdigit():
            # normal number, ending in [0-9]
            return float(unicode)
        # Assume the last character is a vulgar fraction
        return float(unicode[:-1]) + numeric(unicode[-1])
    except ValueError:
        # Not numeric after all: keep the raw text, like the '(' case above
        return unicode

pitchdata = None
batdata = None

pitchtemp = []
battemp = []

# Collected rows per stat table, keyed by the <thead name="..."> attribute
rows_by_kind = {'pitching': pitchtemp, 'batting': battemp}

# go through all the tables in the page
for r in soup.find_all('table'):
    thead = r.thead
    kind = thead.get('name') if thead is not None else None
    # Only the 'pitching' and 'batting' tables are wanted. Skip everything
    # else (e.g. the pitching-games / batting-games tables) without ending
    # the scan, so a wanted table that appears later is still processed.
    if kind not in rows_by_kind:
        continue

    # Go through the table bodies
    for tb in r.select('tbody tr'):
        # skip the empty or career rows
        label = tb.select_one('.left')
        if label is None or label.text.strip() == 'Career':
            continue

        # Row starts with the player name and the first two columns as text;
        # the remaining cells go through uni_to_num ('' becomes None).
        cells = tb.select('td')
        t = [player, cells[0].text, cells[1].text.strip()]
        t.extend(uni_to_num(cell.text.strip()) for cell in cells[2:])

        rows_by_kind[kind].append(t)

    # make dataframes
    headers = ['Name'] + thead.get_text().split()
    frame = pd.DataFrame(data=rows_by_kind[kind], columns=headers)
    if kind == 'pitching':
        pitchdata = frame
    else:
        batdata = frame



print(pitchdata)
print(batdata)

# Python Script

In [149]:
#%pip install ...
import pandas as pd
from fake_useragent import UserAgent
from selenium import webdriver
from bs4 import BeautifulSoup
from unicodedata import numeric
from time import sleep

def uni_to_num(unicode):
    '''
    Given the text of a table cell, convert it into a number where possible.

    - text containing '(' is returned unchanged
    - an empty string gives None
    - a single character is converted with unicodedata.numeric
      (covers plain digits and vulgar fractions such as '½')
    - text ending in a digit is parsed with float()
    - otherwise the last character is treated as a vulgar fraction and
      added to the float value of the preceding characters

    Text that cannot be parsed as a number (for example '-' or 'N/A') is
    returned unchanged instead of raising ValueError.
    '''
    if '(' in unicode:
        return unicode
    if not unicode:
        return None

    try:
        if len(unicode) == 1:
            return numeric(unicode)
        if unicode[-1].isdigit():
            # normal number, ending in [0-9]
            return float(unicode)
        # Assume the last character is a vulgar fraction
        return float(unicode[:-1]) + numeric(unicode[-1])
    except ValueError:
        # Not numeric after all: keep the raw text, like the '(' case above
        return unicode


def get_player(soup):
    '''
    Given BeautifulSoup object with HTML code, return player name.

    The name is the part of the page <title> text that precedes the first
    ' KBO'. When the title does not contain ' KBO', the whole title text is
    returned unchanged.
    '''
    # partition() keeps the full title when the marker is absent; slicing
    # with find()'s -1 result would silently drop the last character.
    return soup.title.text.partition(' KBO')[0]
    


def get_website(url):
    '''
    Given string containing url of website, return BeautifulSoup object with parsed HTML code.

    A new headless Chrome session is started for every call and is always
    shut down before returning, including when loading the page fails.
    '''

    # Make sure to have ChromeDriver
    options = webdriver.ChromeOptions()

    for flag in (
        '--headless',
        "--incognito",
        "--nogpu",
        "--disable-gpu",
        "--window-size=1280,1280",
        "--no-sandbox",
        "--enable-javascript",
    ):
        options.add_argument(flag)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument('--disable-blink-features=AutomationControlled')

    # Random user agent string supplied by fake_useragent
    userAgent = UserAgent().random

    driver = webdriver.Chrome(options=options)
    try:
        # NOTE(review): this script runs before driver.get(); confirm that the
        # navigator.webdriver override is still in effect on the loaded page.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})

        driver.get(url)

        return BeautifulSoup(driver.page_source,"html.parser")
    finally:
        # Without this every call would leave a Chrome process running.
        driver.quit()



def fix_team_names(df):
    '''
    Given dataframe with player information, adjust Team names column to represent the full team name.

    The dataframe is modified in place and nothing is returned. Values of the
    'Team' column that are not one of the known short names (including names
    that are already expanded) are left unchanged, so calling this twice on
    the same dataframe is safe.
    '''
    teams = {'Doosan': 'Doosan Bears',
             'Hanwha': 'Hanwha Eagles',
             'Kia': 'Kia Tigers',
             'Kiwoom': 'Kiwoom Heroes',
             'KT': 'KT Wiz',
             'LG': 'LG Twins',
             'Lotte': 'Lotte Giants',
             'NC': 'NC Dinos',
             'Samsung': 'Samsung Lions',
             'SSG': 'SSG Landers'}

    # Nothing to rename when there are no rows
    if df.empty:
        return

    # Fall back to the original value so an unknown team does not raise KeyError
    df['Team'] = df['Team'].map(lambda name: teams.get(name, name))



from pathlib import Path

# Output file per stat table, keyed by the table's <thead name="..."> attribute
CSV_BY_KIND = {'pitching': Path('KBO_Pitchers.csv'),
               'batting': Path('KBO_Batters.csv')}

#count which pages have been scraped
count = 0

for i in range(2000,2002):
    pitchdata = None
    batdata = None

    pitchtemp = []
    battemp = []
    rows_by_kind = {'pitching': pitchtemp, 'batting': battemp}

    url = f"https://mykbostats.com/players/{i}"
    soup = get_website(url)

    player = get_player(soup)

    # go through all the tables in the page
    for r in soup.find_all('table'):
        thead = r.thead
        kind = thead.get('name') if thead is not None else None
        # Only the 'pitching' and 'batting' tables are wanted. Skip everything
        # else (e.g. the pitching-games / batting-games tables) without ending
        # the scan, so a wanted table that appears later is still processed.
        if kind not in rows_by_kind:
            continue

        rows = rows_by_kind[kind]

        # Go through the table bodies
        for tb in r.select('tbody tr'):
            # skip the empty or career rows
            label = tb.select_one('.left')
            if label is None or label.text.strip() == 'Career':
                continue

            # Row starts with the player name and the first two columns as text;
            # the remaining cells go through uni_to_num ('' becomes None).
            cells = tb.select('td')
            t = [player, cells[0].text, cells[1].text.strip()]
            t.extend(uni_to_num(cell.text.strip()) for cell in cells[2:])
            rows.append(t)

        # make dataframes
        headers = ['Name'] + thead.get_text().split()
        frame = pd.DataFrame(data=rows, columns=headers)
        fix_team_names(frame)
        if kind == 'pitching':
            pitchdata = frame
        else:
            batdata = frame

        # Write the column header only when the CSV is new or still empty.
        # Tying it to the page number (i <= 2) never fires for this id range,
        # which left the files without a header row.
        csv_path = CSV_BY_KIND[kind]
        needs_header = not csv_path.exists() or csv_path.stat().st_size == 0
        frame.to_csv(csv_path, header=needs_header, index=False, mode='a')

    count += 1

    sleep(5) #website crawl-delay



The chromedriver version (133.0.6943.141) detected in PATH at chromedriver.EXE might not be compatible with the detected chrome version (134.0.6998.35); currently, chromedriver 134.0.6998.35 is recommended for chrome 134.*, so it is advised to delete the driver in PATH and retry
The chromedriver version (133.0.6943.141) detected in PATH at chromedriver.EXE might not be compatible with the detected chrome version (134.0.6998.35); currently, chromedriver 134.0.6998.35 is recommended for chrome 134.*, so it is advised to delete the driver in PATH and retry
