# Testing
### Retrieve Site and Parse HTML

In [59]:
#%pip install selenium
from selenium import webdriver
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

options = webdriver.ChromeOptions()

options.add_argument('--headless')
options.add_argument("--incognito")
options.add_argument("--nogpu")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1280,1280")
options.add_argument("--no-sandbox")
options.add_argument("--enable-javascript")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')

ua = UserAgent()
userAgent = ua.random

driver = webdriver.Chrome(options=options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})

driver.get('https://mykbostats.com/players/2000')

soup = BeautifulSoup(driver.page_source,"html.parser")


driver.quit()

### Player Name and Team

In [63]:
t = soup.select('title')[0].text.strip()
player = t[:t.find(' - ')]

teams = {'Doosan': 'Doosan Bears', 
         'Hanwha': 'Hanwha Eagles', 
         'Kia': 'Kia Tigers', 
         'Kiwoom': 'Kiwoom Heroes', 
         'KT': 'KT Wiz', 
         'LG': 'LG Twins', 
         'Lotte': 'Lotte Giants', 
         'NC': 'NC Dinos', 
         'Samsung': 'Samsung Lions', 
         'SSG': 'SSG Landers'}

### Table Headers

In [None]:
# extract table header information
header = soup.select('thead th')
for i in range(len(header)):
  header[i] = header[i].text.strip()

# remove 'Game Stats' data
header = header[:header.index("Date")]
header = ['Name'] + header

[]


ValueError: 'Date' is not in list

### Get Table Contents 
Write to contents to dataframe, then write to CSV. 

In [58]:
#%pip install pandas
import pandas as pd
from unicodedata import numeric


#to handle unicode data in table
def uni_to_num (unicode):
    if ('(' in unicode):
       return unicode
    elif len(unicode) == 0:
        return None
    elif len(unicode) == 1:
        num = numeric(unicode)
    elif unicode[-1].isdigit():
        # normal number, ending in [0-9]
        num = float(unicode)
    else:
        # Assume the last character is a vulgar fraction
        num = float(unicode[:-1]) + numeric(unicode[-1])
    return num


#data to be inserted into dataframe later
temp = []

# Parse row data and add to temp
rows = soup.select('tbody tr')

for r in rows:
  if (r.select_one('.left').text.strip() == 'Career'):
    break

  t = [player, r.select_one('.left').text.strip(), r.select_one('nobr').text.strip()]  
  for i in r.select('td')[2:]:
    t.append(uni_to_num(i.text.strip()))
  temp.append(t)


# init dataframe
data = pd.DataFrame(data = temp, columns = header)

# write to CSV
data.to_csv('data.csv')

# read CSV
#df = pd.read_csv('.csv')

ValueError: 56 columns passed, passed data had 29 columns

# Python Script

In [None]:
#%pip install ...
import pandas as pd
from fake_useragent import UserAgent
from selenium import webdriver
from bs4 import BeautifulSoup
from unicodedata import numeric
from time import sleep

def uni_to_num (unicode):
    '''
    Given string of unicode, convert into numerics.
    '''
    
    if ('(' in unicode):
       return unicode
    elif len(unicode) == 0:
        return None
    elif len(unicode) == 1:
        num = numeric(unicode)
    elif unicode[-1].isdigit():
        # normal number, ending in [0-9]
        num = float(unicode)
    else:
        # Assume the last character is a vulgar fraction
        num = float(unicode[:-1]) + numeric(unicode[-1])

    return num



def get_row_data (rows, player, header):
    '''
    Given HTML code of table with data and tuple containing player name and role, extract data and return as dataframe.
    In the case of an empty page, prematurely return empty dataframe
    '''
    
    rows = soup.select('tbody tr')
    temp = []

    if (rows == []):
        return pd.DataFrame(data = temp, columns = header)
    
    for r in rows:
        # break loop to exclude data from Career onwards
        if (r.select_one('.left').text.strip() == 'Career'):
            break

        t = [player[0], r.select_one('.left').text.strip(), r.select_one('nobr').text.strip()]  
        for i in r.select('td')[2:]:
            t.append(uni_to_num(i.text.strip()))
        temp.append(t)

    return pd.DataFrame(data = temp, columns = header)



def get_header (soup):
    '''
    Given a BeautifulSoup object of HTML code, extract table heading information as list of strings
    In the case of an empty page, prematurely return empty list
    '''
    
    header = soup.select('thead th')
    if (header==[]):
        return []

    for i in range(len(header)):
        header[i] = header[i].text.strip()

    # remove 'Game Stats' data
    header = header[:header.index("Date")]
    
    return ['Name'] + header



def get_player (soup):
    '''
    Given BeautifulSoup object with HTML code, return tuple with player name and role (pitcher/batter) 
    '''
    
    t = soup.select('title')[0].text.strip()
    
    name = t[:t.find(' KBO')]
    
    if ("Pitching" in t):
        role = "Pitcher"
    elif ("Batting" in t):
        role = "Batter"
    
    return (name, role)



def get_website (url):
    '''
    Given string containing url of website, return BeautifulSoup object with parsed HTML code
    '''

    # Make sure to have ChromeDriver
    options = webdriver.ChromeOptions()

    options.add_argument('--headless')
    options.add_argument("--incognito")
    options.add_argument("--nogpu")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1280,1280")
    options.add_argument("--no-sandbox")
    options.add_argument("--enable-javascript")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument('--disable-blink-features=AutomationControlled')

    ua = UserAgent()
    userAgent = ua.random

    driver = webdriver.Chrome(options=options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": userAgent})

    driver.get(url)

    return BeautifulSoup(driver.page_source,"html.parser")



def fix_team_names (df):
    '''
    Given dataframe with player information, adjust Team names column to represent the full team name
    '''
    teams = {'Doosan': 'Doosan Bears', 
            'Hanwha': 'Hanwha Eagles', 
            'Kia': 'Kia Tigers', 
            'Kiwoom': 'Kiwoom Heroes', 
            'KT': 'KT Wiz', 
            'LG': 'LG Twins', 
            'Lotte': 'Lotte Giants', 
            'NC': 'NC Dinos', 
            'Samsung': 'Samsung Lions', 
            'SSG': 'SSG Landers'}
    
    for i in df.index:
        df.loc[i, 'Team'] = teams[df.loc[i, 'Team']]



#count which pages have been scraped
count = 0

for i in range(2000, 2002): 
    
    # Use to see if dataframe headers should be written to csv.
    # At time of writing, first two pages are pitcher and batter, respectively.
    # Use these two to incude header in csv then ignore headers for pages after.
    if (i <= 2):
        yeshead = True
    else:
        yeshead = True

    url = "https://mykbostats.com/players/" + str(i)
    soup = get_website(url)


    header = get_header(soup)
    if (header==[]):
        continue #skip iteration if page is empty
    player = get_player(soup)
    rows = soup.select('tbody tr')
    data = get_row_data(rows, player, header)

    fix_team_names(data)

    if (player[1] == "Pitcher"):
        data.to_csv('KBO_Pitchers.csv', header=yeshead, mode='a')

    if (player[1] == "Batter"):
        data.to_csv('KBO_Batters.csv', header=yeshead, mode='a')

    count += 1

    sleep(5) #website crawl-delay

<html lang="en"><head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="Jz0XNSgqVid9GyMPAylLDxYXIBYGbQEIREnVLMejMNQKUe-GDZHNU5PP" name="csrf-token"/>
<title>Gu Jang-ik KBO League Batting Stats - Doosan Bears | MyKBO Stats</title>
<meta content="Profile, batting stats, recent games and videos for Gu Jang-ik of the Doosan Bears (KBO League)" name="description"/>
<meta content="app-id=1107341048" name="apple-itunes-app"/>
<meta content="MyKBO Stats" property="og:site_name"/>
<meta content="website" property="og:type"/>
<meta content="http://mykbostats.com/players/2000" property="og:url"/>
<meta content="Gu Jang-ik KBO League Batting Stats - Doosan Bears" property="og:title"/>
<meta content="Profile, batting stats, recent games and videos for Gu Jang-ik of the Doosan Bears (KBO League)" property="og:description"/>
<meta content="237578156426868" property="fb:app_id"/>
<meta content="@MyKBO" name="twitter:site"/>
<meta content="summary" name="twi

ValueError: 56 columns passed, passed data had 29 columns