## Initial Setup

In [1]:
import pandas as pd
from bs4 import BeautifulSoup as BSoup
from bs4 import Comment
from helper.session import Session
import csv
import os

In [2]:
# One shared Session instance; every page request in this notebook goes through session.fetch().
session = Session()

In [3]:
front_url = "https://www.baseball-reference.com" # site root; the relative hrefs scraped from roster tables are appended to it

## Dup Check Setup

In [4]:
# Seed the player CSV with its column names when it is missing or still empty,
# so later cells can append rows and strip the first line as a header.
file_path = '../data/br_player.csv'

file_has_content = os.path.exists(file_path) and os.path.getsize(file_path) != 0
if not file_has_content:
    with open(file_path, 'w', newline='') as file:
        csv.writer(file).writerow(["name_eng", "date_of_birth", "url"])

In [5]:
# Create the progress log (one row per finished team/season) with its header
# row, unless a non-empty log already exists from an earlier run.
file_path = '../data/br_finished_team_year.csv'

log_is_missing = not os.path.exists(file_path)
if log_is_missing or os.path.getsize(file_path) == 0:
    with open(file_path, 'w', newline='') as file:
        columns = ["name", "year"]
        csv.writer(file).writerow(columns)

In [6]:
# Load what earlier runs already recorded so it can be filtered out as duplicate.
# csv.reader yields strings, so every tuple in these sets contains strings only.
def _read_logged_rows(csv_path):
    """Return the rows of csv_path as a set of tuples, with the header row removed.

    An empty file yields an empty set.
    """
    # `with` closes the file handle as soon as the rows are read;
    # newline='' is what the csv module asks for on files handed to csv.reader.
    with open(csv_path, 'r', newline='') as log_file:
        return set([tuple(row) for row in csv.reader(log_file)][1:])


added_team_and_year = _read_logged_rows('../data/br_finished_team_year.csv')
added_player = _read_logged_rows('../data/br_player.csv')

## Main Code

In [7]:
# Team/season list to crawl. Each entry is one CSV row as a plain list; the main
# loop unpacks it as (name, year, url).
df = pd.read_csv('../data/br_team_year.csv')
team_urls = df.to_numpy().tolist()

In [8]:
# Index the already-recorded players by profile URL (third CSV column). The URL
# carries the site's unique player id, so a known player can be recognised
# before its profile page is requested instead of after.
known_players = {row[2]: row for row in added_player if len(row) >= 3}

for name, year, url in team_urls:

    # Rows loaded from the progress CSV are tuples of strings, whereas `year`
    # coming out of the DataFrame may be numeric. Compare on a string-normalised
    # key, otherwise a finished season would never be recognised.
    team_year_key = (str(name), str(year))
    if team_year_key in added_team_and_year:
        print(f"already done, skip over name: {name}, year: {year}")
        continue

    team_data = session.fetch(url).content.decode("utf-8") # each team's page
    team_html = BSoup(team_data, "lxml")
    comment_wrapper = team_html.find(id="all_standard_roster") # tag that contains desired table as a comment inside
    comment = None
    if comment_wrapper is not None:
        comment = comment_wrapper.find(string=lambda text: isinstance(text, Comment)) # extract comment
    if comment is None:
        # Nothing to parse: leave the season unmarked so a later run retries it.
        print(f"roster table not found, left unfinished name: {name}, year: {year}")
        continue
    players = BSoup(comment, "lxml").find_all("tr")[1:] # convert to HTML without header row

    # Becomes False when a player could not be recorded; the season is then
    # not logged as finished.
    all_players_recorded = True

    with open('../data/br_player.csv', 'a', newline='') as players_info:

        writer = csv.writer(players_info)

        for player in players:
            player_link = player.find('a') # ex) <a href="/register/player.fcgi?id=ahn---000seu">Seung Han Ahn</a>
            if player_link is None or not player_link.get('href'):
                continue # row without a player link, nothing to record

            name_eng = player_link.text
            player_url = front_url + player_link['href']

            # filter based on the unique id in the href link, before spending a request
            if player_url in known_players:
                print("dup skipped: ", known_players[player_url])
                continue

            player_data = session.fetch(player_url).content.decode("utf-8")
            player_html = BSoup(player_data, "lxml")
            birth_tag = player_html.find(id = "necro-birth")
            dob = birth_tag.get('data-birth') if birth_tag is not None else None
            if dob is None:
                all_players_recorded = False
                print("birth date not found, not recorded: ", player_url)
                continue

            player_row = (name_eng, dob, player_url)

            added_player.add(player_row)
            known_players[player_url] = player_row
            writer.writerow(player_row)
            print("newly added: ", player_row)

    if not all_players_recorded:
        # Known players are skipped without a request, so a retry only
        # re-fetches the ones that are still missing.
        print(f"Some players missing, left unfinished name: {name}, year: {year}")
        continue

    print(f"Finished processing {name} {year}.")

    with open('../data/br_finished_team_year.csv', 'a', newline='') as done_team_year:
        writer2 = csv.writer(done_team_year)
        writer2.writerow((name, year))
        added_team_and_year.add(team_year_key)

newly added:  ('Seung Han Ahn', '1992-01-25', 'https://www.baseball-reference.com/register/player.fcgi?id=ahn---000seu')
newly added:  ('Raúl Alcántara', '1992-12-04', 'https://www.baseball-reference.com/register/player.fcgi?id=alcant001rau')
newly added:  ('Jordan Balazovic', '1998-09-17', 'https://www.baseball-reference.com/register/player.fcgi?id=balazo000jor')
newly added:  ('Kyu Bin Chang', '2001-04-21', 'https://www.baseball-reference.com/register/player.fcgi?id=chang-001kyu')
newly added:  ('Ji Kang Choi', '2001-07-23', 'https://www.baseball-reference.com/register/player.fcgi?id=choi--000jik')
newly added:  ('Jong In Choi', '2001-05-01', 'https://www.baseball-reference.com/register/player.fcgi?id=choi--002jon')
newly added:  ('Jun Ho Choi', '2004-06-03', 'https://www.baseball-reference.com/register/player.fcgi?id=choi--002jun')
newly added:  ('Seung Yong Choi', '2001-05-11', 'https://www.baseball-reference.com/register/player.fcgi?id=choi--004seu')
have called 10 apis: 1 min bre

KeyboardInterrupt: 