## Initial Setup

In [1]:
import pandas as pd
import csv
import os
import re
from bs4 import BeautifulSoup as BSoup
from bs4 import Comment
from helper.session import Session
from helper.url_parser import get_ct_code_and_year, team_url_to_back_number_url

In [2]:
# Shared fetch session used for every page request in this notebook.
# NOTE(review): the meaning of the constructor argument is defined in
# helper.session; the original note says its value doesn't really matter for statiz.
session = Session(1_000_000)

In [3]:
# Site root: hrefs scraped from pages are concatenated onto this.
front_url = "https://statiz.sporki.com"
# Starting point of the crawl, built from the site root.
initial_url = front_url + "/team/"

## Dup Check Setup

In [4]:
# Player table setup: make sure the CSV exists and starts with its header row.
file_path = '../data/st_player.csv'

# The data directory may not exist yet (e.g. on a fresh checkout); without it
# open(..., 'w') below raises FileNotFoundError.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write the header only when the file is missing or empty, so rows collected
# by an earlier run are never truncated.
if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        header = ["name_kor", "name_eng", "date_of_birth", "url"]
        writer.writerow(header)

In [5]:
# Progress-log setup: CSV of (code, year) pairs, created with its header row
# when it does not exist yet or is still empty.
file_path = '../data/st_finished_team_year.csv'

has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
if not has_content:
    with open(file_path, 'w', newline='') as file2:
        csv.writer(file2).writerow(["code", "year"])

In [6]:
def _load_rows_without_header(csv_path):
    """Return the data rows of ``csv_path`` as a set of tuples.

    The first row (the header) is dropped. The file is opened in a ``with``
    block so the handle is closed as soon as the rows are read, and with
    ``newline=''`` as the csv module requires for correct newline handling.
    """
    with open(csv_path, 'r', newline='') as source:
        rows = [tuple(row) for row in csv.reader(source)]
    return set(rows[1:])


# Sets used to filter duplicates; both exclude the header row.
added_team_and_year = _load_rows_without_header('../data/st_finished_team_year.csv')
added_player = _load_rows_without_header('../data/st_player.csv')

In [7]:
# print(added_team_and_year)
# print(len(added_team_and_year))
# print(len(added_player))

## Main Code

In [8]:
# Download and parse the starting page, which contains all teams' links.
index_response = session.fetch(initial_url)
teams_data = index_response.content.decode("utf-8")
teams_html = BSoup(teams_data, "lxml")

# Collect the ``select_con`` elements inside the ``team_list`` container.
team_list = teams_html.find(class_="team_list")
teams = team_list.find_all(class_="select_con")

In [9]:
def _split_player_name(raw_name):
    """Split a profile name heading into ``(name_kor, name_eng)``.

    The heading is expected to look like ``"<Korean name> (<English name>)"``.
    Everything before the first ``(`` is the Korean name. The remainder, minus
    a trailing ``)`` if present, is the English name. When the heading has no
    parenthesised part the English name is an empty string instead of an
    ``IndexError``.
    """
    korean, _, remainder = raw_name.strip().partition('(')
    english = remainder[:-1] if remainder.endswith(')') else remainder
    return korean.strip(), english


def _extract_date_of_birth(profile_html):
    """Return the digit groups of the first ``man_info`` list item joined by ``-``.

    Returns an empty string when the page has no ``man_info`` element or that
    element has no ``<li>``, instead of failing on a ``None`` lookup.
    """
    info = profile_html.find(class_="man_info")
    first_item = info.find("li") if info is not None else None
    if first_item is None:
        return ""
    return "-".join(re.findall(r'\d+', first_item.text))


# Walk every team, then every season link of that team, and append each
# player that has not been recorded yet to the player CSV.
for team in teams:

    years = team.find_all("a")

    for year in years:

        current_ct_code, current_year = get_ct_code_and_year(year['href'])
        # The progress set is loaded from CSV, so its members are string tuples.
        team_year_key = (str(current_ct_code), str(current_year))

        # skip over the fully processed year with the given team.
        if team_year_key in added_team_and_year:
            print(f"already done, skip over code:{current_ct_code}, year: {current_year}")
            continue

        team_url = team_url_to_back_number_url(year['href'])
        team_data = session.fetch(team_url).content.decode("utf-8")
        team_html = BSoup(team_data, "lxml")
        # The first two "item away" matches are dropped; presumably they are
        # not player entries — TODO confirm against the page markup.
        players = team_html.find_all(class_="item away")[2:]

        # NOTE(review): no explicit encoding is passed here, so this relies on
        # the platform default matching the one used when the file is read back.
        with open("../data/st_player.csv", 'a', newline='') as players_info:

            writer = csv.writer(players_info)

            for player in players:
                player_link = player.find("a")
                # An entry without a usable link cannot be followed; skip it
                # rather than crash on the missing anchor/href.
                if player_link is None or not player_link.get('href'):
                    continue
                player_url = front_url + player_link['href']

                # NOTE(review): the page is fetched before the duplicate check,
                # so already-recorded players are downloaded again for every
                # season they appear in; consider keying the check on the URL.
                player_data = session.fetch(player_url).content.decode("utf-8")
                player_html = BSoup(player_data, "lxml")

                # segment that contains Korean and English name of the player.
                player_name = player_html.find(class_="name")
                # sometimes there is a blank page that doesn't contain any info about the player.
                if not player_name:
                    continue
                name_kor, name_eng = _split_player_name(player_name.text)

                # segment that contains the date of birth of the player.
                dob = _extract_date_of_birth(player_html)

                player_row = (name_kor, name_eng, dob, player_url)

                if player_row not in added_player:
                    added_player.add(player_row)
                    writer.writerow(player_row)
                    print("newly added: ", player_row)
                else:
                    print("dup skipped: ", player_row)

            print(f"finished code:{current_ct_code}, year: {current_year}")

        # Record the team/season only after all of its players were handled,
        # so an interrupted run repeats this season on the next execution.
        with open('../data/st_finished_team_year.csv', 'a', newline='') as done_team_year:
            writer2 = csv.writer(done_team_year)
            writer2.writerow((current_ct_code, current_year))
            added_team_and_year.add(team_year_key)

already done, skip over code:2, year: 1982
already done, skip over code:2, year: 1983
already done, skip over code:2, year: 1984
already done, skip over code:2, year: 1985
already done, skip over code:2, year: 1986
already done, skip over code:2, year: 1987
already done, skip over code:2, year: 1988
already done, skip over code:2, year: 1989
already done, skip over code:2, year: 1990
already done, skip over code:2, year: 1991
already done, skip over code:2, year: 1992
already done, skip over code:2, year: 1993
already done, skip over code:2, year: 1994
already done, skip over code:2, year: 1995
already done, skip over code:2, year: 1996
already done, skip over code:2, year: 1997
already done, skip over code:2, year: 1998
already done, skip over code:2, year: 1999
already done, skip over code:2, year: 2000
already done, skip over code:2, year: 2001
already done, skip over code:2, year: 2002
already done, skip over code:2, year: 2003
already done, skip over code:2, year: 2004
already don

KeyboardInterrupt: 