In [205]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from collections import OrderedDict

In [269]:
def _extract_row_values(cells):
    """Flatten the <td> cells of one table row into a flat list of values.

    The first cell is dropped (the original author's note labels it the Rank
    column), and the cell at index 2 of the remaining cells (the fourth cell
    of the row) is skipped. A cell whose HTML contains the substring 'teams'
    contributes two values: the logo <img> ``src`` and the stripped text of
    the last <span> in the cell (used as the position). Every other cell
    contributes its stripped text.
    """
    values = []
    for index, cell in enumerate(cells[1:]):
        if index == 2:
            continue
        if 'teams' not in cell.decode():
            values.append(cell.text.strip())
            continue

        logo = cell.find('img')
        if logo is None:
            # Keep the row width stable: a team cell always yields two values.
            values.extend(['', ''])
            continue

        spans = cell.find_all('span')
        values.append(logo.get('src'))
        values.append(spans[-1].text.strip() if spans else '')
    return values


def get_data(url):
    """Download ``url`` and return the data rows of the first table on the page.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list with one list of values per data row (see
        ``_extract_row_values`` for the layout), or ``None`` when the page
        has no table or the first table has no <tbody>. Each parsed row is
        also printed.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the caller forever, and
    # the status check stops an error page from being parsed as stats data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find("table")
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        print("웹 페이지에서 테이블 본문을 찾을 수 없습니다.")
        return None

    # The first two <tr> rows are the header rows; everything after is data.
    data_only = []
    for row in tbody.find_all('tr')[2:]:
        cells = row.find_all('td')
        if cells:  # rows without any <td> carry no data
            data_only.append(_extract_row_values(cells))

    for row in data_only:
        print(row)
    return data_only

In [255]:
def get_header(url):
    """Download ``url`` and build the column names from the table header rows.

    The first two <tr> rows inside the first table's <tbody> are treated as
    the header. Every <th> in them yields a label:

    * When the <th> (or a ``div.th_tit`` inside it) has a ``tooltip``
      attribute, the label is the cell text with '▼' removed; 'Sort' and
      '비율' are dropped.
    * Otherwise the label is the raw stripped text; 'Rank', 'Name', 'Team',
      'Sort▼' and '비율' are dropped.

    Duplicate labels are removed while keeping first-seen order, and 'WAR',
    when present, is moved to the end.

    Args:
        url: Address of the page to scrape.

    Returns:
        ``['Name', 'Team', 'Position']`` followed by the stat labels, with
        'WAR' last if it was found.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the page has no table or the table has no <tbody>.
    """
    # A timeout keeps a stalled server from hanging the caller forever, and
    # the status check stops an error page from being parsed as a header.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find("table")
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        raise ValueError("웹 페이지에서 테이블 본문을 찾을 수 없습니다.")

    header_rows = tbody.find_all('tr')[:2]
    print([str(row) for row in header_rows])

    skipped_with_tooltip = ('Sort', '비율')
    skipped_plain = ('Rank', 'Name', 'Team', 'Sort▼', '비율')

    stat_abbreviations_new = []
    war = None  # set to 'WAR' once seen, so it can be appended last
    for row in header_rows:
        for th in row.find_all('th'):
            text = th.text.strip()
            title_div = th.find('div', class_='th_tit')
            has_tooltip = 'tooltip' in th.attrs or (
                title_div is not None and 'tooltip' in title_div.attrs
            )

            if has_tooltip:
                # Tooltip cells may carry a sort marker; strip it.
                label = text.replace('▼', '').strip()
                skipped = skipped_with_tooltip
            else:
                label = text
                skipped = skipped_plain

            if not label or label in skipped:
                continue
            if label == 'WAR':
                war = label
            else:
                stat_abbreviations_new.append(label)

    # Drop duplicates while keeping first-seen order.
    stat_abbreviations_new = list(OrderedDict.fromkeys(stat_abbreviations_new))

    header = ['Name', 'Team', 'Position'] + stat_abbreviations_new
    if war:
        header.append(war)
    return header

In [263]:
def get_csv_data(filename, url):
    """Scrape the stats table at ``url`` and save it as ``<filename>.csv``.

    The rows come from ``get_data`` and the column names from ``get_header``.
    Values in the 'Team' column that match a known logo path are replaced by
    the team name; any other value is left as it is. The file is written
    with the 'utf-8-sig' encoding and without the index column.

    Args:
        filename: Output file name without the ``.csv`` extension.
        url: Address of the page to scrape.
    """
    # Logo image path -> team name.
    logo_to_team = {
        "/data/team/ci/2025/5002.svg": "LG",
        "/data/team/ci/2025/1001.svg": "삼성",
        "/data/team/ci/2025/9002.svg": "SSG",
        "/data/team/ci/2025/2002.svg": "기아",
        "/data/team/ci/2025/10001.svg": "키움",
        "/data/team/ci/2025/6002.svg": "두산",
        "/data/team/ci/2025/3001.svg": "롯데",
        "/data/team/ci/2025/11001.svg": "엔씨",
        "/data/team/ci/2025/12001.svg": "KT",
        "/data/team/ci/2025/7002.svg": "한화",
    }

    rows = get_data(url)
    columns = get_header(url)
    frame = pd.DataFrame(rows, columns=columns)

    # Unmatched logo paths map to NaN; fall back to the original cell value.
    team_names = frame['Team'].map(logo_to_team)
    frame['Team'] = team_names.fillna(frame['Team'])

    output_path = f'{filename}.csv'
    frame.to_csv(output_path, encoding='utf-8-sig', index=False)

    print(f"Team 컬럼의 내용을 매칭되는 팀명으로 변경하여 {filename}.csv 파일로 저장했습니다.")

In [272]:
# Stats pages for the 2025 season; the two URLs differ in the m2 (batting vs
# pitching) and so/ob (sort) query parameters.
hitter_url = 'https://statiz.sporki.com/stats/?m=main&m2=batting&m3=default&so=&ob=&year=2025&sy=&ey=&te=&po=&lt=10100&reg=A&pe=&ds=&de=&we=&hr=&ha=&ct=&st=&vp=&bo=&pt=&pp=&ii=&vc=&um=&oo=&rr=&sc=&bc=&ba=&li=&as=&ae=&pl=&gc=&lr=&pr=300&ph=&hs=&us=&na=&ls=1&sf1=G&sk1=&sv1=&sf2=G&sk2=&sv2='
pitcher_url = 'https://statiz.sporki.com/stats/?m=main&m2=pitching&m3=default&so=G&ob=DESC&year=2025&sy=&ey=&te=&po=&lt=10100&reg=A&pe=&ds=&de=&we=&hr=&ha=&ct=&st=&vp=&bo=&pt=&pp=&ii=&vc=&um=&oo=&rr=&sc=&bc=&ba=&li=&as=&ae=&pl=&gc=&lr=&pr=300&ph=&hs=&us=&na=&ls=1&sf1=G&sk1=&sv1=&sf2=G&sk2=&sv2='

# Export batting first, then pitching, each to its own CSV file.
for csv_name, stats_url in (
    ("kbo_batting_stats_2025", hitter_url),
    ("kbo_pitching_stats_2025", pitcher_url),
):
    get_csv_data(csv_name, stats_url)

['문보경', '/data/team/ci/2025/5002.svg', '3B', '20', '1.36', '0.26', '89', '88', '75', '18', '28', '3', '0', '5', '46', '20', '0', '0', '12', '0', '2', '15', '2', '0', '2', '0.373', '0.449', '0.613', '1.062', '0.231', '215.6', '1.62']
['박동원', '/data/team/ci/2025/5002.svg', 'C', '20', '1.60', '-0.01', '73', '73', '60', '15', '21', '3', '0', '5', '39', '16', '1', '0', '12', '1', '0', '14', '1', '0', '0', '0.350', '0.466', '0.650', '1.116', '0.260', '242.9', '1.59']
['이재현', '/data/team/ci/2025/1001.svg', 'SS', '21', '1.11', '0.14', '92', '90', '71', '19', '18', '5', '0', '3', '32', '12', '1', '0', '17', '3', '1', '21', '0', '1', '0', '0.254', '0.418', '0.451', '0.869', '0.187', '157.0', '1.24']
['박성한', '/data/team/ci/2025/9002.svg', 'SS', '17', '0.85', '0.32', '71', '70', '57', '8', '15', '5', '0', '1', '23', '10', '1', '0', '12', '1', '1', '14', '0', '1', '0', '0.263', '0.400', '0.404', '0.804', '0.163', '137.4', '1.17']
['위즈덤', '/data/team/ci/2025/2002.svg', '1B', '20', '1.15', '-0.03', '