In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re

In [2]:
# 선수 ID 지정 (예: 73330은 특정 선수의 고유 번호)
player_id = "11310"
url = f"https://statiz.sporki.com/player/?m=year&p_no={player_id}"


In [3]:
def crawl_statiz_pitcher_data(url):
    """
    스탯티즈에서 선발투수 등판 기록을 크롤링하는 함수
    """
    try:
        # HTTP 요청 헤더 설정 (봇 차단 방지)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # 웹페이지 요청
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # BeautifulSoup 객체 생성
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 테이블 찾기 (일반적으로 통계 데이터는 table 태그 안에 있음)
        table = soup.find('table')
        if not table:
            print("테이블을 찾을 수 없습니다.")
            return None
        
        # 헤더 추출 (th 태그 또는 첫 번째 tr의 td 태그)
        headers = []
        header_row = table.find('tr')
        if header_row:
            header_cells = header_row.find_all(['th', 'td'])
            headers = [cell.get_text(strip=True) for cell in header_cells]
        
        # 데이터 행 추출
        data_rows = []
        rows = table.find_all('tr')[1:]  # 첫 번째 행(헤더) 제외
        
        for row in rows:
            cells = row.find_all('td')
            if cells:  # td 태그가 있는 행만 처리
                row_data = []
                for cell in cells:
                    # 셀 내용 추출 (줄바꿈 제거, 공백 정리)
                    text = cell.get_text(separator=' ', strip=True)
                    # 추가 줄바꿈 문자 제거
                    text = text.replace('\n', ' ').replace('\r', '').strip()
                    # 연속된 공백을 하나로 통일
                    text = ' '.join(text.split())
                    row_data.append(text)
                
                if row_data:  # 빈 행이 아닌 경우만 추가
                    data_rows.append(row_data)
        
        return headers, data_rows
        
    except requests.RequestException as e:
        print(f"웹페이지 요청 중 오류 발생: {e}")
        return None
    except Exception as e:
        print(f"데이터 추출 중 오류 발생: {e}")
        return None

In [16]:
picherdata=crawl_statiz_pitcher_data(url)

picherdata_DF=pd.DataFrame(picherdata[1])
picherdata_DF = picherdata_DF.set_axis(picherdata[0], axis=1)
picherdata_DF

# picherdata_list.append(picherdata)
# picherdata_DF = pd.DataFrame(picherdata_list)
# picherdata_DF

Unnamed: 0,Year,Team,Age,Pos.,G,GS,GR,GF,CG,SHO,...,SO,ROE,BK,WP,ERA,RA9,rRA9,FIP,WHIP,WAR
0,2015,KT,20,P,6,6,0,0,0,0,...,17,0,0,2.0,5.79,6.11,6.11,7.23,1.68,0.4
1,롯데,20,P,25,15,10,3,0,0,2,...,8,0,8,5.76,6.49,5.84,5.28,1.58,1.26,
2,2016,롯데,21,P,27,27,0,0,0,0,...,133,8,0,9.0,5.76,6.28,5.88,4.82,1.6,2.34
3,2017,롯데,22,P,28,28,0,0,0,0,...,117,5,0,7.0,3.68,3.89,3.73,5.13,1.32,6.42
4,2018,롯데,23,P,14,12,2,0,0,0,...,40,4,0,3.0,9.92,9.92,9.41,6.82,2.29,-0.31
5,2019,롯데,24,P,12,12,0,0,0,0,...,44,3,0,4.0,4.2,4.95,4.95,3.76,1.48,0.94
6,2020,롯데,25,P,28,28,0,0,0,0,...,108,8,0,13.0,4.7,5.19,5.15,4.94,1.52,2.91
7,2021,롯데,26,P,28,28,0,1,1,1,...,125,7,0,17.0,3.98,4.14,4.0,4.51,1.19,5.06
8,2022,롯데,27,P,28,28,0,0,0,0,...,146,12,0,8.0,3.89,4.58,4.43,2.84,1.34,3.31
9,2023,롯데,28,P,27,27,0,0,0,0,...,129,9,0,10.0,3.45,4.09,3.93,3.62,1.32,4.21


In [18]:
picherdata_DF.iloc[-2]

Year     2025
Team       롯데
Age        30
Pos.        P
G          14
GS         14
GR          0
GF          0
CG          0
SHO         0
W           8
L           5
S           0
HD          0
IP       83.0
ER         40
R          45
rRA     42.25
TBF       368
H          83
2B         22
3B          1
HR          6
BB         33
HP          6
IB          0
SO         87
ROE         4
BK          1
WP          8
ERA      4.34
RA9      4.88
rRA9     4.58
FIP      3.56
WHIP     1.40
WAR      1.85
Name: 11, dtype: object

In [None]:
# picherdata_list = [picherdata]
# picherdata_list.append(picherdata[0])
# picherdata_list.append(picherdata[1][0])

In [36]:
picherdata_DF = pd.DataFrame(picherdata_list)
picherdata_DF

Unnamed: 0,0,1
0,"[03월 - SS, 상대, 결과, GS, IP, R, ER, rRA, TBF, AB...","[[03-23, @ LG, L 2:10, 1, 5.0, 4, 4, 4.00, 22,..."
