---
- 작성자 : 이휘
- 작성일 : 2024-06-04
---

In [2]:
# 데이터 프레임과 계산을 위한 라이브러리
import pandas as pd
import numpy as np
# 차트
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 차트 경고 메세지 무시
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [4]:
# 한글 폰트 문제 해결 
# matplotlib은 한글 폰트를 지원하지 않음
# os정보
import matplotlib.pyplot as plt
import platform

# font_manager : 폰트 관리 모듈
# rc : 폰트 변경 모듈
from matplotlib import font_manager, rc
# unicode 설정
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic') # os가 macos
elif platform.system() == 'Windows':
    path = 'c:/Windows/Fonts/malgun.ttf' # os가 windows
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print("Unknown System")

In [5]:
# 날짜, 요일, 홈팀, 원정팀, 장소, 관중 수 데이터 불러오기
data_01 = pd.read_csv("./Data/kiwoom_heroes_info(2022~2024).csv")
data_01.head()

Unnamed: 0,날짜,요일,홈팀,원정팀,장소,관중 수
0,2022-04-02,토,키움,롯데,고척,8257
1,2022-04-03,일,키움,롯데,고척,6115
2,2022-04-05,화,키움,LG,고척,2298
3,2022-04-06,수,키움,LG,고척,2304
4,2022-04-07,목,키움,LG,고척,2055


In [7]:
# 속성 확인
data_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   날짜      173 non-null    object
 1   요일      173 non-null    object
 2   홈팀      173 non-null    object
 3   원정팀     173 non-null    object
 4   장소      173 non-null    object
 5   관중 수    173 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 8.2+ KB


In [8]:
# 날짜 데이터를 datetime으로 type으로 변환할 수 있게 정규식 사용하여 데이터 변환
data_01['날짜'] = data_01['날짜'].str.replace(r'\([가-힣]\)', '', regex=True)
# 날짜 컬럼을 datetime으로 type 변환
data_01['날짜'] = pd.to_datetime(data_01['날짜'])
data_01.head()

Unnamed: 0,날짜,요일,홈팀,원정팀,장소,관중 수
0,2022-04-02,토,키움,롯데,고척,8257
1,2022-04-03,일,키움,롯데,고척,6115
2,2022-04-05,화,키움,LG,고척,2298
3,2022-04-06,수,키움,LG,고척,2304
4,2022-04-07,목,키움,LG,고척,2055


In [9]:
# 속성 확인
data_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      173 non-null    datetime64[ns]
 1   요일      173 non-null    object        
 2   홈팀      173 non-null    object        
 3   원정팀     173 non-null    object        
 4   장소      173 non-null    object        
 5   관중 수    173 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 8.2+ KB


In [10]:
# 날짜, 요일, 홈팀, 원정팀, 장소, 관중 수 데이터 불러오기
data_02 = pd.read_csv("./Data/test.csv")
data_02.head()

Unnamed: 0,관중 수,요일,원정팀,시작시간
0,8257,토,롯데,14:00
1,6115,일,롯데,14:00
2,2298,화,LG,18:30
3,2304,수,LG,18:30
4,2055,목,LG,18:30


In [11]:
# 필요한 데이터 병합하기
kiwoom = pd.concat([data_01['날짜'],data_01['요일'],data_01['원정팀'],data_01['관중 수'],data_02['시작시간'],],axis=1)
kiwoom

Unnamed: 0,날짜,요일,원정팀,관중 수,시작시간
0,2022-04-02,토,롯데,8257,14:00
1,2022-04-03,일,롯데,6115,14:00
2,2022-04-05,화,LG,2298,18:30
3,2022-04-06,수,LG,2304,18:30
4,2022-04-07,목,LG,2055,18:30
...,...,...,...,...,...
168,2024-05-22,수,NC,4337,18:30
169,2024-05-23,목,NC,3897,18:30
170,2024-05-31,금,SSG,5680,18:30
171,2024-06-01,토,SSG,10462,17:00


In [12]:
# 속성 확인
kiwoom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      173 non-null    datetime64[ns]
 1   요일      173 non-null    object        
 2   원정팀     173 non-null    object        
 3   관중 수    173 non-null    int64         
 4   시작시간    173 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 6.9+ KB


In [13]:
# 연도, 월 컬럼으로 분리하기
kiwoom['연도'] = kiwoom['날짜'].dt.year
kiwoom['월'] = kiwoom['날짜'].dt.month
kiwoom

Unnamed: 0,날짜,요일,원정팀,관중 수,시작시간,연도,월
0,2022-04-02,토,롯데,8257,14:00,2022,4
1,2022-04-03,일,롯데,6115,14:00,2022,4
2,2022-04-05,화,LG,2298,18:30,2022,4
3,2022-04-06,수,LG,2304,18:30,2022,4
4,2022-04-07,목,LG,2055,18:30,2022,4
...,...,...,...,...,...,...,...
168,2024-05-22,수,NC,4337,18:30,2024,5
169,2024-05-23,목,NC,3897,18:30,2024,5
170,2024-05-31,금,SSG,5680,18:30,2024,5
171,2024-06-01,토,SSG,10462,17:00,2024,6


In [14]:
# '요일' 열을 One-Hot Encoding으로 변환.
day_of_week_encoding = pd.get_dummies(kiwoom['요일'], drop_first=True)
kiwoom = kiwoom.join(day_of_week_encoding)
# '원정팀' 열을 One-Hot Encoding으로 변환.
away_encoding = pd.get_dummies(kiwoom['원정팀'], drop_first=True)
kiwoom = kiwoom.join(away_encoding)
# '시작시간' 열을 One-Hot Encoding으로 변환.
start_time_encoding = pd.get_dummies(kiwoom['시작시간'], drop_first=True)
kiwoom = kiwoom.join(start_time_encoding)
# '연도' 열을 One-Hot Encoding으로 변환.
year_encoding = pd.get_dummies(kiwoom['연도'], drop_first=True)
kiwoom = kiwoom.join(year_encoding)
# '연도' 열을 One-Hot Encoding으로 변환.
month_encoding = pd.get_dummies(kiwoom['월'], drop_first=True)
kiwoom = kiwoom.join(month_encoding)

In [15]:
# One-Hot Encoding 부분을 boolean에서 int로 변환
kiwoom.iloc[:,7:] = kiwoom.iloc[:,7:].astype('int')
kiwoom

Unnamed: 0,날짜,요일,원정팀,관중 수,시작시간,연도,월,목,수,일,...,18:30,2023,2024,4,5,6,7,8,9,10
0,2022-04-02,토,롯데,8257,14:00,2022,4,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2022-04-03,일,롯데,6115,14:00,2022,4,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,2022-04-05,화,LG,2298,18:30,2022,4,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,2022-04-06,수,LG,2304,18:30,2022,4,0,1,0,...,1,0,0,1,0,0,0,0,0,0
4,2022-04-07,목,LG,2055,18:30,2022,4,1,0,0,...,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,2024-05-22,수,NC,4337,18:30,2024,5,0,1,0,...,1,0,1,0,1,0,0,0,0,0
169,2024-05-23,목,NC,3897,18:30,2024,5,1,0,0,...,1,0,1,0,1,0,0,0,0,0
170,2024-05-31,금,SSG,5680,18:30,2024,5,0,0,0,...,1,0,1,0,1,0,0,0,0,0
171,2024-06-01,토,SSG,10462,17:00,2024,6,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [16]:
# Target인 관중 수를 제외한 나머지 원본 Feature 부분 제외하고 데이터 불러오기
kiwoom_new = pd.concat([kiwoom['관중 수'],kiwoom.iloc[:,7:]],axis=1)
kiwoom_new

Unnamed: 0,관중 수,목,수,일,토,화,KT,LG,NC,SSG,...,18:30,2023,2024,4,5,6,7,8,9,10
0,8257,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,6115,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2298,0,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
3,2304,0,1,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
4,2055,1,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,4337,0,1,0,0,0,0,0,1,0,...,1,0,1,0,1,0,0,0,0,0
169,3897,1,0,0,0,0,0,0,1,0,...,1,0,1,0,1,0,0,0,0,0
170,5680,0,0,0,0,0,0,0,0,1,...,1,0,1,0,1,0,0,0,0,0
171,10462,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0


In [18]:
# column 이름 문자열로 바꾸기
kiwoom_new.columns = kiwoom_new.columns.astype(str)
kiwoom_new

Unnamed: 0,관중 수,목,수,일,토,화,KT,LG,NC,SSG,...,18:30,2023,2024,4,5,6,7,8,9,10
0,8257,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,6115,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2298,0,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
3,2304,0,1,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
4,2055,1,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,4337,0,1,0,0,0,0,0,1,0,...,1,0,1,0,1,0,0,0,0,0
169,3897,1,0,0,0,0,0,0,1,0,...,1,0,1,0,1,0,0,0,0,0
170,5680,0,0,0,0,0,0,0,0,1,...,1,0,1,0,1,0,0,0,0,0
171,10462,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0


In [17]:
# 각 column별 상관계수 확인
kiwoom_new.corr()

Unnamed: 0,관중 수,목,수,일,토,화,KT,LG,NC,SSG,...,18:30,2023,2024,4,5,6,7,8,9,10
관중 수,1.0,-0.232173,-0.197375,0.193661,0.364412,-0.155649,-0.344441,0.018838,-0.286092,0.12998,...,-0.524264,0.252373,0.313573,0.038029,-0.014576,-0.056517,-0.033572,0.012552,-0.055364,0.155518
목,-0.232173,1.0,-0.192985,-0.201389,-0.213788,-0.192985,0.031331,0.049804,0.049804,-0.072035,...,0.325014,0.03455,-0.042054,-0.00146,-0.060332,0.071087,-0.038992,0.022734,0.052165,-0.048533
수,-0.197375,-0.192985,1.0,-0.192985,-0.204866,-0.184932,0.043765,0.009951,0.009951,-0.111072,...,0.309197,-0.00262,-0.028694,0.016534,-0.046631,0.086564,-0.027659,0.13278,-0.121482,-0.046507
일,0.193661,-0.201389,-0.192985,1.0,-0.213788,-0.192985,-0.017064,-0.051562,-0.000879,0.117502,...,-0.56446,-0.028367,0.039692,-0.00146,0.057827,-0.058822,0.052165,-0.072035,0.006586,-0.048533
토,0.364412,-0.213788,-0.204866,-0.213788,1.0,-0.204866,-0.032564,-0.016065,-0.016065,0.096442,...,-0.599213,-0.004024,0.017729,0.008028,-0.003942,-0.075373,0.032696,-0.085902,0.076544,0.087746
화,-0.155649,-0.192985,-0.184932,-0.192985,-0.204866,1.0,0.043765,0.009951,0.009951,-0.111072,...,0.309197,0.029758,-0.028694,0.016534,-0.006092,0.041994,-0.027659,0.084009,-0.121482,0.102489
KT,-0.344441,0.031331,0.043765,-0.017064,-0.032564,0.043765,1.0,-0.123208,-0.123208,-0.134387,...,0.064782,0.029103,-0.022356,0.13258,-0.129523,-0.000292,0.018159,-0.134387,0.124648,-0.039101
LG,0.018838,0.049804,0.009951,-0.051562,-0.016065,0.009951,-0.123208,1.0,-0.116129,-0.126665,...,-0.001123,-0.014905,-0.00607,0.071976,-0.165449,0.174553,-0.133441,-0.126665,-0.021918,0.140251
NC,-0.286092,0.049804,0.009951,-0.000879,-0.016065,0.009951,-0.123208,-0.116129,1.0,-0.126665,...,0.037741,-0.014905,-0.00607,-0.06049,0.123669,-0.143317,0.201128,-0.126665,0.033844,-0.036854
SSG,0.12998,-0.072035,-0.111072,0.117502,0.096442,-0.111072,-0.134387,-0.126665,-0.126665,1.0,...,-0.213179,-0.058237,0.110255,-0.210464,0.270039,-0.00773,-0.041284,0.132832,-0.093416,-0.040198


In [31]:
# Randomforest 회귀모델로 정확도율 구하기
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 데이터 준비 (x는 피쳐, y는 타겟)
x = kiwoom_new.drop('관중 수', axis=1)
y = kiwoom_new['관중 수']

# 학습 데이터와 테스트 데이터로 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# 랜덤 포레스트 회귀모델 학습
rf_regressor = RandomForestRegressor()
rf_regressor.fit(x_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = rf_regressor.predict(x_test)

# 오차 계산 (평균 제곱 오차)
mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)

# MAPE 계산 함수 정의
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    print(y_pred)
    print(y_true)
    # return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return (1 - np.mean(np.abs((y_true - y_pred) / y_true))) * 100


# MAPE 계산
mape = mean_absolute_percentage_error(y_test, y_pred)
print("정확도율:", mape.round(0), "%")

[ 5285.9         4058.15        5922.          6981.46
  7540.41        2896.65666667  2757.95       13132.17
  7821.24        7337.71       10273.          4091.26
  8547.82        6292.13383333 11921.          5711.54
 11029.89        6926.28        9902.35        9742.08
  6671.79        5557.67       12091.92        3703.58
 10777.385      10737.86        4070.45        2108.73
 10185.04        8487.55        4007.11166667  4713.1
 12936.79        1399.17        6913.60166667]
[ 4194  5009  4510  6455 10074  1342  1414 12333  8817  7439  9886  2536
  6115  8175 11822  4337 11717  8528  7715  9512  2877  3104 11851  7165
 12133 11235  3156  4289 11324  2901  7941  4474  9962   893  3897]
정확도율: 62.0 %
