In [2]:
import sqlalchemy
import json

# Read the connection settings from a local JSON file so that no credential
# is hard-coded in the notebook.
with open("./db_config.json", "r") as f:
    config = json.load(f)

USER = config["DB_USER"]
PW = config["DB_PASSWORD"]
HOST = config["DB_HOST"]
PORT = config["DB_PORT"]
DB = config["DB_NAME"]

# Connect to the database.
# The URL is assembled with URL.create instead of string interpolation so that
# a user name or password containing URL-reserved characters ('@', '/', ':',
# '%', ...) is escaped correctly instead of corrupting the connection string.
db_url = sqlalchemy.engine.URL.create(
    drivername="mysql+pymysql",
    username=USER,
    password=PW,
    host=HOST,
    port=int(PORT),  # accepts the port stored either as a number or as a numeric string
    database=DB,
)
engine = sqlalchemy.create_engine(db_url)

# Smoke-test the connection by asking the server for its version.
with engine.connect() as conn:
    result = conn.execute(sqlalchemy.text("SELECT VERSION();"))
    print("Connect Success. Version is", result.fetchone())


Connect Success. Version is ('8.4.4',)


In [3]:
# Load every row of the stadium table into a DataFrame.
import pandas as pd

strQuery = "select * from stadium"
stadium_df = pd.read_sql(sql=strQuery, con=engine)

# Show column dtypes and non-null counts.
stadium_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   stadium_code  60 non-null     object
 1   sports_type   60 non-null     object
 2   stadium_name  60 non-null     object
 3   region        60 non-null     object
 4   address       60 non-null     object
dtypes: object(5)
memory usage: 2.5+ KB


In [4]:
# Load every row of the sports_game table into a DataFrame.
strQuery = "select * from sports_game"
games_df = pd.read_sql(sql=strQuery, con=engine)

# Show column dtypes and non-null counts.
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3631 entries, 0 to 3630
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   game_id        3631 non-null   int64          
 1   stadium_code   3631 non-null   object         
 2   sports_type    3631 non-null   object         
 3   game_date      3631 non-null   object         
 4   day_of_week    3631 non-null   object         
 5   is_holiday     0 non-null      object         
 6   start_time     3631 non-null   timedelta64[ns]
 7   end_time       3631 non-null   timedelta64[ns]
 8   home_team_win  3631 non-null   int64          
 9   match_type     3610 non-null   object         
 10  audience       0 non-null      object         
dtypes: int64(2), object(7), timedelta64[ns](2)
memory usage: 312.2+ KB


In [5]:
# Load every row of the weather table into a DataFrame.
strQuery = "select * from weather"
weather_df = pd.read_sql(sql=strQuery, con=engine)

# Preview the first rows.
weather_df.head()

Unnamed: 0,weather_id,stadium_code,region,weather_date,weather_time,temperature,precipitation,wind_speed,wind_dir_deg,humidity,snow_depth,cloud_amount,low_cloud_amt,visibility
0,1,CC01,춘천,2023-01-01,0 days 00:00:00,-2.5,,0.6,340.0,88.0,1.4,9.0,,630.0
1,2,CC01,춘천,2023-01-01,0 days 01:00:00,-2.8,,0.5,360.0,89.0,1.4,0.0,,561.0
2,3,CC01,춘천,2023-01-01,0 days 02:00:00,-3.0,,0.4,0.0,89.0,1.5,0.0,,598.0
3,4,CC01,춘천,2023-01-01,0 days 03:00:00,-3.2,,0.2,0.0,90.0,1.5,0.0,,513.0
4,5,CC01,춘천,2023-01-01,0 days 04:00:00,-4.7,,0.5,360.0,93.0,1.5,0.0,,268.0


In [6]:
# Show column dtypes and non-null counts of the weather table.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450811 entries, 0 to 450810
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype          
---  ------         --------------   -----          
 0   weather_id     450811 non-null  int64          
 1   stadium_code   450811 non-null  object         
 2   region         450811 non-null  object         
 3   weather_date   450811 non-null  object         
 4   weather_time   450811 non-null  timedelta64[ns]
 5   temperature    450532 non-null  float64        
 6   precipitation  151195 non-null  float64        
 7   wind_speed     446194 non-null  float64        
 8   wind_dir_deg   447979 non-null  float64        
 9   humidity       417416 non-null  float64        
 10  snow_depth     8036 non-null    float64        
 11  cloud_amount   333289 non-null  float64        
 12  low_cloud_amt  315877 non-null  float64        
 13  visibility     330832 non-null  float64        
dtypes: float64(9), int64(1), object(3), 

In [7]:
# Load every row of the traffic_accident table into a DataFrame.
strQuery = "select * from traffic_accident"
traffic_accident_df = pd.read_sql(sql=strQuery, con=engine)

# Print the row count, then preview the first rows.
print(traffic_accident_df.shape[0])
traffic_accident_df.head()


34032


Unnamed: 0,accident_id,region,stadium_code,accident_date,accident_count,death_count,injury_count
0,1,강원 강릉시,GN01,2023-01-01,5,0,10
1,2,강원 강릉시,GN01,2023-01-02,2,0,2
2,3,강원 강릉시,GN01,2023-01-03,3,0,4
3,4,강원 강릉시,GN01,2023-01-04,3,0,4
4,5,강원 강릉시,GN01,2023-01-05,1,0,1


<h3>1차 데이터셋 구성</h3>



In [8]:
def round_by_half_hour(td):
    """Snap a time-of-day Timedelta to a whole hour (XX:00:00).

    The hour is kept when the minutes component is below 30 and bumped by one
    otherwise. Only the hours and minutes components of ``td`` are read; the
    days and seconds components do not influence the result.

    Note that an input at 23:30 or later is formatted as hour 24.

    Args:
        td: a value exposing ``.components.hours`` and ``.components.minutes``
            (e.g. a ``pandas.Timedelta``).

    Returns:
        The result of ``pd.to_timedelta`` on the ``"<hour>:00:00"`` string.
    """
    parts = td.components
    hour = parts.hours
    # From the half hour onwards the time belongs to the next full hour.
    if parts.minutes >= 30:
        hour = hour + 1
    return pd.to_timedelta(f"{hour}:00:00")

In [9]:
# Build the first-pass dataset: games enriched with the stadium's region,
# the accident figures of that region on the game date, and the weather
# observed at the (hour-rounded) start time.

# Stadium -> region mapping: attach the region of each game's stadium.
ds1_df = games_df.merge(stadium_df[['stadium_code', 'region']], on='stadium_code', how='left')

# Attach accident, injury and death counts for the same region and date.
accident_cols = ['region', 'accident_date', 'accident_count', 'injury_count', 'death_count']
ds1_df = ds1_df.merge(
    traffic_accident_df[accident_cols],
    left_on=['region', 'game_date'],
    right_on=['region', 'accident_date'],
    how='left',
)

# Drop columns that are not needed in the dataset.
ds1_df = ds1_df.drop(columns=['accident_date', 'end_time'])

# Compute the comparison time used to look up the weather: the start time
# snapped to a whole hour.
# NOTE(review): a start time of 23:30 or later is formatted as hour 24 by
# round_by_half_hour and would then not line up with a same-day weather_time
# — confirm no such games exist.
ds1_df['start_time_cmp'] = ds1_df['start_time'].apply(round_by_half_hour)

# Attach the weather.
# The merge keys are date and time only, so every game is first paired with the
# weather rows of ALL regions for that hour (close to a cross join); the region
# is narrowed down in the next step.
weather_cols = ['weather_date', 'weather_time', 'region', 'temperature', 'precipitation', 'snow_depth']
ds1_df = ds1_df.merge(
    weather_df[weather_cols],
    left_on=['game_date', 'start_time_cmp'],
    right_on=['weather_date', 'weather_time'],
    how='left',
    suffixes=('', '_weather'),
)

# Keep only the rows whose weather region name (string) is contained in the
# game's region (string).
# The mask is built by zipping the two columns instead of a row-wise
# DataFrame.apply(axis=1): it avoids constructing a Series per row on the
# fanned-out frame, stays a proper boolean mask when the frame is empty, and
# treats a missing region on either side (e.g. a game with no weather row for
# that hour) as "no match" instead of raising TypeError.
# NOTE(review): substring matching can pair one game with more than one weather
# region if one region name is contained in another — verify against the data.
region_match = pd.Series(
    [
        isinstance(weather_region, str)
        and isinstance(game_region, str)
        and weather_region in game_region
        for weather_region, game_region in zip(ds1_df['region_weather'], ds1_df['region'])
    ],
    index=ds1_df.index,
    dtype=bool,
)
ds1_df = ds1_df[region_match]

# Drop the helper columns that were only needed for the weather lookup.
ds1_df = ds1_df.drop(columns=['weather_date', 'weather_time', 'region_weather', 'start_time_cmp'])

# Replace every remaining missing value, in all columns, with 0.
# NOTE(review): the saved output ends with a warning fragment pointing at this
# line — presumably pandas' object-dtype downcasting FutureWarning; confirm and
# consider filling per column.
ds1_df = ds1_df.fillna(0)

# Print the first few rows.
print(ds1_df.head())

# Save the first-pass dataset as CSV.
ds1_df.to_csv("1st-dataset.csv", index=False, encoding='utf-8')

     game_id stadium_code sports_type   game_date day_of_week  is_holiday  \
7          1         DJ03          야구  2023-03-13           월           0   
28         2         SO04          야구  2023-03-13           월           0   
63         3         CW01          야구  2023-03-13           월           0   
91         4         BS02          야구  2023-03-13           월           0   
112        5         DG03          야구  2023-03-13           월           0   

         start_time  home_team_win match_type  audience  region  \
7   0 days 14:00:00              1       시범경기         0   대전 중구   
28  0 days 14:00:00              0       시범경기         0  서울 구로구   
63  0 days 14:00:00              0       시범경기         0  경남 창원시   
91  0 days 14:00:00              0       시범경기         0  부산 동래구   
112 0 days 14:00:00              1       시범경기         0  대구 수성구   

     accident_count  injury_count  death_count  temperature  precipitation  \
7               3.0           4.0          0.0          

  ds1_df = ds1_df.fillna(0)


In [84]:
# Show column dtypes and non-null counts of the assembled dataset.
# NOTE(review): this cell's execution count (84) is far ahead of the cell that
# builds ds1_df (9), so the saved output may come from a different revision of
# that cell — re-run with Restart & Run All to confirm.
ds1_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3577 entries, 7 to 93416
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype          
---  ------          --------------  -----          
 0   game_id         3577 non-null   int64          
 1   stadium_code    3577 non-null   object         
 2   sports_type     3577 non-null   object         
 3   game_date       3577 non-null   object         
 4   day_of_week     3577 non-null   object         
 5   is_holiday      0 non-null      object         
 6   start_time      3577 non-null   timedelta64[ns]
 7   home_team_win   3577 non-null   int64          
 8   match_type      3556 non-null   object         
 9   audience        0 non-null      object         
 10  region          3577 non-null   object         
 11  accident_count  2885 non-null   float64        
 12  temperature     3577 non-null   float64        
 13  precipitation   690 non-null    float64        
 14  snow_depth      35 non-null     float64     