# DB에서 데이터 가져오기

In [70]:
import sqlalchemy
import json
import urllib.parse
from datetime import timedelta
import pandas as pd

# Load the database connection settings from the local JSON config file.
with open("C:/ex/project/db_config.json", "r") as f:
    config = json.load(f)

# URL-escape the credentials so special characters cannot break the
# connection URL assembled below.
USER = urllib.parse.quote_plus(config["DB_USER"])
PW = urllib.parse.quote_plus(config["DB_PASSWORD"])
HOST = config["DB_HOST"]
PORT = config["DB_PORT"]
DB = config["DB_NAME"]

# Connect to the database.
engine = sqlalchemy.create_engine(f"mysql+pymysql://{USER}:{PW}@{HOST}:{PORT}/{DB}")

# One query per table and calendar year. Every result exposes a
# `region_code` column so the frames can later be joined on region.

# Games: the region comes from the stadium the game was played in.
games_2023_query = """
SELECT g.*, s.region AS region_code
FROM sports_game g
JOIN stadium s ON g.stadium_code = s.stadium_code
WHERE g.game_date BETWEEN '2023-01-01' AND '2023-12-31'
"""
games_2024_query = """
SELECT g.*, s.region AS region_code
FROM sports_game g
JOIN stadium s ON g.stadium_code = s.stadium_code
WHERE g.game_date BETWEEN '2024-01-01' AND '2024-12-31'
"""
# Weather: `region` is duplicated under the shared `region_code` name.
weather_2023_query = """
SELECT g.*, g.region AS region_code
FROM weather g
WHERE g.weather_date BETWEEN '2023-01-01' AND '2023-12-31'
"""
weather_2024_query = """
SELECT g.*, g.region AS region_code
FROM weather g
WHERE g.weather_date BETWEEN '2024-01-01' AND '2024-12-31'
"""
# Traffic accidents. The 2023 query previously filtered on the 2024 range,
# which made accident_2023 a duplicate of accident_2024.
accident_2023_query = """
SELECT g.*, g.region AS region_code
FROM traffic_accident g
WHERE g.accident_date BETWEEN '2023-01-01' AND '2023-12-31'
"""
accident_2024_query = """
SELECT g.*, g.region AS region_code
FROM traffic_accident g
WHERE g.accident_date BETWEEN '2024-01-01' AND '2024-12-31'
"""

# Run the queries; the listed columns are parsed as datetimes on load.
games_2023 = pd.read_sql(games_2023_query, engine, parse_dates=["start_time","end_time"])
games_2024 = pd.read_sql(games_2024_query, engine, parse_dates=["start_time","end_time"])
weather_2023 = pd.read_sql(weather_2023_query, engine, parse_dates=["weather_time","weather_date"])
weather_2024 = pd.read_sql(weather_2024_query, engine, parse_dates=["weather_time","weather_date"])
accident_2023 = pd.read_sql(accident_2023_query, engine, parse_dates=["accident_date"])
accident_2024 = pd.read_sql(accident_2024_query, engine, parse_dates=["accident_date"])

In [71]:
# Preview the first rows of the 2023 games frame to sanity-check the load.
games_2023.head()

Unnamed: 0,game_id,stadium_code,sports_type,game_date,day_of_week,is_holiday,start_time,end_time,home_team_win,match_type,audience,region_code
0,1,DJ03,야구,2023-03-13,월,,NaT,NaT,1,시범경기,,대전 중구
1,2,SO04,야구,2023-03-13,월,,NaT,NaT,0,시범경기,,서울 구로구
2,3,CW01,야구,2023-03-13,월,,NaT,NaT,0,시범경기,,경남 창원시
3,4,BS02,야구,2023-03-13,월,,NaT,NaT,0,시범경기,,부산 동래구
4,5,DG03,야구,2023-03-13,월,,NaT,NaT,1,시범경기,,대구 수성구


# 가져온 데이터에서 필요한 컬럼 추출, 새로운 데이터셋 구축

In [73]:
import pandas as pd

# 1. Combine the per-year frames into one frame per data source.
games_df = pd.concat([games_2023, games_2024], ignore_index=True)
weather_df = pd.concat([weather_2023, weather_2024], ignore_index=True)
accident_df = pd.concat([accident_2023, accident_2024], ignore_index=True)

# 2. Add the weekday name derived from the game date.
games_df['weekday'] = pd.to_datetime(games_df['game_date']).dt.day_name()

# 3. Add the holiday flag.
# NOTE(review): holiday_dates is empty, so is_holiday is False for every row
# and replaces whatever the is_holiday column held before. Fill in the
# 'YYYY-MM-DD' strings for the relevant holidays - TODO confirm the source.
holiday_dates = []
games_df['is_holiday'] = games_df['game_date'].astype(str).isin(holiday_dates)

# 4. Attach the most recent weather observation at or before each game's
#    start time, matched within the same region.
weather_df['weather_time'] = pd.to_datetime(weather_df['weather_time'])
games_df['start_time'] = pd.to_datetime(games_df['start_time'])

# merge_asof rejects nulls in its key columns, so drop those rows first.
# NOTE(review): games without a start_time are discarded here; if every
# start_time is NaT the final frame is empty - verify the loaded data.
games_df = games_df.dropna(subset=['start_time', 'region_code'])
weather_df = weather_df.dropna(subset=['weather_time', 'region_code'])

# merge_asof requires both sides to be sorted by their time key.
weather_df = weather_df.sort_values('weather_time')
games_df = games_df.sort_values('start_time')

merged_df = pd.merge_asof(
    games_df, weather_df,
    left_on='start_time', right_on='weather_time',
    by='region_code',
    direction='backward'
)

# 5. Attach accident data for the same region and calendar day.
# The join previously paired game_date with stadium_code, which can never
# match. Both sides are reduced to a normalized datetime key so that a date
# stored as a plain date object and one stored as a timestamp still compare
# equal. Only the columns needed downstream are taken from the accident
# frame, which avoids _x/_y suffixes on shared column names.
# NOTE(review): if traffic_accident holds several rows per region and day,
# this left join repeats the game row once per accident row - confirm.
accident_daily = accident_df[['region_code', 'accident_date', 'accident_count']].copy()
accident_daily['_join_date'] = pd.to_datetime(accident_daily['accident_date']).dt.normalize()
accident_daily = accident_daily.drop(columns='accident_date')

merged_df['_join_date'] = pd.to_datetime(merged_df['game_date']).dt.normalize()
merged_df = pd.merge(
    merged_df, accident_daily,
    on=['region_code', '_join_date'],
    how='left'
).drop(columns='_join_date')

# 6. Keep only the columns of the final dataset.
final_df = merged_df[[
    'game_id', 'game_date', 'stadium_code', 'region_code',
    'sports_type', 'match_type', 'home_team_win',
    'start_time', 'accident_count',
    'temperature', 'precipitation', 'humidity', 'wind_speed',
    'cloud_amount', 'snow_depth', 'visibility',
    'is_holiday', 'weekday'
]]

# Inspect the result.
print(final_df.head())
final_df.head()

Empty DataFrame
Columns: [game_id, game_date, stadium_code, region_code, sports_type, match_type, home_team_win, start_time, accident_count, temperature, precipitation, humidity, wind_speed, cloud_amount, snow_depth, visibility, is_holiday, weekday]
Index: []


Unnamed: 0,game_id,game_date,stadium_code,region_code,sports_type,match_type,home_team_win,start_time,accident_count,temperature,precipitation,humidity,wind_speed,cloud_amount,snow_depth,visibility,is_holiday,weekday
