# 0. 사전 세팅

In [None]:
# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Change the working directory to the project folder so that the relative
# "Data/..." paths used by later cells resolve from there.
%cd "/content/drive/MyDrive/데이터 분석/projects/ML_portfolio/10_kleague_final_pass_prediction"

In [None]:
# Install the Nanum font package, rebuild the system font cache and delete
# matplotlib's cache directory.
# NOTE(review): presumably done so Korean text renders in plots — confirm.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# 7. EDA 인사이트

---

▸ 타겟 구조

    에피소드별 마지막 이벤트는 항상 Pass이고, 그 Pass의 (end_x, end_y)가 타겟
    패스 성공/실패 비율 ≈ 56:44 정도로 아주 심하게 치우치지 않음
    거리 기준으로 보면 10~20m 구간의 패스 성공률이 가장 높고, 30m 이상부터 급격히 떨어짐

    < 인사이트 >
    - 순수 좌표 회귀(MAE/MSE) + 거리 관련 feature를 같이 쓰면 좋을 듯 !
    - multi-task로 distance_bin(0–10 / 10–20 / ...)이나 zone까지 같이 예측하게 하면 representation quality가 좋아질 수 있음

▸ 시퀀스/정렬 구조

    전체 에피소드 중 98% 이상은 시간 순서 정렬이 깔끔하고, 나머지 1.8% 정도만 역전이 있음
    대부분 -0.1 이내의 미세 오차, 진짜 심각한 역전은 적음

    < 인사이트 >
    - 시퀀스 모델은 써도 될 것 같고 !
    - 단, 전처리에서 time_seconds, action_id 기준으로 확실하게 정렬하고, 마스크 처리만 잘 해주면 됨
    - 몇 개 안 되는 진짜 이상치는 드롭하거나 별도 처리해도 전체 성능에 영향 거의 없음

▸ 이벤트 구성 패턴

    전체 이벤트 중 Pass ≈ 50%, Carry ≈ 23%, Turnover 관련(Recovery + Interception + Tackle + Duel) ≈ 15%
    최빈 bigram: Pass→Pass, Carry→Pass, Pass→Carry, Turnover→Pass
    최빈 trigram: Pass-Carry-Pass, Pass-Pass-Pass, Carry-Pass-Pass 등

    < 인사이트 >
    - n-gram 구조가 강해서 TCN(1D conv), RNN, Transformer 모두 잘 맞는 도메인
    - 특히 Carry → Pass, Turnover → Pass 같은 패턴은 final pass 위치와 전술 의도를 암시해줄 수 있음

▸ 마지막 이벤트 직전 패턴

    Final pass 바로 직전 이벤트의 90%가 Carry / Pass / Recovery 셋 중 하나
    평균 end_x/거리 기준으로 보면 Carry / Tackle 직후 패스가 가장 전방, 가장 먼 거리

    < 인사이트 >
    - prev_event_type, prev_event_dx/dy/angle는 무조건 써야하는 feature
    - 심지어 “마지막 3~4 스텝만 따로 뽑아서 쓰는 모델”도 하나의 strong baseline으로 가능할 듯

▸ Episode별 클러스터 (5개 패턴)

    Balanced build-up / 짧은 측면 전개 / 리셋/후퇴 패턴 / 짧은 반대 측면 전개 / 긴 빌드업 (dist_cum 가장 큼)
    마지막 패스 성공률까지 보면 Cluster 4 (Long Build-up)가 최상(~0.63)

    < 인사이트 >
    - episode에 붙는 cluster_id 자체가 embedding으로 사용될 수 있음
    - Mixture-of-experts / cluster-wise head 같은 구조도 고려 가능
    - 최소한 cluster_id를 one-hot 또는 embedding으로 넣으면, “지금의 빌드업 흐름이 어떤 종류인지” 모델이 한 번에 인식

▸ Player별 분석 결과

    선수별 carry_ratio 분포가 그렇게 극단적이지 않음
    선수별 angle_mean, final pass (end_x, end_y) mean 위치도 좁은 범위에 몰려 있고, 뚜렷한 클러스터 구조 없음

    < 인사이트 >
    - player_id embedding은 효과 대비 리스크(차원+노이즈) 가 큼
    - baseline에서는 아예 빼고 시작하는 게 합리적
    - 나중에 여유 있으면 작은 차원(8~16) + strong dropout으로 시험해보는 정도

▸ Episode별 움직임 & 각도 smoothness

    cum_dx, cum_dy로 episode가 전진 위주인지, 좌우 측면 전개인지가 드러남
    angle 변화량 기준으로 보면, 누적 전진량이 클수록 angle이 더 안정(episode가 한 방향으로 쭉 진행)

    < 인사이트 >
    - cum_dx, cum_dy, movement_norm, angle_mean_abs, angle_std는 전술 패턴을 대표하는 episode별 feature
    - final pass의 zone/거리/각도 예측에 직접적인 신호를 줌

▸ Turnover 이후 3-step window

    Turnover 직후 첫 행동은 dx ~ 0 (잡아두기), 그 이후 2~3 step에서 전진/측면 전개가 본격적으로 나타남

    < 인사이트 >
    - “turnover 이후 k-step” 여부를 표시하는 feature가 유용하게 쓰일 수 있음
    - 특히 final pass가 turnover 직후 짧은 시퀀스에서 나오는지, 긴 빌드업 끝에서 나오는지 구분해줄 수 있음

# 8. EDA에 이은 Feature Engineering (함수 정의)

---

▸ 패스 각도(angle)

    angle = arctan2(end_y - start_y, end_x - start_x)   (arctan(dy/dx)는 dx = 0에서 정의되지 않고 전방/후방 방향을 구분하지 못하므로 arctan2 사용)

    풀백은 측면으로 많이 주고, 중앙 미드필더는 전진 패스의 비율 높음
    수비수는 옆으로 주는 패스나 후방 패스의 비중 높음

▸ 패스 진행 거리

    더 먼 패스일수록 progressive chance가 높고, end_x가 강하게 증가하는 패턴을 가짐

▸ event_type 임베딩

    type_name → embedding vector
    result_name → embedding vector

    sequence embedding에 필수적으로 진행해야하는 것

▸ 에피소드에서의 속도(Δx, Δy)

    dx_t = x_t - x_(t-1)
    dy_t = y_t - y_(t-1)

    엔드 투 엔드 모델보다 훨씬 패턴 학습이 잘 됨

    dx > 0 → 오른쪽으로 전진 중
    dy > 0 → 위쪽으로 이동 중
    dy < 0 → 아래쪽으로 이동 중
    dx ≈ 0 → 횡패스 빈도 높음
    dx < 0 → 후방 패스 비율 증가 (안정화)

    1. 한 에피소드에서 dx가 계속 증가한다 ➜ 공격 전개 중 (전진 패스 가능성이 높음)
    2. dy가 크게 증가했다➜ 측면 전개 중 (사이드로 패스가 날아갈 가능성)
    3. dx가 음수로 전환되었다 ➜ 후방 안정화 패스 패턴
    4. dx, dy가 급격히 바뀐다 ➜ 압박을 벗어나기 위한 빠른 전개

## 8.1 공통 Util & 기본 데이터 정렬

### 8.1.1 이벤트 단순화(클러스터링), Zone 함수 (EDA에서 쓰던 것 정리)

In [None]:
from collections import Counter
from math import log2

# Lookup table: raw ``type_name`` -> simplified event group.
_EVENT_GROUPS = {
    # Pass family
    "Pass": "Pass",
    "Pass_Freekick": "Pass",
    "Pass_Corner": "Pass",
    # Carry
    "Carry": "Carry",
    # Duel / turnover family
    "Duel": "Duel_Turnover",
    "Tackle": "Duel_Turnover",
    "Interception": "Duel_Turnover",
    "Recovery": "Duel_Turnover",
    # Cross (exact match only)
    "Cross": "Cross",
    # Penalty kicks are folded into the Shot group
    "Penalty Kick": "Shot",
    # Clearances
    "Clearance": "Clearance",
    "Aerial Clearance": "Clearance",
    # Goalkeeper actions
    "Catch": "GK_Action",
    "Parry": "GK_Action",
    "Goal Kick": "GK_Action",
    "Keeper Rush-Out": "GK_Action",
    # Block / deflection / intervention / hit
    "Block": "Deflect_Block",
    "Deflection": "Deflect_Block",
    "Intervention": "Deflect_Block",
    "Hit": "Deflect_Block",
    # Set piece
    "Throw-In": "SetPiece",
    # Goal events
    "Goal": "Goal_Event",
    "Own Goal": "Goal_Event",
    # Errors / dead-ball events
    "Error": "Error_Out",
    "Out": "Error_Out",
    "Foul": "Error_Out",
    "Foul_Throw": "Error_Out",
    "Handball_Foul": "Error_Out",
    "Offside": "Error_Out",
}


def simplify_event(t: str) -> str:
    """Map a raw event ``type_name`` to its simplified event group.

    Args:
        t: Raw event type. Non-string values (e.g. NaN or None coming from a
            missing cell) are accepted and mapped to ``"Misc"`` instead of
            raising, mirroring how ``simplify_result`` tolerates missing
            values.

    Returns:
        One of "Pass", "Carry", "Duel_Turnover", "Cross", "Shot",
        "Clearance", "GK_Action", "Deflect_Block", "SetPiece", "Goal_Event",
        "Error_Out" or "Misc" (anything not listed, e.g. "Take-On").
    """
    if not isinstance(t, str):
        return "Misc"

    group = _EVENT_GROUPS.get(t)
    if group is not None:
        return group

    # Every type whose name starts with "Shot" belongs to the Shot group.
    if t.startswith("Shot"):
        return "Shot"

    return "Misc"

# Collapse a raw event result into a coarse outcome label.
def simplify_result(result_name):
    """Map a raw ``result_name`` to "Success", "Fail" or "None".

    Any value that is not one of the listed success/failure labels
    (including a missing value) yields the string "None".
    """
    success_labels = ("Successful", "On Target", "Goal")
    failure_labels = ("Unsuccessful", "Off Target", "Blocked")

    if result_name in success_labels:
        return "Success"
    return "Fail" if result_name in failure_labels else "None"

# Zone helpers: bucket pitch coordinates into coarse zones.
def get_zone_x(x):
    """Bucket an x coordinate into "D3" (x < 35), "M3" (x < 70) or "A3".

    NOTE(review): the labels presumably mean defensive / middle / attacking
    third — confirm against the data description.
    """
    if x < 35:
        return "D3"
    return "M3" if x < 70 else "A3"

def get_zone_y(y):
    """Bucket a y coordinate into "Left" (y < 22), "Center" (y < 45) or "Right"."""
    if y < 22:
        return "Left"
    return "Center" if y < 45 else "Right"

# Shannon entropy of a sequence (used per episode).
def sequence_entropy(seq):
    """Return the Shannon entropy, in bits, of the item frequencies in ``seq``.

    An empty sequence has entropy 0.0.
    """
    n_items = len(seq)
    if n_items == 0:
        return 0.0

    freqs = (count / n_items for count in Counter(seq).values())
    return -sum(f * log2(f) for f in freqs if f > 0)

### 8.1.2 기본 정렬 함수

---

    정렬이 이미 되어있는 데이터라 재정렬시키면 깨질 수 있음 - 삭제

In [None]:
# SORT_COLS = ["game_episode", "time_seconds", "action_id"]

# def sort_events(df: pd.DataFrame) -> pd.DataFrame:
#     """
#     time_seconds, action_id 기준으로 episode 내 이벤트 정렬.
#     """
#     df_sorted = df.sort_values(SORT_COLS).reset_index(drop=True)
#     return df_sorted

## 8.2 이벤트별 Feature Engineering

---

    한 이벤트마다 어떤 Feature를 만들지를 담당하는 함수

### 8.2.1 Turnover flag 계산 (EDA에서 쓴 함수)

In [None]:
def add_turnover_flag(df):
    """Return a copy of ``df`` with a 0/1 ``is_turnover`` column.

    A row is flagged when any of these holds:
      * a Pass / Cross / SetPiece event whose simplified result is "Fail";
      * a "Take-On" or a "Duel" whose raw result is "Unsuccessful";
      * the simplified event is "Duel_Turnover";
      * the simplified event is "Error_Out".

    Requires the columns ``event_simple``, ``result_simple``, ``type_name``
    and ``result_name``.
    """
    out = df.copy()

    event = out["event_simple"]
    raw_type = out["type_name"]
    raw_unsuccessful = out["result_name"].eq("Unsuccessful")

    turnover = (
        (event.isin(["Pass", "Cross", "SetPiece"]) & out["result_simple"].eq("Fail"))
        | (raw_type.eq("Take-On") & raw_unsuccessful)
        | (raw_type.eq("Duel") & raw_unsuccessful)
        | event.eq("Duel_Turnover")
        | event.eq("Error_Out")
    )
    out["is_turnover"] = turnover.astype(int)

    return out

### 8.2.2 episode 내 좌표 차이 / 시간 차이 등 계산 함수

In [None]:
import numpy as np
import pandas as pd

def add_movement_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with per-episode movement and timing columns.

    Rows are used in their existing order (no re-sorting). Added columns:
    ``dx``/``dy`` (difference of start_x/start_y to the previous row of the
    same episode, 0 for the first row), ``distance``, ``angle``, ``dt``
    (time difference, negatives clamped to 0), ``step_idx``, ``epi_len``,
    ``step_idx_norm`` and ``time_rel`` (time rescaled to the episode's
    min..max range).
    """
    out = df.copy()

    def per_episode(column):
        # Fresh grouped view of one column, keyed by episode.
        return out.groupby("game_episode")[column]

    # Displacement between consecutive event start positions.
    for delta_col, source_col in (("dx", "start_x"), ("dy", "start_y")):
        out[delta_col] = per_episode(source_col).diff().fillna(0)

    out["distance"] = np.sqrt(out["dx"]**2 + out["dy"]**2)
    out["angle"] = np.arctan2(out["dy"], out["dx"]).fillna(0)

    # Time delta; a negative gap (timestamp going backwards) becomes 0.
    elapsed = per_episode("time_seconds").diff().fillna(0)
    out["dt"] = elapsed.clip(lower=0)

    # Position of the event inside its episode.
    out["step_idx"] = out.groupby("game_episode").cumcount()
    out["epi_len"] = per_episode("step_idx").transform("max") + 1
    out["step_idx_norm"] = out["step_idx"] / out["epi_len"].clip(lower=1)

    # Relative time; a zero-length span is replaced by 1 to avoid 0/0.
    earliest = per_episode("time_seconds").transform("min")
    latest = per_episode("time_seconds").transform("max")
    span = (latest - earliest).replace(0, 1)
    out["time_rel"] = (out["time_seconds"] - earliest) / span

    return out

### 8.2.3 zone / 골 방향 feature 함수

In [None]:
def add_categorical_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with simplified labels, zones and goal geometry.

    Added columns: ``event_simple``, ``result_simple``, ``zone_x``,
    ``zone_y``, ``dist_to_goal`` and ``angle_to_goal``. The goal reference
    point is fixed at (105.0, 34.0).
    """
    out = df.copy()

    # Simplified event / result labels.
    out["event_simple"] = out["type_name"].map(simplify_event)
    out["result_simple"] = out["result_name"].map(simplify_result)

    # Coarse pitch zones of the start position.
    out["zone_x"] = out["start_x"].map(get_zone_x)
    out["zone_y"] = out["start_y"].map(get_zone_y)

    # Distance and direction from the start position to the goal point.
    goal_x, goal_y = 105.0, 34.0
    to_goal_x = goal_x - out["start_x"]
    to_goal_y = goal_y - out["start_y"]
    out["dist_to_goal"] = np.sqrt(to_goal_x**2 + to_goal_y**2)
    out["angle_to_goal"] = np.arctan2(to_goal_y, to_goal_x)

    return out

### 8.2.4 Episode 누적 이동량 계산 함수

In [None]:
def add_episode_cumulative_movement(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with running per-episode displacement.

    Adds ``cum_dx`` / ``cum_dy`` (cumulative sums of ``dx`` / ``dy`` within
    each ``game_episode``) and ``movement_norm``, the Euclidean length of
    that cumulative displacement. Requires ``dx`` and ``dy`` to exist.
    """
    out = df.copy()

    for axis in ("dx", "dy"):
        out[f"cum_{axis}"] = out.groupby("game_episode")[axis].cumsum()

    out["movement_norm"] = np.sqrt(out["cum_dx"]**2 + out["cum_dy"]**2)
    return out

### 8.2.5 최종 적용 함수

In [None]:
def build_event_level_features(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full event-level feature pipeline on an event table.

    Intended to be applied identically to the train events and the test
    episode events. Steps run in this order because each depends on columns
    produced earlier: categorical features -> turnover flag -> movement
    features -> cumulative movement. Rows are never re-sorted.
    """
    return (
        df.copy()
        .pipe(add_categorical_features)
        .pipe(add_turnover_flag)
        .pipe(add_movement_features)
        .pipe(add_episode_cumulative_movement)
    )

## 8.3 에피소드별 Feature Engineering

### 8.3.1 각도 변화량 요약 함수

In [None]:
def angle_diff(a1, a2):
    """Return ``a2 - a1`` wrapped into the interval [-pi, pi).

    Works element-wise on NumPy arrays as well as on scalars.
    """
    diff = a2 - a1
    diff = (diff + np.pi) % (2 * np.pi) - np.pi
    return diff


# Column layout of the frame returned by compute_angle_smoothness.
_ANGLE_SMOOTHNESS_COLS = [
    "game_episode",
    "angle_change_std",
    "angle_change_mean_abs",
    "angle_change_max",
    "angle_change_N",
]


def compute_angle_smoothness(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise, per episode, how much ``angle`` changes between events.

    Episodes with fewer than 3 events are skipped (no row is emitted).

    Args:
        df: Event table with ``game_episode`` and ``angle`` columns; rows
            are used in their existing order.

    Returns:
        One row per retained episode with ``angle_change_std``,
        ``angle_change_mean_abs``, ``angle_change_max`` and
        ``angle_change_N`` (number of consecutive differences). The columns
        are always present, even when no episode qualifies, so a later merge
        on ``game_episode`` does not fail.
    """
    records = []

    for ge, angles in df.groupby("game_episode")["angle"]:
        ang = angles.to_numpy()
        if len(ang) < 3:
            continue

        # Wrapped differences between consecutive events, computed in one
        # vectorised call instead of element by element.
        diffs = angle_diff(ang[:-1], ang[1:])
        abs_diffs = np.abs(diffs)

        records.append({
            "game_episode": ge,
            "angle_change_std": np.std(diffs),
            "angle_change_mean_abs": np.mean(abs_diffs),
            "angle_change_max": np.max(abs_diffs),
            "angle_change_N": len(diffs),
        })

    angle_df = pd.DataFrame(records, columns=_ANGLE_SMOOTHNESS_COLS)
    if angle_df.empty:
        # Give the metric columns a numeric dtype even with zero rows.
        angle_df = angle_df.astype(
            {col: "float64" for col in _ANGLE_SMOOTHNESS_COLS[1:]}
        )
    return angle_df

def add_angle_smoothness_to_epi(epi_feat: pd.DataFrame,
                                angle_smooth_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the angle smoothness metrics onto the episode feature table.

    Episodes without a metrics row (e.g. episodes too short to be
    summarised) get 0.0 in all four ``angle_change_*`` columns.
    """
    merged = epi_feat.merge(angle_smooth_df, on="game_episode", how="left")

    metric_cols = ("angle_change_std", "angle_change_mean_abs",
                   "angle_change_max", "angle_change_N")
    for col in metric_cols:
        merged[col] = merged[col].fillna(0.0)

    return merged

### 8.3.2 episode별 요약 함수

In [None]:
def extract_episode_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Build one summary row per episode.

    Each row holds the episode length, the share of Pass / Carry events
    (simplified labels) and of turnover-flagged events, mean step
    displacement, mean/std of the step angle, mean and total step distance,
    and the zone of the first event. Step statistics are derived from
    consecutive ``start_x`` / ``start_y`` differences and default to 0 for
    single-event episodes.
    """

    def _summarize(ge, g):
        events = g["event_simple"].values
        xs = g["start_x"].values
        ys = g["start_y"].values

        # Step-to-step displacement; empty for a single-event episode.
        step_x = np.diff(xs)
        step_y = np.diff(ys)
        step_len = np.sqrt(step_x*step_x + step_y*step_y)
        step_angle = np.arctan2(step_y, step_x)
        has_steps = len(step_x) > 0

        return {
            "game_episode": ge,
            "epi_len": len(g),

            # Ratios use the simplified event labels.
            "ratio_pass": np.mean(events == "Pass"),
            "ratio_carry": np.mean(events == "Carry"),
            "ratio_turnover": np.mean(g["is_turnover"].values),

            "dx_mean": step_x.mean() if has_steps else 0,
            "dy_mean": step_y.mean() if has_steps else 0,
            "angle_mean": step_angle.mean() if has_steps else 0,
            "angle_std": step_angle.std() if has_steps else 0,

            "dist_mean": step_len.mean() if has_steps else 0,
            "dist_cum": step_len.sum() if has_steps else 0,

            "start_zone_x": get_zone_x(xs[0]),
            "start_zone_y": get_zone_y(ys[0]),
        }

    return pd.DataFrame(
        [_summarize(ge, g) for ge, g in df.groupby("game_episode")]
    )

### 8.3.3 episode별 event entropy 추가 함수

In [None]:
def add_episode_entropy(df: pd.DataFrame, epi_feat: pd.DataFrame) -> pd.DataFrame:
    """Attach the entropy of each episode's simplified event sequence.

    Computes ``sequence_entropy`` over ``event_simple`` for every
    ``game_episode`` in ``df`` and left-joins it onto ``epi_feat`` as
    ``entropy_event``.
    """
    entropy_rows = [
        {
            "game_episode": ge,
            "entropy_event": sequence_entropy(g["event_simple"].tolist()),
        }
        for ge, g in df.groupby("game_episode")
    ]

    return epi_feat.merge(
        pd.DataFrame(entropy_rows), on="game_episode", how="left"
    )

### 8.3.4 Episode summary 통합 최종 함수

In [None]:
def build_episode_level_features(df_fe: pd.DataFrame) -> pd.DataFrame:
    """Build the episode-level feature table from event-level features.

    ``df_fe`` must already contain the event-level engineered columns. The
    result combines the per-episode summary, the angle smoothness metrics
    and the event-sequence entropy, one row per episode.
    """
    summary = extract_episode_summary(df_fe)
    with_angles = add_angle_smoothness_to_epi(
        summary, compute_angle_smoothness(df_fe)
    )
    return add_episode_entropy(df_fe, with_angles)

---

    일단 Baseline 모델에는
    Event-level에서 event_simple (embedding), result_simple (embedding), is_turnover, dx, dy, distance,
    angle, dt, zone_x/y (embedding), step_idx_norm, time_rel, cum_dx, cum_dy, movement_norm을 Input으로 넣고,

    Episode-level에서는 epi_len, ratio_pass, ratio_carry, ratio_turnover, dx_mean, dy_mean,
    angle_mean, angle_std, dist_cum, dist_mean, angle_smoothness metrics, entropy_event을 Input으로 넣을 듯 !


# 9. 실제 Feature Engineering

In [None]:
import pandas as pd

# Load the raw training events (one row per event).
df = pd.read_csv('Data/train.csv')

In [None]:
# Apply the event-level feature pipeline and preview the result.
df_fe = build_event_level_features(df)
df_fe.head()

In [None]:
# Inspect column dtypes and non-null counts of the event-level table.
df_fe.info()



```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356721 entries, 0 to 356720
Data columns (total 34 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   game_id        356721 non-null  int64  
 1   period_id      356721 non-null  int64  
 2   episode_id     356721 non-null  int64  
 3   time_seconds   356721 non-null  float64
 4   team_id        356721 non-null  int64  
 5   player_id      356721 non-null  int64  
 6   action_id      356721 non-null  int64  
 7   type_name      356721 non-null  object
 8   result_name    216467 non-null  object
 9   start_x        356721 non-null  float64
 10  start_y        356721 non-null  float64
 11  end_x          356721 non-null  float64
 12  end_y          356721 non-null  float64
 13  is_home        356721 non-null  bool   
 14  game_episode   356721 non-null  object
 15  event_simple   356721 non-null  object
 16  result_simple  356721 non-null  object
 17  zone_x         356721 non-null  object
 18  zone_y         356721 non-null  object
 19  dist_to_goal   356721 non-null  float64
 20  angle_to_goal  356721 non-null  float64
 21  is_turnover    356721 non-null  int64  
 22  dx             356721 non-null  float64
 23  dy             356721 non-null  float64
 24  distance       356721 non-null  float64
 25  angle          356721 non-null  float64
 26  dt             356721 non-null  float64
 27  step_idx       356721 non-null  int64  
 28  epi_len        356721 non-null  int64  
 29  step_idx_norm  356721 non-null  float64
 30  time_rel       356721 non-null  float64
 31  cum_dx         356721 non-null  float64
 32  cum_dy         356721 non-null  float64
 33  movement_norm  356721 non-null  float64
dtypes: bool(1), float64(17), int64(9), object(7)
memory usage: 90.2+ MB
```



In [None]:
# Compare the raw labels with their simplified counterparts side by side.
df_fe[['type_name', 'result_name', 'event_simple', 'result_simple']]

In [None]:
# Build the per-episode summary features and preview them.
epi_fe = build_episode_level_features(df_fe)
epi_fe.head()

In [None]:
# Inspect column dtypes and non-null counts of the episode-level table.
epi_fe.info()



```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15435 entries, 0 to 15434
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   game_episode           15435 non-null  object
 1   epi_len                15435 non-null  int64  
 2   ratio_pass             15435 non-null  float64
 3   ratio_carry            15435 non-null  float64
 4   ratio_turnover         15435 non-null  float64
 5   dx_mean                15435 non-null  float64
 6   dy_mean                15435 non-null  float64
 7   angle_mean             15435 non-null  float64
 8   angle_std              15435 non-null  float64
 9   dist_mean              15435 non-null  float64
 10  dist_cum               15435 non-null  float64
 11  start_zone_x           15435 non-null  object
 12  start_zone_y           15435 non-null  object
 13  angle_change_std       15435 non-null  float64
 14  angle_change_mean_abs  15435 non-null  float64
 15  angle_change_max       15435 non-null  float64
 16  angle_change_N         15435 non-null  float64
 17  entropy_event          15435 non-null  float64
dtypes: float64(14), int64(1), object(3)
memory usage: 2.1+ MB
```



In [None]:
# Summary statistics of the numeric episode features, rounded to 2 decimals.
epi_fe.describe().T.round(2)

## 9.1 train_fe 파일 저장

In [None]:
# Rebuild the event-level and episode-level features from the raw train file.
# This overwrites the df_fe / epi_fe variables created in the earlier cells.
df_train = pd.read_csv("Data/train.csv")
df_fe = build_event_level_features(df_train)
epi_fe = build_episode_level_features(df_fe)

# Saving to disk is currently disabled.
# df_fe.to_csv("Data/train_fe.csv", index=False)
# epi_fe.to_csv("Data/train_epi_fe.csv", index=False)

## 9.2 test_fe 파일 저장

In [None]:
# Load and preview the test index file; the loading cell below reads its
# "path", "game_id" and "game_episode" columns.
df_test = pd.read_csv('Data/test.csv')
df_test.head()

In [None]:
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

def load_test_episode_opt(path, game_id, game_episode, base_dir="Data"):
    """Load one test episode CSV and tag it with its identifiers.

    Args:
        path: Location of the episode file relative to ``base_dir``, as
            listed in the test index (e.g. "./test/xxx.csv"). Paths without
            the leading "./" are handled the same way.
        game_id: Value written to the ``game_id`` column of every row.
        game_episode: Value written to the ``game_episode`` column.
        base_dir: Directory the relative ``path`` is resolved against.

    Returns:
        The episode events as a DataFrame with ``game_id`` and
        ``game_episode`` columns set.
    """
    import os

    # Join and normalise instead of slicing off the first character, which
    # only produced a valid path when ``path`` started with "./".
    file_path = os.path.normpath(os.path.join(base_dir, path))

    df = pd.read_csv(file_path)
    df["game_id"] = game_id
    df["game_episode"] = game_episode

    return df

# Index of test episodes: one row per episode with its file path and ids.
test_meta = pd.read_csv("Data/test.csv")

# Read every episode file in parallel (n_jobs=-1); tqdm shows progress over
# the submitted tasks.
test_events = Parallel(n_jobs=-1)(
    delayed(load_test_episode_opt)(p, g, e)
    for p, g, e in tqdm(zip(test_meta["path"], test_meta["game_id"], test_meta["game_episode"]), total=len(test_meta))
)

# Stack all episodes into one event table.
test_df = pd.concat(test_events, ignore_index=True)

# Apply the same feature pipelines as for the training data.
test_fe = build_event_level_features(test_df)
test_epi_fe = build_episode_level_features(test_fe)

# Persist both feature tables; the inference cell reads test_fe.csv.
test_fe.to_csv("Data/test_fe.csv", index=False)
test_epi_fe.to_csv("Data/test_epi_fe.csv", index=False)

# 10. 모델링

## 10.1 train/valid split

---

    에피소드 단위로 split해야겠다 !

    시퀀스 모델은 episode 전체를 하나의 샘플로 보고 학습하기 때문에 episode를 반으로 쪼개거나 섞으면 temporal dependency가 깨짐

    같은 game_id 안에서 train/valid가 섞이면 데이터 누수 발생
    따라서 game_id 단위로 묶어서 episode 단위 split하는 게 가장 안전

    그 중에서도 Game별로 Split 해야할 것 같은데,
    train 게임과 valid 게임을 완전히 분리하고, 하나의 game_id에 속한 모든 episode는 train 또는 valid 중 하나에만 배정하기

    데이터 누수 0이고, 가장 현실적이라고 판단되기 때문에 이 기법으로 선택

In [None]:
df = pd.read_csv('Data/train.csv')

# Before splitting, verify the tail of every episode (each one should end
# with the final pass).
orig_tail = df.groupby("game_episode").tail(1)

# Distinct event types found in the last row of each episode.
orig_tail_types = orig_tail["type_name"].unique()
print(orig_tail_types)



```
['Pass']
```



In [None]:
# Same tail check on the feature-engineered table, to confirm the pipeline
# did not change which event is last in each episode.
fe_tail = df_fe.groupby("game_episode").tail(1)
fe_tail_types = fe_tail["type_name"].unique()

print(fe_tail_types)



```
['Pass' 'Carry']가 나온 걸 보아하니.. FE 과정에서 문제가 생긴 것 같아 뜯어고쳐봐야겠다

⭐ 재정렬을 시키면 안 됐었다 !! 정렬 함수 삭제하니 ['Pass']만 나온다.
```



In [None]:
from sklearn.model_selection import train_test_split

# Split by game (prevents leakage between train and validation).
'''
게임 단위 split (데이터 누수 방지)
'''
game_ids = df["game_id"].unique()

# 80/20 split of the distinct game ids with a fixed seed.
train_games, valid_games = train_test_split(
    game_ids,
    test_size=0.2,
    random_state=42,
)

# Derive the episode ids of each side (always taken from the raw df).
'''
에피소드 단위 split 기준 만들기 (split 기준은 반드시 원본 df에서 뽑기)
'''
train_epis = df[df["game_id"].isin(train_games)]["game_episode"].unique()
valid_epis = df[df["game_id"].isin(valid_games)]["game_episode"].unique()

print(f"Train games: {len(train_games)}, Valid games: {len(valid_games)}")
print(f"Train episodes: {len(train_epis)}, Valid episodes: {len(valid_epis)}")

# Select the feature-engineered rows belonging to each side.
'''
FE 이후 df_fe에서 에피소드 기준으로 데이터 분리
'''
train_df = df_fe[df_fe["game_episode"].isin(train_epis)].copy()
valid_df = df_fe[df_fe["game_episode"].isin(valid_epis)].copy()

# Check that the last event of every episode is still a Pass after the split.
'''
Episode tail 검증
'''
train_tail_types = train_df.groupby("game_episode").tail(1)["type_name"].unique()
valid_tail_types = valid_df.groupby("game_episode").tail(1)["type_name"].unique()

print("Train tail types:", train_tail_types)
print("Valid tail types:", valid_tail_types)

# Stop the cell if any episode ends with something other than a Pass.
assert set(train_tail_types) == {"Pass"}
assert set(valid_tail_types) == {"Pass"}

print("Split integrity confirmed: all episode tails are Pass.")



```
Train games: 158, Valid games: 40
Train episodes: 12389, Valid episodes: 3046
Train tail types: ['Pass']
Valid tail types: ['Pass']
Split integrity confirmed: all episode tails are Pass.
```



In [None]:
# Episode counts and event-table shapes of the two splits.
print(len(train_epis), len(valid_epis))
print(train_df.shape, valid_df.shape)

    12389 3046
    (285011, 34) (71710, 34)

## 10.2 모델 입력 구조 설계

---

    FE(df_fe)는 “이벤트 1개 = 1 row” 형태고, 모델은 “episode 전체 = 1 sample(sequence)” 형태를 원함

    최종적으로 각 episode는 다음과 같은 tensor로 구성
    - X_seq: 이벤트 시퀀스 feature (T × F)
    - mask: padding mask (T)
    - target_x, target_y: 에피소드 마지막 패스 end_x, end_y
    - (선택) categorical embedding index들
    - (선택) episode별 feature

    일단 baseline은 event-level numeric features만 들어가는 baseline version으로 구성하고,
    categorical embedding은 이후 단계에서 추가해볼 버전에 넣어보든가 하기

    Baseline Input Features

| Feature        | 의미                   |
| -------------- | -------------------- |
| start_x, start_y         | 시작 좌표                  |
| dx, dy         | 이동량                  |
| distance       | 이동 거리                |
| angle          | 이동 방향                |
| dt             | 이벤트 간 시간차            |
| step_idx_norm  | 시퀀스 내 포지션            |
| time_rel       | 상대 시간                |
| cum_dx, cum_dy | 에피소드 누적 이동량          |
| movement_norm  | 누적 이동량 크기            |
| dist_to_goal   | 골대까지 거리              |
| angle_to_goal  | 골대 방향 각도             |
| is_turnover    | 1-step turnover flag |

### 10.2.1 입력 구조 설계 함수 정의

In [None]:
def build_episode_sequences(df):
    """Split an event table into one DataFrame per episode.

    Rows keep the order they have in ``df``. They are deliberately NOT
    re-sorted by ``time_seconds`` / ``action_id``: the source data is
    already ordered, and re-sorting moved a non-Pass event to the end of
    some episodes (observed earlier in this notebook), which would make the
    last row no longer the final pass.

    Args:
        df: Event table with a ``game_episode`` column.

    Returns:
        dict mapping ``game_episode`` -> that episode's rows with a fresh
        0..T-1 index.
    """
    return {
        ge: g.reset_index(drop=True)
        for ge, g in df.groupby("game_episode")
    }

def extract_targets(episodes):
    """Return ``{game_episode: (end_x, end_y)}`` taken from each episode's last row."""

    def _final_xy(frame):
        tail = frame.iloc[-1]
        return (tail["end_x"], tail["end_y"])

    return {ge: _final_xy(frame) for ge, frame in episodes.items()}

# Event-level numeric feature columns fed to the sequence model, in the
# order they appear along the feature axis.
CONT_COLS = [
    "start_x", "start_y",   # start coordinates of the event
    "dx", "dy",             # displacement from the previous event
    "distance",             # length of that displacement
    "angle",                # direction of that displacement
    "dt",                   # time gap to the previous event
    "step_idx_norm",        # position inside the episode (step / length)
    "time_rel",             # time rescaled to the episode's time span
    "cum_dx", "cum_dy",     # cumulative displacement within the episode
    "movement_norm",        # magnitude of the cumulative displacement
    "dist_to_goal",         # distance from the start point to the goal point
    "angle_to_goal",        # direction from the start point to the goal point
    "is_turnover"           # 0/1 turnover flag
]

def episode_to_matrix(g, feature_cols=CONT_COLS):
    """Convert one episode's rows into a float32 feature matrix.

    Returns an array of shape (number of rows, number of feature columns)
    holding the ``feature_cols`` columns of ``g`` in the given order.
    """
    selected = g[feature_cols]
    return selected.values.astype("float32")

### 10.2.2 Padding + Attention Mask 생성

In [None]:
def pad_sequence(seq, max_len):
    """Pad or truncate a (T, F) sequence to exactly ``max_len`` steps.

    Returns:
      padded_seq: (max_len, F) — ``seq`` followed by zero rows when T is
                  smaller than max_len, otherwise the first max_len rows.
      mask: (max_len,) float32 — 1 for real steps, 0 for padding.
    """
    n_steps, n_feats = seq.shape
    missing = max_len - n_steps

    # Long enough already: keep the first max_len steps, all of them real.
    if missing <= 0:
        return seq[:max_len], np.ones(max_len).astype("float32")

    # Too short: append zero rows and mark them as padding in the mask.
    filler = np.zeros((missing, n_feats), dtype="float32")
    padded = np.concatenate([seq, filler], axis=0)
    mask = np.concatenate([np.ones(n_steps), np.zeros(missing)])

    return padded, mask.astype("float32")

### 10.2.3 EpisodeDataset (Baseline: Numeric Only)

In [None]:
import torch
from torch.utils.data import Dataset

class EpisodeDataset(Dataset):
    """Episode-level dataset with numeric features only (baseline).

    Each item is one episode: the padded feature matrix, its padding mask
    and the regression target (end_x, end_y of the episode's last row).
    """

    def __init__(self, df, episode_ids, max_len=270, feature_cols=CONT_COLS):
        """
        df: feature-engineered event DataFrame
        episode_ids: episodes to include (train or validation list)
        max_len: number of steps every sequence is padded/truncated to
        feature_cols: numeric columns used as model input
        """
        self.episodes = []
        # Episode ids in the same order as self.episodes, so predictions can
        # be matched back to their episode.
        self.episode_ids = []
        self.max_len = max_len
        self.feature_cols = feature_cols

        # One group per episode. groupby yields groups in sorted key order,
        # which is not necessarily the order of ``episode_ids``.
        for ge, g in df[df["game_episode"].isin(episode_ids)].groupby("game_episode"):
            # Keep the rows in their original order. Re-sorting by
            # time_seconds/action_id is intentionally avoided: it moved a
            # non-Pass event to the end of some episodes (see the tail check
            # earlier in this notebook), which would corrupt the target and
            # break the order the diff-based features were computed in.
            g = g.reset_index(drop=True)

            seq = episode_to_matrix(g, feature_cols=feature_cols)
            seq_pad, mask = pad_sequence(seq, max_len)

            # Target: end_x, end_y of the final pass (last row).
            tx, ty = g["end_x"].iloc[-1], g["end_y"].iloc[-1]
            target = np.array([tx, ty], dtype="float32")

            self.episodes.append((seq_pad, mask, target))
            self.episode_ids.append(ge)

    def __len__(self):
        return len(self.episodes)

    def __getitem__(self, idx):
        x, mask, target = self.episodes[idx]
        return {
            "x": torch.tensor(x),
            "mask": torch.tensor(mask),
            "target": torch.tensor(target),
        }

### 10.2.4 Collate 함수

In [None]:
def collate_fn(batch):
    """Stack a list of dataset items into batched tensors.

    Each item is a dict with keys "x", "mask" and "target"; the result is
    the tuple (x, mask, y) with a new leading batch dimension.
    """
    x, mask, y = (
        torch.stack([item[key] for item in batch])
        for key in ("x", "mask", "target")
    )
    return x, mask, y

## 10.3 BiLSTM 모델 (Only Numeric)

---

    RNN 계열은 시퀀스 데이터 처리의 기본 골격 / 기존 연구들에서도 경기 이벤트, 스포츠 시계열 데이터에 LSTM / BiLSTM 사용 사례 많음

    특히 양방향 BiLSTM은 앞뒤 문맥 모두 고려 가능, 빌드업 전체 흐름을 학습하기에 적합. 실제로 최근 축구 이벤트 기반 분석에서도 활용된 사례 존재

[활용 사례](https://www.mdpi.com/2079-9292/13/20/4105?utm_source=chatgpt.com)

    Transformer나 복잡한 구조는 이후 확장 후보로 두고, 먼저 “단순 + 안정 + 빠른 실험”을 위해 BiLSTM이 이상적

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class BiLSTMRegressor(nn.Module):
    """Bidirectional LSTM that regresses (end_x, end_y) from an event sequence.

    The final hidden states of the last layer's forward and backward
    directions are concatenated and passed through a small MLP head.
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=1, dropout=0.2):
        """
        input_dim: number of features per time step
        hidden_dim: LSTM hidden size per direction
        num_layers: number of stacked LSTM layers
        dropout: dropout probability used in the MLP head
        """
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 2)  # end_x, end_y
        )

    def forward(self, x, mask):
        """
        x: (B, T, F) zero-padded feature sequences
        mask: (B, T) with 1 for real steps and 0 for padding; its row sums
              give the true sequence lengths used for packing, so padded
              steps never reach the LSTM.
        returns: (B, 2) predicted end_x, end_y
        """
        # pack_padded_sequence rejects zero-length sequences; treat an
        # all-padding row as a single (zero) step instead of crashing.
        lengths = mask.sum(dim=1).long().clamp(min=1).cpu()

        # enforce_sorted=False lets PyTorch sort the batch internally and
        # return h_n in the original batch order, so no manual sort/unsort
        # bookkeeping is needed.
        packed = pack_padded_sequence(
            x,
            lengths,
            batch_first=True,
            enforce_sorted=False
        )

        _, (h_n, _) = self.lstm(packed)

        # Last layer's final states: forward direction, then backward.
        h_fwd = h_n[-2]
        h_bwd = h_n[-1]
        h = torch.cat([h_fwd, h_bwd], dim=-1)

        out = self.fc(h)
        return out

    (10.4에서 만드는 categorical label encoder 관련 메모) test.csv에서도 train과 동일한 encoder를 사용해야 하므로, encoders는 pickle로 저장해두고 inference에서 다시 load해야 함

### 10.3.1 Dataset 생성 & DataLoader 구성

In [None]:
from torch.utils.data import DataLoader

# Build the train/validation datasets from the feature-engineered events,
# restricted to the episode ids of each split.
train_dataset = EpisodeDataset(df_fe, train_epis, max_len=270)
valid_dataset = EpisodeDataset(df_fe, valid_epis, max_len=270)

# Only the training loader shuffles; both use the custom collate function.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

### 10.3.2 Train Loop

In [None]:
# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# One input feature per entry of CONT_COLS; optimise with AdamW on an MSE
# objective over (end_x, end_y).
model = BiLSTMRegressor(input_dim=len(CONT_COLS)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

def euclidean(pred, target):
    """Return the mean Euclidean distance between row pairs as a Python float.

    The distance is taken along dim 1 and averaged over the rows of the batch.
    """
    per_row = (pred - target).pow(2).sum(dim=1).sqrt()
    return per_row.mean().item()

EPOCHS = 20

# Sample counts used to turn the accumulated sums into per-episode averages.
n_train = max(len(train_loader.dataset), 1)
n_valid = max(len(valid_loader.dataset), 1)

for epoch in range(EPOCHS):
    # ----------------------- Train -----------------------
    model.train()
    train_loss = 0.0

    for x, mask, y in train_loader:
        x, mask, y = x.to(device), mask.to(device), y.to(device)

        pred = model(x, mask)
        loss = criterion(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so
        # the epoch figure is a true average even when the last batch is
        # smaller.
        train_loss += loss.item() * y.size(0)

    # ----------------------- Valid -----------------------
    model.eval()
    val_loss = 0.0
    val_dist = 0.0

    with torch.no_grad():
        for x, mask, y in valid_loader:
            x, mask, y = x.to(device), mask.to(device), y.to(device)

            pred = model(x, mask)
            loss = criterion(pred, y)

            val_loss += loss.item() * y.size(0)
            val_dist += euclidean(pred, y) * y.size(0)

    # Report per-episode averages. Previously the per-batch values were
    # summed without dividing, so the printed numbers grew with the number
    # of batches and could not be read as a loss or a distance.
    train_loss /= n_train
    val_loss /= n_valid
    val_dist /= n_valid

    print(f"[Epoch {epoch}] "
          f"TrainLoss={train_loss:.4f} | "
          f"ValidLoss={val_loss:.4f} | "
          f"Euclidean={val_dist:.4f}")

# Persist the weights of the final epoch for the inference cell.
torch.save(model.state_dict(), "model.pt")



```
[Epoch 11] TrainLoss=80387.3156 | ValidLoss=18447.2292 | Euclidean=1604.0103
[Epoch 12] TrainLoss=80371.7264 | ValidLoss=18295.6875 | Euclidean=1599.7318
[Epoch 13] TrainLoss=79303.6462 | ValidLoss=19201.0154 | Euclidean=1617.4109
[Epoch 14] TrainLoss=79285.7698 | ValidLoss=18494.2226 | Euclidean=1570.6561 ⭐
[Epoch 15] TrainLoss=78982.6663 | ValidLoss=18193.1013 | Euclidean=1585.8891
[Epoch 16] TrainLoss=78339.7959 | ValidLoss=18259.0561 | Euclidean=1594.4622
[Epoch 17] TrainLoss=78466.6491 | ValidLoss=18590.7915 | Euclidean=1601.3634
[Epoch 18] TrainLoss=77232.0764 | ValidLoss=18761.6560 | Euclidean=1599.2085
[Epoch 19] TrainLoss=78248.9732 | ValidLoss=18124.3916 | Euclidean=1578.8435
```
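    모델 성능이 매우 안 좋은 것을 확인
    (단, 위 로그의 Loss/Euclidean 값은 배치별 평균을 배치 수로 나누지 않고 그대로 합산한 값임. valid는 3046 에피소드 / batch 32 ≈ 96 배치이므로, 에피소드당 평균 거리로 환산하면 약 1570.66 / 96 ≈ 16.4 수준)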

    Categorical context(이벤트 타입, zone 등)가 빠져 있어서, 정보가 크게 손실된 상태고,
    Padding mask는 있지만 attention mechanism이 없음 !! LSTM이 front-loaded됨

    Positional Encoding(better time embedding)도 없음

    따라서 아래 절차처럼 한 번 모델 하나하나씩 구현해보자

    1. Categorical Embedding 추가 ⭐
    2. Episode-level Feature 추가
    3. BiLSTM → BiLSTM + Attention(Luong/Scaled Dot) 업그레이드
    4. Transformer Baseline 추가
    5. multi-task 학습: zone_x_bin, zone_y_bin 또한 같이 예측
    6. 좌표 normalization 도입
    7. loss 개선: MAE + custom distance loss 혼합


### 10.3.3 Inference

In [None]:
df_test_fe = pd.read_csv("Data/test_fe.csv")
test_episodes = df_test_fe["game_episode"].unique()

# Inference needs no per-sample batches; a larger batch is simply faster.
test_dataset = EpisodeDataset(df_test_fe, test_episodes, max_len=270)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

model = BiLSTMRegressor(input_dim=len(CONT_COLS)).to(device)
model.load_state_dict(torch.load("model.pt", map_location=device))
model.eval()

preds = []

with torch.no_grad():
    for x, mask, _ in test_loader:
        x, mask = x.to(device), mask.to(device)
        out = model(x, mask)
        preds.append(out.cpu().numpy())

preds = np.concatenate(preds, axis=0)

# EpisodeDataset builds its items by iterating groupby("game_episode"),
# which yields episodes in sorted key order, not in file order. Recover that
# same order here so each prediction can be tied to its episode id.
pred_episode_ids = df_test_fe.groupby("game_episode").size().index
pred_df = pd.DataFrame(preds, index=pred_episode_ids, columns=["end_x", "end_y"])

sample = pd.read_csv("Data/sample_submission.csv")

if "game_episode" in sample.columns:
    # Align by episode id rather than by row position.
    missing = set(sample["game_episode"]) - set(pred_episode_ids)
    if missing:
        raise ValueError(
            f"{len(missing)} episode(s) in sample_submission.csv have no prediction"
        )
    aligned = pred_df.reindex(sample["game_episode"])
    sample["end_x"] = aligned["end_x"].to_numpy()
    sample["end_y"] = aligned["end_y"].to_numpy()
else:
    # NOTE(review): no id column to align on — falling back to positional
    # assignment, which is only correct if the submission rows follow the
    # sorted game_episode order. Confirm against the submission format.
    sample["end_x"] = preds[:, 0]
    sample["end_y"] = preds[:, 1]

sample.to_csv("Data/submission_1.csv", index=False)
print("Saved Data/submission_1.csv")

## 10.4 BiLSTM+

---

    추가 Feature들

| Column        | 의미                             |
| ------------- | ------------------------------ |
| event_simple  | Pass / Carry / Duel_Turnover 등 |
| result_simple | Success / Fail                 |
| zone_x        | D3 / M3 / A3                   |
| zone_y        | Left / Center / Right          |
| is_home       | home/away 구분                   |


### 10.4.1 Label Encoder 만들기

---

    train_fe 전체를 기준으로 모든 categorical vocabulary를 결정

In [None]:
# Event-level categorical feature columns.
CAT_COLS = ["event_simple", "result_simple", "zone_x", "zone_y", "is_home"]

# These values are label-encoded, looked up in embeddings and concatenated
# to the LSTM input.
import pickle

def build_label_encoders(df):
    """Build a value -> index mapping for every column in ``CAT_COLS``.

    Indices follow the sorted order of each column's distinct non-null
    values, starting at 0.

    Returns:
        (encoders, num_classes): ``encoders[col]`` is the value -> index
        dict and ``num_classes[col]`` the number of distinct values.
    """
    encoders, num_classes = {}, {}

    for col in CAT_COLS:
        vocab = sorted(df[col].dropna().unique())
        encoders[col] = dict(zip(vocab, range(len(vocab))))
        num_classes[col] = len(vocab)

    return encoders, num_classes

# Fit the categorical vocabularies on the full feature-engineered train table.
encoders, num_classes = build_label_encoders(df_fe)

# Save both objects so the same mappings can be reloaded later.
with open("Data/encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

with open("Data/num_classes.pkl", "wb") as f:
    pickle.dump(num_classes, f)

### 10.4.2 EpisodeDataset을 categorical embedding과 함께 구축

In [None]:
import torch
from torch.utils.data import Dataset

class EpisodeDataset(Dataset):
    """Episode-level dataset that pre-builds padded tensors for every episode.

    Rows of ``df`` whose ``game_episode`` is in ``episode_ids`` are grouped per
    episode, ordered by ``(time_seconds, action_id)`` and converted once, at
    construction time, into a tuple ``(x_cont, x_cat, mask, target)``:

    * ``x_cont`` -- ``cont_cols`` values as float32, padded by ``pad_sequence``
      (a helper defined elsewhere in this file).
    * ``x_cat``  -- ``cat_cols`` values mapped through ``encoders``; values with
      no encoder entry become index 0, and padding rows are 0 as well.
    * ``mask``   -- the mask returned by ``pad_sequence``.
    * ``target`` -- ``(end_x, end_y)`` of the episode's last row, float32.
    """

    def __init__(self, df, episode_ids, max_len=270,
                 cont_cols=CONT_COLS, cat_cols=CAT_COLS, encoders=None):
        self.max_len = max_len
        self.cont_cols = cont_cols
        self.cat_cols = cat_cols
        self.encoders = encoders

        selected = df[df["game_episode"].isin(episode_ids)]
        self.episodes = [
            self._to_tensors(
                events.sort_values(["time_seconds", "action_id"]).reset_index(drop=True)
            )
            for _, events in selected.groupby("game_episode")
        ]

    def _to_tensors(self, events):
        """Convert one time-ordered episode frame into its tensor tuple."""
        numeric = events[self.cont_cols].values.astype("float32")

        # One integer code column per categorical feature -> (T, C).
        codes = np.stack(
            [
                events[col].map(self.encoders[col]).fillna(0).astype(int).values
                for col in self.cat_cols
            ],
            axis=1,
        )

        numeric_padded, mask = pad_sequence(numeric, self.max_len)

        # Categorical codes are padded (or truncated) here to max_len rows.
        n_missing = self.max_len - len(numeric)
        if n_missing > 0:
            filler = np.zeros((n_missing, codes.shape[1]), dtype="int64")
            codes_padded = np.concatenate([codes, filler], axis=0)
        else:
            codes_padded = codes[:self.max_len]

        # The regression target is the end point of the episode's last event.
        target = np.array(
            [events["end_x"].iloc[-1], events["end_y"].iloc[-1]], dtype="float32"
        )

        return (
            torch.tensor(numeric_padded),
            torch.tensor(codes_padded),
            torch.tensor(mask),
            torch.tensor(target),
        )

    def __len__(self):
        return len(self.episodes)

    def __getitem__(self, idx):
        return self.episodes[idx]

### 10.4.3 Collate 함수

In [None]:
def collate_fn(batch):
    """Stack per-episode ``(x_cont, x_cat, mask, target)`` tuples along a new batch axis.

    Returns:
        ``(x_cont, x_cat, mask, y)``, each a stacked tensor whose first
        dimension is the batch size.
    """
    x_cont = torch.stack([cont for cont, _, _, _ in batch])
    x_cat = torch.stack([cat for _, cat, _, _ in batch])
    mask = torch.stack([valid for _, _, valid, _ in batch])
    y = torch.stack([target for _, _, _, target in batch])
    return x_cont, x_cat, mask, y

### 10.4.4 Embedding + LSTM + FC 구조로 모델 구축

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class BiLSTMWithCat(nn.Module):
    """Bidirectional LSTM regressor over numeric features plus categorical embeddings.

    Args:
        cont_dim: Number of numeric features per time step.
        num_classes_dict: Mapping ``column -> vocabulary size``; its key order
            must match the column order of ``x_cat``.
        lstm_hidden: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout probability inside the regression head.
    """

    def __init__(self, cont_dim, num_classes_dict,
                 lstm_hidden=128, lstm_layers=1, dropout=0.2):

        super().__init__()

        self.cat_cols = list(num_classes_dict.keys())

        # Embedding width per column: half the vocabulary (rounded up), capped at 16.
        widths = {
            col: min(16, (num_classes_dict[col] + 1) // 2) for col in self.cat_cols
        }
        self.emb_layers = nn.ModuleDict({
            col: nn.Embedding(num_classes_dict[col], widths[col])
            for col in self.cat_cols
        })

        self.lstm = nn.LSTM(
            input_size=cont_dim + sum(widths.values()),
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True
        )

        self.fc = nn.Sequential(
            nn.Linear(lstm_hidden * 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 2)  # end_x, end_y
        )

    def forward(self, x_cont, x_cat, mask):
        """Predict ``(end_x, end_y)`` for each episode in the batch.

        Args:
            x_cont: ``(B, T, cont_dim)`` numeric features.
            x_cat: ``(B, T, C)`` integer codes, one column per categorical feature.
            mask: ``(B, T)`` with 1 on valid steps and 0 on padding.

        Returns:
            ``(B, 2)`` tensor of predictions.
        """
        embedded = [
            self.emb_layers[col](x_cat[:, :, idx])
            for idx, col in enumerate(self.cat_cols)
        ]
        x_emb = torch.cat(embedded, dim=-1)
        features = torch.cat([x_cont, x_emb], dim=-1)

        # Packing hides the padded steps from the LSTM; with enforce_sorted=False
        # PyTorch sorts by length internally and returns h_n in the original order.
        packed = pack_padded_sequence(
            features,
            mask.sum(dim=1).long().cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        _, (h_n, _) = self.lstm(packed)

        # Final hidden states of the last layer: forward direction, then backward.
        summary = torch.cat([h_n[-2], h_n[-1]], dim=-1)

        return self.fc(summary)

### 10.4.5 DataLoader 준비

In [None]:
# Options shared by the train and validation datasets.
_episode_kwargs = {
    "max_len": 270,
    "cont_cols": CONT_COLS,
    "cat_cols": CAT_COLS,
    "encoders": encoders,
}

train_dataset = EpisodeDataset(df_fe, train_epis, **_episode_kwargs)
valid_dataset = EpisodeDataset(df_fe, valid_epis, **_episode_kwargs)

# Only the training loader is shuffled.
train_loader, valid_loader = (
    DataLoader(dataset, batch_size=32, shuffle=do_shuffle, collate_fn=collate_fn)
    for dataset, do_shuffle in ((train_dataset, True), (valid_dataset, False))
)

### 10.4.6 Train Loop

In [None]:
# Train BiLSTMWithCat and report per-episode mean metrics for every epoch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device =", device)

model = BiLSTMWithCat(
    cont_dim=len(CONT_COLS),
    num_classes_dict=num_classes
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

def euclidean(pred, target):
    """Return the mean Euclidean distance between predicted and target points as a float."""
    return torch.sqrt(((pred - target)**2).sum(dim=1)).mean().item()

EPOCHS = 50

for epoch in range(EPOCHS):
    # ----------------------- Train -----------------------
    model.train()
    train_loss = 0.0
    n_train = 0

    for x_cont, x_cat, mask, y in train_loader:
        x_cont = x_cont.to(device)
        x_cat = x_cat.to(device)
        mask = mask.to(device)
        y = y.to(device)

        pred = model(x_cont, x_cat, mask)
        loss = criterion(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so the epoch value is a true mean,
        # not a sum that grows with the number of batches.
        n_batch = y.size(0)
        train_loss += loss.item() * n_batch
        n_train += n_batch

    train_loss /= max(n_train, 1)

    # ----------------------- Valid -----------------------
    model.eval()
    val_loss = 0.0
    val_dist = 0.0
    n_valid = 0

    with torch.no_grad():
        for x_cont, x_cat, mask, y in valid_loader:
            x_cont = x_cont.to(device)
            x_cat = x_cat.to(device)
            mask = mask.to(device)
            y = y.to(device)

            pred = model(x_cont, x_cat, mask)
            loss = criterion(pred, y)

            n_batch = y.size(0)
            val_loss += loss.item() * n_batch
            val_dist += euclidean(pred, y) * n_batch
            n_valid += n_batch

    val_loss /= max(n_valid, 1)
    val_dist /= max(n_valid, 1)  # mean distance per episode

    print(
        f"[Epoch {epoch}] "
        f"TrainLoss={train_loss:.4f} | "
        f"ValidLoss={val_loss:.4f} | "
        f"Euclid={val_dist:.4f}"
    )

# NOTE(review): "model.pt" is also the checkpoint name loaded by the plain
# BiLSTMRegressor inference cell, so saving here overwrites that file.
torch.save(model.state_dict(), "model.pt")
print("Saved model!")



```
[Epoch 20] TrainLoss=70213.4230 | ValidLoss=16892.3024 | Euclid=1510.8061
[Epoch 21] TrainLoss=70222.9263 | ValidLoss=16957.8709 | Euclid=1500.0777
[Epoch 22] TrainLoss=70127.4421 | ValidLoss=17092.4315 | Euclid=1522.6829
[Epoch 23] TrainLoss=69660.9905 | ValidLoss=16847.7885 | Euclid=1509.6669
[Epoch 24] TrainLoss=68723.5326 | ValidLoss=16813.4463 | Euclid=1511.7533
[Epoch 25] TrainLoss=68966.2243 | ValidLoss=16857.4438 | Euclid=1491.5315
[Epoch 26] TrainLoss=68657.3103 | ValidLoss=17128.0384 | Euclid=1532.2479
[Epoch 27] TrainLoss=68148.0404 | ValidLoss=16771.6007 | Euclid=1477.3296
[Epoch 28] TrainLoss=67543.9454 | ValidLoss=16825.2638 | Euclid=1496.1501
[Epoch 29] TrainLoss=66826.2477 | ValidLoss=16519.7088 | Euclid=1479.4009
```



## 10.5 마지막 3-step 추가한 버전으로 한 번 더

---

    이번에 추가할 Feature

| Feature                      | 의미      |
| ---------------------------- | ------- |
| dx_last1, dx_last2, dx_last3 | 마지막 3개 이벤트의 x축 이동량 |
| dy_last1, dy_last2, dy_last3 | 마지막 3개 이벤트의 y축 이동량 |
| dist_last1..3                | 마지막 거리  |
| angle_last1..3               | 마지막 변화각 |
| event_last1..3               | 이벤트 종류 (event_simple) |
| result_last1..3              | 성공/실패 (result_simple) |

In [None]:
def add_last_k_features(df, k=3):
    """Summarise the last ``k`` events of every episode as one row of features.

    ``*_last1`` always describes the final event of the episode, ``*_last2``
    the one before it, and so on. Previously the slot depended on episode
    length: the final event landed in ``last3`` for long episodes but in
    ``last1``/``last2`` for episodes shorter than ``k``.

    Args:
        df: Event-level frame with ``game_episode``, ``dx``, ``dy``,
            ``distance``, ``angle``, ``event_simple`` and ``result_simple``.
            When ``time_seconds`` and ``action_id`` are both present, each
            episode is ordered by them first, matching the episode datasets.
        k: Number of trailing events to summarise.

    Returns:
        DataFrame with one row per episode: ``game_episode`` plus
        ``dx/dy/dist/angle_last{i}`` and ``event/result_last{i}`` for
        ``i = 1..k``. Slots beyond the episode length hold ``0`` for numeric
        features and the string ``"None"`` for categorical ones.

    NOTE(review): the final row is the pass being predicted. If ``dx``, ``dy``
    or ``distance`` are derived from that row's end coordinates, the ``last1``
    features leak the target -- confirm against the feature engineering.
    """
    numeric_sources = (
        ("dx", "dx"),
        ("dy", "dy"),
        ("dist", "distance"),
        ("angle", "angle"),
    )
    sort_keys = ["time_seconds", "action_id"]
    can_sort = all(key in df.columns for key in sort_keys)

    records = []

    for ge, g in df.groupby("game_episode"):
        if can_sort:
            g = g.sort_values(sort_keys)

        tail = g.tail(k)
        n_rows = len(tail)

        rec = {"game_episode": ge}

        # numeric last-k, counted backwards from the final event
        for step in range(1, k + 1):
            for prefix, source in numeric_sources:
                rec[f"{prefix}_last{step}"] = (
                    tail[source].iloc[-step] if step <= n_rows else 0
                )

        # categorical last-k, same backwards indexing
        for step in range(1, k + 1):
            if step <= n_rows:
                rec[f"event_last{step}"] = tail["event_simple"].iloc[-step]
                rec[f"result_last{step}"] = tail["result_simple"].iloc[-step]
            else:
                rec[f"event_last{step}"] = "None"
                rec[f"result_last{step}"] = "None"

        records.append(rec)

    return pd.DataFrame(records)

In [None]:
# Per-episode summary of the last three events.
epi_last3 = add_last_k_features(df_fe, k=3)

# Attach the last-3 summary to the episode-level features; the left join keeps every row of epi_fe.
epi_full = epi_fe.merge(epi_last3, on="game_episode", how="left")
epi_full.to_csv("Data/train_epi_full.csv", index=False)
epi_full.head()

### 10.5.1 추가 컬럼 정의

In [None]:
# episode-level numeric feature (epi_full 기준 컬럼들)
# Episode-level numeric features (columns of epi_full) fed to the hybrid model.
EPI_COLS = [
    "epi_len",
    "ratio_pass", "ratio_carry", "ratio_turnover",
    "dx_mean", "dy_mean",
    "angle_mean", "angle_std",
    "dist_mean", "dist_cum",
    "angle_change_std", "angle_change_mean_abs",
    "angle_change_max", "angle_change_N",
    "entropy_event",

    # Numeric summary of the last three steps.
    "dx_last1", "dy_last1", "dist_last1", "angle_last1",
    "dx_last2", "dy_last2", "dist_last2", "angle_last2",
    "dx_last3", "dy_last3", "dist_last3", "angle_last3",
]

### 10.5.2 EpisodeHybridDataset

In [None]:
import torch
from torch.utils.data import Dataset

class EpisodeHybridDataset(Dataset):
    """Episode dataset combining the padded event sequence with episode-level features.

    Every item is ``(x_cont, x_cat, mask, epi_vec, target)``, built once at
    construction time:

    * ``x_cont`` / ``mask`` -- ``cont_cols`` as float32, padded by
      ``pad_sequence`` (a helper defined elsewhere in this file).
    * ``x_cat``   -- ``cat_cols`` mapped through ``encoders``; unmapped values
      and padding rows are index 0.
    * ``epi_vec`` -- the ``epi_cols`` values of the episode's row in
      ``epi_full``, float32.
    * ``target``  -- ``(end_x, end_y)`` of the last event, or ``(0, 0)`` when
      ``has_target`` is false.
    """

    def __init__(
        self,
        df_fe,         # event-level features (train_fe.csv or test_fe.csv)
        epi_full,      # episode-level features (train_epi_full.csv or test_epi_full.csv)
        episode_ids,   # episodes to include
        encoders,
        max_len=270,
        cont_cols=CONT_COLS,
        cat_cols=CAT_COLS,
        epi_cols=EPI_COLS,
        has_target=True,
    ):
        self.max_len = max_len
        self.cont_cols = cont_cols
        self.cat_cols = cat_cols
        self.epi_cols = epi_cols
        self.encoders = encoders
        self.has_target = has_target

        # Index by game_episode so each episode's feature row is a direct lookup.
        self.epi_full = epi_full.set_index("game_episode")

        chosen = df_fe[df_fe["game_episode"].isin(episode_ids)]
        self.episodes = [
            self._make_item(
                episode_id,
                frame.sort_values(["time_seconds", "action_id"]).reset_index(drop=True),
            )
            for episode_id, frame in chosen.groupby("game_episode")
        ]

    def _make_item(self, episode_id, frame):
        """Build the tensor tuple for one time-ordered episode frame."""
        # Sequence-level numeric features.
        seq_numeric = frame[self.cont_cols].values.astype("float32")

        # Sequence-level categorical codes -> (T, C).
        seq_codes = np.stack(
            [
                frame[col].map(self.encoders[col]).fillna(0).astype(int).values
                for col in self.cat_cols
            ],
            axis=1,
        )

        padded_numeric, mask = pad_sequence(seq_numeric, self.max_len)

        shortfall = self.max_len - len(seq_numeric)
        if shortfall > 0:
            zeros = np.zeros((shortfall, seq_codes.shape[1]), dtype="int64")
            padded_codes = np.concatenate([seq_codes, zeros], axis=0)
        else:
            padded_codes = seq_codes[:self.max_len]

        # Episode-level feature vector.
        epi_vec = self.epi_full.loc[episode_id, self.epi_cols].values.astype("float32")

        # The target is only meaningful for train/valid; otherwise a dummy is stored.
        if self.has_target:
            xy = [frame["end_x"].iloc[-1], frame["end_y"].iloc[-1]]
        else:
            xy = [0.0, 0.0]
        target = np.array(xy, dtype="float32")

        return (
            torch.tensor(padded_numeric),
            torch.tensor(padded_codes),
            torch.tensor(mask),
            torch.tensor(epi_vec),
            torch.tensor(target),
        )

    def __len__(self):
        return len(self.episodes)

    def __getitem__(self, idx):
        return self.episodes[idx]

### 10.5.3 Collate 함수

In [None]:
def collate_hybrid(batch):
    """Stack ``(x_cont, x_cat, mask, epi_vec, target)`` items along a new batch axis.

    Returns:
        ``(x_cont, x_cat, mask, epi, y)``, each with the batch size as its
        first dimension.
    """
    x_cont = torch.stack([cont for cont, _, _, _, _ in batch])
    x_cat = torch.stack([cat for _, cat, _, _, _ in batch])
    mask = torch.stack([valid for _, _, valid, _, _ in batch])
    epi = torch.stack([vec for _, _, _, vec, _ in batch])
    y = torch.stack([target for _, _, _, _, target in batch])
    return x_cont, x_cat, mask, epi, y

### 10.5.4 HybridBiLSTM

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class HybridBiLSTMWithCat(nn.Module):
    """BiLSTM sequence encoder with categorical embeddings and an optional fused head.

    Args:
        cont_dim: Number of numeric features per time step.
        num_classes_dict: Mapping ``column -> vocabulary size``; its key order
            must match the column order of ``x_cat``.
        epi_feat_dim: Number of episode-level features expected by ``self.fc``.
        lstm_hidden: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout probability inside ``self.fc``.
    """

    def __init__(self, cont_dim, num_classes_dict,
                 epi_feat_dim, lstm_hidden=128, lstm_layers=1, dropout=0.2):
        super().__init__()

        self.cat_cols = list(num_classes_dict.keys())

        # One embedding table per categorical column.
        self.emb_layers = nn.ModuleDict({
            col: nn.Embedding(num_classes_dict[col], min(16, (num_classes_dict[col] + 1) // 2))
            for col in self.cat_cols
        })

        emb_dim = sum(min(16, (n + 1) // 2) for n in num_classes_dict.values())
        input_dim = cont_dim + emb_dim

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True
        )

        # Head over [sequence summary, episode features]; used only when
        # forward() receives epi_vec.
        self.fc = nn.Sequential(
            nn.Linear(lstm_hidden * 2 + epi_feat_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 2)  # end_x, end_y
        )

    def forward(self, x_cont, x_cat, mask, epi_vec=None):
        """Encode the event sequence and optionally predict ``(end_x, end_y)``.

        Args:
            x_cont: ``(B, T, F_cont)`` numeric features.
            x_cat: ``(B, T, C_cat)`` integer codes.
            mask: ``(B, T)`` with 1 on valid steps and 0 on padding.
            epi_vec: Optional ``(B, epi_feat_dim)`` episode-level features.

        Returns:
            Without ``epi_vec``: the ``(B, 2 * lstm_hidden)`` sequence
            representation, for an external head to consume. With ``epi_vec``:
            the ``(B, 2)`` prediction of the built-in head.
        """
        # 1) embedding lookup
        emb_list = [
            self.emb_layers[col](x_cat[:, :, i])
            for i, col in enumerate(self.cat_cols)
        ]
        x_emb = torch.cat(emb_list, dim=-1)  # (B, T, emb_dim)

        # 2) concat numeric + embedding
        x = torch.cat([x_cont, x_emb], dim=-1)  # (B, T, input_dim)

        # 3) pack so the LSTM never sees padded steps; enforce_sorted=False lets
        #    PyTorch sort by length and restore the batch order of h_n itself.
        lengths = mask.sum(dim=1).long()
        packed = pack_padded_sequence(
            x,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        _, (h_n, _) = self.lstm(packed)

        # Last layer's forward and backward final hidden states.
        h = torch.cat([h_n[-2], h_n[-1]], dim=-1)  # (B, 2*lstm_hidden)

        if epi_vec is None:
            return h  # raw representation, concatenated with episode features by the caller

        return self.fc(torch.cat([h, epi_vec], dim=-1))

### 10.5.5 DataLoader 준비

In [None]:
MAX_LEN = 270

# Reload the categorical vocabularies written by the label-encoder cell.
with open("Data/encoders.pkl", "rb") as f:
    encoders = pickle.load(f)
with open("Data/num_classes.pkl", "rb") as f:
    num_classes = pickle.load(f)

# Event-level and episode-level feature tables.
df_fe = pd.read_csv("Data/train_fe.csv")
epi_full = pd.read_csv("Data/train_epi_full.csv")

# Everything except the episode split is identical for train and validation.
_hybrid_kwargs = dict(
    df_fe=df_fe,
    epi_full=epi_full,
    encoders=encoders,
    max_len=MAX_LEN,
    cont_cols=CONT_COLS,
    cat_cols=CAT_COLS,
    epi_cols=EPI_COLS,
    has_target=True,
)

train_dataset = EpisodeHybridDataset(episode_ids=train_epis, **_hybrid_kwargs)
valid_dataset = EpisodeHybridDataset(episode_ids=valid_epis, **_hybrid_kwargs)

# Only the training loader is shuffled.
train_loader, valid_loader = (
    DataLoader(dataset, batch_size=32, shuffle=do_shuffle, collate_fn=collate_hybrid)
    for dataset, do_shuffle in ((train_dataset, True), (valid_dataset, False))
)

### 10.5.6 Train Loop

In [None]:
# Train the hybrid model: BiLSTM backbone + external head over [sequence summary, episode features].
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device =", device)

epi_feat_dim = len(EPI_COLS)
LSTM_HIDDEN = 128  # shared by backbone and head so their sizes cannot drift apart

backbone = HybridBiLSTMWithCat(
    cont_dim=len(CONT_COLS),
    num_classes_dict=num_classes,
    epi_feat_dim=epi_feat_dim,   # required by the constructor; the concat itself happens below
    lstm_hidden=LSTM_HIDDEN,
).to(device)

# External regression head applied to the concatenated representation.
fc_head = nn.Sequential(
    nn.Linear(2 * LSTM_HIDDEN + epi_feat_dim, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 2)
).to(device)

params = list(backbone.parameters()) + list(fc_head.parameters())
optimizer = torch.optim.AdamW(params, lr=1e-3)
criterion = nn.MSELoss()

def euclidean(pred, target):
    """Return the mean Euclidean distance between predicted and target points as a float."""
    return torch.sqrt(((pred - target) ** 2).sum(dim=1)).mean().item()

EPOCHS = 50

for epoch in range(EPOCHS):
    # ---------- Train ----------
    backbone.train()
    fc_head.train()
    train_loss = 0.0
    n_train = 0

    for x_cont, x_cat, mask, epi_vec, y in train_loader:
        x_cont = x_cont.to(device)
        x_cat  = x_cat.to(device)
        mask   = mask.to(device)
        epi_vec = epi_vec.to(device)
        y      = y.to(device)

        h = backbone(x_cont, x_cat, mask)      # (B, 2*hidden)
        z = torch.cat([h, epi_vec], dim=1)     # (B, 2*hidden + epi_feat_dim)
        pred = fc_head(z)                      # (B, 2)

        loss = criterion(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so the epoch value is a true mean,
        # not a sum that grows with the number of batches.
        n_batch = y.size(0)
        train_loss += loss.item() * n_batch
        n_train += n_batch

    train_loss /= max(n_train, 1)

    # ---------- Valid ----------
    backbone.eval()
    fc_head.eval()
    val_loss = 0.0
    val_dist = 0.0
    n_valid = 0

    with torch.no_grad():
        for x_cont, x_cat, mask, epi_vec, y in valid_loader:
            x_cont = x_cont.to(device)
            x_cat  = x_cat.to(device)
            mask   = mask.to(device)
            epi_vec = epi_vec.to(device)
            y      = y.to(device)

            h = backbone(x_cont, x_cat, mask)
            z = torch.cat([h, epi_vec], dim=1)
            pred = fc_head(z)

            loss = criterion(pred, y)

            n_batch = y.size(0)
            val_loss += loss.item() * n_batch
            val_dist += euclidean(pred, y) * n_batch
            n_valid += n_batch

    val_loss /= max(n_valid, 1)
    val_dist /= max(n_valid, 1)  # mean distance per episode

    print(
        f"[Epoch {epoch}] "
        f"TrainLoss={train_loss:.4f} | "
        f"ValidLoss={val_loss:.4f} | "
        f"Euclid={val_dist:.4f}"
    )

torch.save({
    "backbone": backbone.state_dict(),
    "fc_head": fc_head.state_dict(),
}, "Data/hybrid_model.pt")

print("Saved hybrid_model.pt")



```
[Epoch 39] TrainLoss=59771.0182 | ValidLoss=17011.9001 | Euclid=1487.1916 ⭐
[Epoch 40] TrainLoss=59688.4913 | ValidLoss=17203.0105 | Euclid=1512.8117
[Epoch 41] TrainLoss=59036.6915 | ValidLoss=17134.9613 | Euclid=1493.1779
[Epoch 42] TrainLoss=57437.4887 | ValidLoss=17217.3898 | Euclid=1501.6665
[Epoch 43] TrainLoss=58409.0617 | ValidLoss=17064.9690 | Euclid=1507.6640
[Epoch 44] TrainLoss=57272.1718 | ValidLoss=17342.1851 | Euclid=1502.4958
[Epoch 45] TrainLoss=57427.6512 | ValidLoss=18147.3226 | Euclid=1552.6433
[Epoch 46] TrainLoss=56295.2120 | ValidLoss=17060.9277 | Euclid=1494.9201
[Epoch 47] TrainLoss=55884.3484 | ValidLoss=17310.6341 | Euclid=1499.5079
[Epoch 48] TrainLoss=54851.3513 | ValidLoss=18033.2919 | Euclid=1529.1722
[Epoch 49] TrainLoss=54920.3710 | ValidLoss=17347.7805 | Euclid=1508.3866
```



## 10.6 Transformer Encoder Baseline

### 10.6.1 Padding & Dataset

In [None]:
import numpy as np

def pad_sequence_cont_cat(seq_cont, seq_cat, max_len):
    """Pad or truncate a numeric/categorical sequence pair to exactly ``max_len`` rows.

    Args:
        seq_cont: ``(T, F_cont)`` numeric features.
        seq_cat: ``(T, C_cat)`` categorical codes.
        max_len: Number of rows in the output.

    Returns:
        ``(padded_cont, padded_cat, mask)``: float32 ``(max_len, F_cont)``,
        int64 ``(max_len, C_cat)`` and float32 ``(max_len,)``. The mask is 1 on
        rows copied from the input and 0 on zero-filled padding rows. Inputs
        longer than ``max_len`` keep only their first ``max_len`` rows.
    """
    n_keep = min(seq_cont.shape[0], max_len)

    # Start from all-zero buffers and copy the kept prefix into them.
    padded_cont = np.zeros((max_len, seq_cont.shape[1]), dtype="float32")
    padded_cat = np.zeros((max_len, seq_cat.shape[1]), dtype="int64")
    mask = np.zeros(max_len, dtype="float32")

    padded_cont[:n_keep] = seq_cont[:n_keep]
    padded_cat[:n_keep] = seq_cat[:n_keep]
    mask[:n_keep] = 1.0

    return padded_cont, padded_cat, mask

def build_label_encoders(df, cat_cols=None):
    """Build a value -> index vocabulary per categorical column, reserving 0 for padding.

    Args:
        df: DataFrame containing the categorical columns.
        cat_cols: Columns to encode. Defaults to the module-level ``CAT_COLS``.

    Returns:
        ``(encoders, num_classes)`` where ``encoders[col]`` maps each distinct
        non-null value (in sorted order) to ``1..n`` and ``num_classes[col]``
        is ``n + 1`` so that index 0 stays free for padding / unmapped values.

    NOTE(review): this definition only takes effect once it is called again and
    the resulting ``encoders`` / ``num_classes`` are used to build the datasets
    and the model; the visible cells after it do not call it -- confirm.
    """
    if cat_cols is None:
        cat_cols = CAT_COLS

    encoders = {}
    num_classes = {}

    for col in cat_cols:
        uniques = sorted(df[col].dropna().unique())
        # Start at 1: index 0 is reserved for PAD.
        encoders[col] = {value: index for index, value in enumerate(uniques, start=1)}
        num_classes[col] = len(uniques) + 1

    return encoders, num_classes

In [None]:
from torch.utils.data import Dataset

class EpisodeDatasetWithCat(Dataset):
    """Train/validation episode dataset (target included) for the Transformer baseline.

    Every item is ``(x_cont, x_cat, mask, target)``, built once at construction
    time: ``cont_cols`` as float32 and ``cat_cols`` mapped through ``encoders``
    (unmapped values become 0), both padded by ``pad_sequence_cont_cat``; the
    target is ``(end_x, end_y)`` of the episode's last event.
    """

    def __init__(
        self,
        df,
        episode_ids,
        max_len=270,
        cont_cols=CONT_COLS,
        cat_cols=CAT_COLS,
        encoders=None,
    ):
        self.max_len = max_len
        self.cont_cols = cont_cols
        self.cat_cols = cat_cols
        self.encoders = encoders
        self.episodes = []

        wanted = df[df["game_episode"].isin(episode_ids)]

        for _, episode in wanted.groupby("game_episode"):
            ordered = episode.sort_values(["time_seconds", "action_id"]).reset_index(drop=True)
            self.episodes.append(self._encode_episode(ordered))

    def _encode_episode(self, ordered):
        """Turn one time-ordered episode frame into its padded tensor tuple."""
        # Numeric features.
        numeric = ordered[self.cont_cols].values.astype("float32")

        # Categorical features as integer codes -> (T, C).
        codes = np.stack(
            [
                ordered[col].map(self.encoders[col]).fillna(0).astype(int).values
                for col in self.cat_cols
            ],
            axis=1,
        )

        pad_cont, pad_cat, mask = pad_sequence_cont_cat(numeric, codes, self.max_len)

        # Target: end point of the final pass.
        final_event = ordered.index[-1]
        target = np.array(
            [ordered["end_x"].iloc[final_event], ordered["end_y"].iloc[final_event]],
            dtype="float32",
        )

        return (
            torch.tensor(pad_cont),    # (T, F_cont)
            torch.tensor(pad_cat),     # (T, C_cat)
            torch.tensor(mask),        # (T,)
            torch.tensor(target),      # (2,)
        )

    def __len__(self):
        return len(self.episodes)

    def __getitem__(self, idx):
        return self.episodes[idx]

In [None]:
def collate_fn_cat(batch):
    """Stack ``(x_cont, x_cat, mask, target)`` items along a new batch axis.

    Returns:
        ``(x_cont, x_cat, mask, y)`` with shapes ``(B, T, F_cont)``,
        ``(B, T, C_cat)``, ``(B, T)`` and ``(B, 2)`` for items shaped as
        produced by ``EpisodeDatasetWithCat``.
    """
    x_cont = torch.stack([cont for cont, _, _, _ in batch])
    x_cat = torch.stack([cat for _, cat, _, _ in batch])
    mask = torch.stack([valid for _, _, valid, _ in batch])
    y = torch.stack([target for _, _, _, target in batch])
    return x_cont, x_cat, mask, y

In [None]:
from torch.utils.data import DataLoader

MAX_LEN = 270

# NOTE(review): `encoders` is whatever is currently in scope; nothing in this
# cell rebuilds it with the PAD-reserving build_label_encoders -- confirm it is
# the intended vocabulary before training.
_cat_dataset_kwargs = dict(
    max_len=MAX_LEN,
    cont_cols=CONT_COLS,
    cat_cols=CAT_COLS,
    encoders=encoders,
)

train_dataset = EpisodeDatasetWithCat(df_fe, train_epis, **_cat_dataset_kwargs)
valid_dataset = EpisodeDatasetWithCat(df_fe, valid_epis, **_cat_dataset_kwargs)

# Only the training loader is shuffled.
train_loader, valid_loader = (
    DataLoader(dataset, batch_size=32, shuffle=do_shuffle, collate_fn=collate_fn_cat)
    for dataset, do_shuffle in ((train_dataset, True), (valid_dataset, False))
)

### 10.6.2 Sinusoidal Positional Encoding

In [None]:
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Args:
        d_model: Feature width of the sequence; odd widths are supported.
        max_len: Longest sequence length that can be encoded.
    """

    def __init__(self, d_model, max_len=300):
        super().__init__()
        pe = torch.zeros(max_len, d_model)  # (T, D)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (T,1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the frequencies to match (no-op for even d_model).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Buffer: moves with the module across devices but is not trained.
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, T, D)

    def forward(self, x):
        """Return ``x + pe[:, :T]`` for ``x`` of shape ``(B, T, D)``.

        Raises:
            ValueError: If ``T`` exceeds the ``max_len`` given at construction.
        """
        T = x.size(1)
        if T > self.pe.size(1):
            raise ValueError(
                f"sequence length {T} exceeds positional encoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :T, :]

### 10.6.3 Transformer Encoder + Cat Embedding

In [None]:
class TransformerFinalPassRegressor(nn.Module):
    """Transformer-encoder regressor for the final pass end point.

    Numeric features and per-column categorical embeddings are concatenated,
    projected to ``d_model``, given positional encodings, encoded with padded
    steps masked out, mean-pooled over the valid steps and mapped to
    ``(end_x, end_y)``.

    Args:
        cont_dim: Number of numeric features per time step.
        num_classes_dict: Mapping ``column -> vocabulary size``; its key order
            must match the column order of ``x_cat``.
        d_model, nhead, num_layers, dim_feedforward, dropout: Encoder settings.
        max_len: Longest sequence handled by the positional encoding.
    """

    def __init__(
        self,
        cont_dim,
        num_classes_dict,
        d_model=128,
        nhead=4,
        num_layers=2,
        dim_feedforward=256,
        dropout=0.2,
        max_len=300,
    ):
        super().__init__()

        self.cat_cols = list(num_classes_dict.keys())

        # Embedding width per column: half the vocabulary (rounded up), capped at 16.
        widths = {col: min(16, (n + 1) // 2) for col, n in num_classes_dict.items()}
        self.emb_layers = nn.ModuleDict(
            {col: nn.Embedding(num_classes_dict[col], width) for col, width in widths.items()}
        )
        self.emb_total_dim = sum(widths.values())

        # Project [numeric, embeddings] to the model width.
        self.input_proj = nn.Linear(cont_dim + self.emb_total_dim, d_model)

        self.pos_encoder = PositionalEncoding(d_model, max_len=max_len)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,  # (B, T, D)
            ),
            num_layers=num_layers,
        )

        # Regression head applied to the pooled representation.
        self.fc = nn.Sequential(
            nn.Linear(d_model, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 2),  # end_x, end_y
        )

    def forward(self, x_cont, x_cat, mask):
        """Predict ``(end_x, end_y)`` for each episode.

        Args:
            x_cont: ``(B, T, F_cont)`` numeric features.
            x_cat: ``(B, T, C_cat)`` integer codes, one column per categorical feature.
            mask: ``(B, T)`` with 1 on valid steps and 0 on padding.

        Returns:
            ``(B, 2)`` tensor of predictions.
        """
        embedded = [
            self.emb_layers[col](x_cat[:, :, idx])          # (B, T, emb_dim)
            for idx, col in enumerate(self.cat_cols)
        ]
        x_emb = torch.cat(embedded, dim=-1)                 # (B, T, sum_emb)

        tokens = self.input_proj(torch.cat([x_cont, x_emb], dim=-1))  # (B, T, d_model)
        tokens = self.pos_encoder(tokens)

        # src_key_padding_mask expects True on positions to ignore, i.e. where mask is 0.
        encoded = self.transformer(tokens, src_key_padding_mask=(mask == 0))

        # Mean over valid steps only; the clamp guards against division by zero.
        weights = mask.unsqueeze(-1)                        # (B, T, 1)
        pooled = (encoded * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1e-6)

        return self.fc(pooled)                              # (B, 2)

### 10.6.4 Train Loop

In [None]:
# Train the Transformer baseline and report per-episode mean metrics for every epoch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device =", device)

model = TransformerFinalPassRegressor(
    cont_dim=len(CONT_COLS),
    num_classes_dict=num_classes,
    d_model=128,
    nhead=4,
    num_layers=3,        # 2-3 layers is a reasonable starting point
    dim_feedforward=256,
    dropout=0.2,
    max_len=MAX_LEN + 5, # a little headroom over the padded length
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.MSELoss()

def euclidean(pred, target):
    """Return the mean Euclidean distance between (B, 2) predictions and targets as a float."""
    return torch.sqrt(((pred - target) ** 2).sum(dim=1)).mean().item()

EPOCHS = 50

for epoch in range(EPOCHS):
    # -------- Train --------
    model.train()
    train_loss = 0.0
    n_train = 0

    for x_cont, x_cat, mask, y in train_loader:
        x_cont = x_cont.to(device)
        x_cat  = x_cat.to(device)
        mask   = mask.to(device)
        y      = y.to(device)

        pred = model(x_cont, x_cat, mask)
        loss = criterion(pred, y)

        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping (often helps Transformer training).
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Weight each batch mean by its size so the epoch value is a true mean,
        # not a sum that grows with the number of batches.
        n_batch = y.size(0)
        train_loss += loss.item() * n_batch
        n_train += n_batch

    train_loss /= max(n_train, 1)

    # -------- Valid --------
    model.eval()
    val_loss = 0.0
    val_dist = 0.0
    n_valid = 0

    with torch.no_grad():
        for x_cont, x_cat, mask, y in valid_loader:
            x_cont = x_cont.to(device)
            x_cat  = x_cat.to(device)
            mask   = mask.to(device)
            y      = y.to(device)

            pred = model(x_cont, x_cat, mask)
            loss = criterion(pred, y)

            n_batch = y.size(0)
            val_loss += loss.item() * n_batch
            val_dist += euclidean(pred, y) * n_batch
            n_valid += n_batch

    val_loss /= max(n_valid, 1)
    val_dist /= max(n_valid, 1)  # mean distance per episode

    print(
        f"[Epoch {epoch}] "
        f"TrainLoss={train_loss:.4f} | "
        f"ValidLoss={val_loss:.4f} | "
        f"Euclid={val_dist:.4f}"
    )

torch.save(model.state_dict(), "Data/model_transformer.pt")

print("Saved Transformer model!")