# 1. Import

In [None]:
import warnings
warnings.filterwarnings('ignore')

%cd "/content/drive/MyDrive/데이터 분석/projects/ML_portfolio/10_kleague_final_pass_prediction"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

# 2. Hyperparameter Setting

In [None]:
# Run configuration shared by the cells below.
TRAIN_PATH = "Data/train.csv"  # training CSV, relative to the working directory
BATCH_SIZE = 64                # mini-batch size
EPOCHS = 50                    # number of training epochs
LR = 1e-3                      # optimizer learning rate
HIDDEN_DIM = 96                # LSTM hidden size
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# 3. Data Load & Preprocessing

In [None]:
# event 단순화 매핑 함수
# Coarse event group -> raw event names that map to it by exact match.
_EVENT_GROUPS = {
    "Pass": ("Pass", "Pass_Freekick", "Pass_Corner"),
    "Carry": ("Carry",),
    "Duel_Turnover": ("Duel", "Tackle", "Interception", "Recovery"),
    "Cross": ("Cross",),
    "Shot": ("Penalty Kick",),
    "Clearance": ("Clearance", "Aerial Clearance"),
    "GK_Action": ("Catch", "Parry", "Goal Kick", "Keeper Rush-Out"),
    "Deflect_Block": ("Block", "Deflection", "Intervention", "Hit"),
    "SetPiece": ("Throw-In",),
    "Goal_Event": ("Goal", "Own Goal"),
    "Error_Out": (
        "Error", "Out", "Foul", "Foul_Throw", "Handball_Foul", "Offside",
    ),
}

# Inverted view used for the actual lookup: raw event name -> coarse group.
_EVENT_LOOKUP = {
    raw_name: group
    for group, raw_names in _EVENT_GROUPS.items()
    for raw_name in raw_names
}


def simplify_event(t: str) -> str:
    """Collapse a raw event type name into a coarse event group.

    Exact names are resolved through the lookup table. Any other name that
    starts with "Shot" is grouped as "Shot"; everything else is "Misc".
    """
    group = _EVENT_LOOKUP.get(t)
    if group is not None:
        return group
    return "Shot" if t.startswith("Shot") else "Misc"

# 결과 단순화 함수
def simplify_result(result_name):
    """Reduce a raw result name to "Success", "Fail" or "None".

    Names that belong to neither outcome group (including missing values
    that were stringified upstream) are reported as the string "None".
    """
    outcome_groups = (
        ("Success", ("Successful", "On Target", "Goal")),
        ("Fail", ("Unsuccessful", "Off Target", "Blocked")),
    )
    for label, raw_names in outcome_groups:
        if result_name in raw_names:
            return label
    return "None"

def _encode_labels(labels, vocab, fallback):
    """Map category labels to int64 ids, tolerating labels unseen in `vocab`.

    A label missing from `vocab` is encoded with the id of `fallback`.
    KeyError is raised only when the label is unknown and `fallback` is not
    in the vocabulary either.
    """
    default = vocab.get(fallback)
    codes = []
    for label in labels:
        code = vocab.get(label, default)
        if code is None:
            raise KeyError(
                f"label {label!r} is not in the vocabulary and the "
                f"fallback {fallback!r} is not available"
            )
        codes.append(code)
    return np.asarray(codes, dtype="int64")


def build_episode_sequence(g: pd.DataFrame, for_train: bool = True):
    """Turn the rows of one game_episode into model inputs.

    Args:
        g: events of a single episode, in playing order. Reads the columns
            start_x, start_y, time_seconds; event_s / result_s when present
            (otherwise type_name / result_name are simplified on the fly);
            and end_x / end_y when `for_train` is true.
        for_train: when true, also build the regression target from the
            last row's end coordinates.

    Returns:
        A 4-tuple `(feats, event_idx, result_idx, target)`:
        - feats: float32 array [T, 12] of numeric per-step features
        - event_idx: int64 array [T] of ids from the global `event2idx`
        - result_idx: int64 array [T] of ids from the global `result2idx`
        - target: float32 array [2] with (end_x / 105, end_y / 68) of the
          last row, or None when `for_train` is false
        Episodes with fewer than two rows yield `(None, None, None, None)`.

    Raises:
        KeyError: a label is missing from the vocabulary and the fallback
            bucket ("Misc" for events, "None" for results) is missing too.
    """
    g = g.reset_index(drop=True).copy()
    n_steps = len(g)
    if n_steps < 2:
        return None, None, None, None

    # Raw coordinates and timestamps
    sx = g["start_x"].values
    sy = g["start_y"].values
    t = g["time_seconds"].values

    # Step-to-step displacement (the first step has zero displacement)
    dx = np.diff(sx, prepend=sx[0])
    dy = np.diff(sy, prepend=sy[0])
    dist = np.sqrt(dx**2 + dy**2)
    angle = np.arctan2(dy, dx)  # -pi ~ pi

    # Time deltas; negative gaps are zeroed to guard against reversed clocks
    dt = np.diff(t, prepend=t[0])
    dt[dt < 0] = 0

    # Cumulative displacement from the first event of the episode
    cum_dx = np.cumsum(dx)
    cum_dy = np.cumsum(dy)
    move_norm = np.sqrt(cum_dx**2 + cum_dy**2)

    # Relative position inside the episode; n_steps >= 2 is guaranteed by
    # the guard above, so the division is always defined.
    step_idx_norm = np.arange(n_steps) / (n_steps - 1)

    # Relative time in [0, 1]; all zeros when every timestamp is identical
    t_min, t_max = t.min(), t.max()
    if t_max > t_min:
        time_rel = (t - t_min) / (t_max - t_min)
    else:
        time_rel = np.zeros(n_steps)

    # Normalisation constants come from the author's EDA. 105 x 68 is
    # presumably the pitch size in metres -- TODO confirm against the data.
    sx_norm = sx / 105.0
    sy_norm = sy / 68.0

    dx_norm = dx / 40.0
    dy_norm = dy / 40.0
    dist_norm = dist / 40.0
    angle_norm = angle / np.pi

    # Gaps longer than 3 seconds saturate at 1
    dt_norm = np.clip(dt / 3.0, 0, 1)

    cum_dx_norm = cum_dx / 60.0
    cum_dy_norm = cum_dy / 60.0
    move_norm_norm = move_norm / 60.0

    # Feature matrix [T, 12]
    feats = np.stack([
        sx_norm, sy_norm,         # 2
        dx_norm, dy_norm,         # 2
        dist_norm,                # 1
        angle_norm,               # 1
        dt_norm,                  # 1
        cum_dx_norm, cum_dy_norm, # 2
        move_norm_norm,           # 1
        step_idx_norm,            # 1
        time_rel                  # 1
    ], axis=1).astype("float32")

    # Categorical inputs. Labels that never occurred in the training data
    # (e.g. at inference time) fall back to the catch-all bucket instead of
    # aborting the whole run with a KeyError.
    if "event_s" in g.columns:
        event_labels = g["event_s"]
    else:
        event_labels = g["type_name"].astype(str).apply(simplify_event)
    event_idx = _encode_labels(event_labels, event2idx, fallback="Misc")

    if "result_s" in g.columns:
        result_labels = g["result_s"]
    else:
        result_labels = g["result_name"].astype(str).apply(simplify_result)
    result_idx = _encode_labels(result_labels, result2idx, fallback="None")

    # Target: normalised end position of the last event
    target = None
    if for_train:
        ex = g["end_x"].values[-1] / 105.0
        ey = g["end_y"].values[-1] / 68.0
        target = np.array([ex, ey], dtype="float32")

    return feats, event_idx, result_idx, target

In [None]:
# Load the training events and order them by time inside each episode.
df = (
    pd.read_csv(TRAIN_PATH)
    .sort_values(["game_episode", "time_seconds"])
    .reset_index(drop=True)
)

# Coarse event / result labels used as categorical model inputs.
df["event_s"] = df["type_name"].astype(str).map(simplify_event)
df["result_s"] = df["result_name"].astype(str).map(simplify_result)

# Vocabularies: sorted distinct labels -> contiguous integer ids.
event_vocab = sorted(df["event_s"].unique())
event2idx = dict(zip(event_vocab, range(len(event_vocab))))

result_vocab = sorted(df["result_s"].unique())
result2idx = dict(zip(result_vocab, range(len(result_vocab))))

print("Event vocabulary size:", len(event2idx), event2idx)
print("Result vocabulary size:", len(result2idx), result2idx)



```
Event vocabulary size: 11 {'Carry': 0, 'Clearance': 1, 'Cross': 2, 'Deflect_Block': 3, 'Duel_Turnover': 4, 'Error_Out': 5, 'GK_Action': 6, 'Misc': 7, 'Pass': 8, 'SetPiece': 9, 'Shot': 10}
Result vocabulary size: 3 {'Fail': 0, 'None': 1, 'Success': 2}
```



In [None]:
episodes, events, results, targets = [], [], [], []

# One (features, event ids, result ids, target) tuple per episode; episodes
# for which build_episode_sequence returns None features are skipped.
for _, episode_df in tqdm(df.groupby("game_episode")):
    built = build_episode_sequence(episode_df)
    if built[0] is None:
        continue
    for bucket, item in zip((episodes, events, results, targets), built):
        bucket.append(item)

print("Episode count:", len(episodes))
print("Example episode shape:", episodes[0].shape)
print("Example event len:", len(events[0]), "result len:", len(results[0]))



```
100%|██████████| 15435/15435 [00:13<00:00, 1107.46it/s]

Episode count: 15428
Example episode shape: (49, 12)
Example event len: 49 result len: 49
```



# 4. Custom Dataset / DataLoader 정의 및 Validation 분할

In [None]:
class EpisodeDataset(torch.utils.data.Dataset):
    """Dataset over parallel per-episode lists.

    `seqs[i]`, `evs[i]`, `rss[i]` and `tgts[i]` all describe episode `i`:
    numeric features [T, 12], event ids [T], result ids [T] and the
    regression target [2].
    """

    def __init__(self, seqs, evs, rss, tgts):
        self.seqs, self.evs, self.rss, self.tgts = seqs, evs, rss, tgts

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        """Return `(features, event_ids, result_ids, length, target)`.

        `length` is the number of time steps of the feature tensor; the
        collate function needs it to pack the padded batch.
        """
        features, event_ids, result_ids, target = (
            torch.tensor(column[idx])
            for column in (self.seqs, self.evs, self.rss, self.tgts)
        )
        return features, event_ids, result_ids, features.size(0), target

def collate_fn(batch):
    """Merge variable-length episode samples into one padded batch.

    Each sample is `(features, event_ids, result_ids, length, target)`.
    Sequences are right-padded with zeros to the longest episode in the
    batch; the true lengths are returned so padding can be ignored later.

    Returns:
        `(padded_features, padded_event_ids, padded_result_ids, lengths,
        targets)` with the batch dimension first.
    """
    seqs, evs, rss, lengths, tgts = (list(column) for column in zip(*batch))

    return (
        pad_sequence(seqs, batch_first=True),
        pad_sequence(evs, batch_first=True, padding_value=0),
        pad_sequence(rss, batch_first=True, padding_value=0),
        torch.tensor(lengths),
        torch.stack(tgts),
    )

# Hold out 20% of the episodes for validation (fixed seed for repeatability).
idx_train, idx_valid = train_test_split(
    np.arange(len(episodes)), test_size=0.2, random_state=42
)


def _make_loader(indices, shuffle):
    """Build a DataLoader over the episodes selected by `indices`.

    The batch size comes from the BATCH_SIZE hyperparameter so that the
    training and validation loaders cannot drift apart.
    """
    dataset = EpisodeDataset(
        [episodes[i] for i in indices],
        [events[i] for i in indices],
        [results[i] for i in indices],
        [targets[i] for i in indices],
    )
    return DataLoader(
        dataset, batch_size=BATCH_SIZE, shuffle=shuffle, collate_fn=collate_fn
    )


train_loader = _make_loader(idx_train, shuffle=True)
valid_loader = _make_loader(idx_valid, shuffle=False)

print("train episodes:", len(idx_train), "valid episodes:", len(idx_valid))



```
train episodes: 12342 valid episodes: 3086
```



# 5. LSTM 베이스라인 + 이동량 모델 정의

In [None]:
class FinalPassLSTMWithLastK(nn.Module):
    """LSTM regressor combining the final hidden state with the last k steps.

    Per-step inputs are the numeric features concatenated with learned
    event and result embeddings. Two summaries of the LSTM output feed the
    linear head: the last layer's final hidden state, and an MLP over the
    LSTM outputs of the last `k_last` valid time steps.

    Args:
        num_feats: number of numeric features per step.
        event_emb_dim: size of the event embedding.
        result_emb_dim: size of the result embedding.
        hidden_dim: LSTM hidden size.
        k_last: number of trailing time steps fed to the last-k MLP.
        num_layers: number of stacked LSTM layers.
        num_events: event vocabulary size; defaults to `len(event2idx)`.
        num_results: result vocabulary size; defaults to `len(result2idx)`.
    """

    def __init__(
        self,
        num_feats=12,
        event_emb_dim=6,
        result_emb_dim=3,
        hidden_dim=96,
        k_last=3,
        num_layers=1,
        num_events=None,
        num_results=None,
    ):
        super().__init__()

        # Fall back to the notebook-level vocabularies when sizes are not
        # given explicitly (keeps `FinalPassLSTMWithLastK()` working).
        if num_events is None:
            num_events = len(event2idx)
        if num_results is None:
            num_results = len(result2idx)

        self.event_emb = nn.Embedding(num_events, event_emb_dim)
        self.result_emb = nn.Embedding(num_results, result_emb_dim)

        input_dim = num_feats + event_emb_dim + result_emb_dim

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

        self.k_last = k_last
        self.lastk_mlp = nn.Sequential(
            nn.Linear(hidden_dim * k_last, hidden_dim),
            nn.ReLU(),
        )

        self.fc = nn.Linear(hidden_dim * 2, 2)

    def forward(self, seq, ev, rs, lengths):
        """Predict the (x, y) output for every episode in the batch.

        Args:
            seq: padded numeric features [B, T, num_feats].
            ev: padded event ids [B, T].
            rs: padded result ids [B, T].
            lengths: true sequence length of each episode [B].

        Returns:
            Tensor [B, 2].
        """
        ev_e = self.event_emb(ev)
        rs_e = self.result_emb(rs)
        x = torch.cat([seq, ev_e, rs_e], dim=-1)   # [B, T, D]

        # Packing makes the LSTM skip the padded time steps.
        packed = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, (h_n, _) = self.lstm(packed)
        out_padded, _ = pad_packed_sequence(packed_out, batch_first=True)

        h_context = h_n[-1]  # [B, H]

        batch_size, _, hidden = out_padded.size()

        # Time indices of the last k valid steps of every episode, computed
        # for the whole batch at once (no per-sample Python loop / .item()).
        offsets = torch.arange(-self.k_last, 0, device=out_padded.device)
        positions = lengths.to(out_padded.device).unsqueeze(1) + offsets  # [B, k]

        # Episodes shorter than k have negative positions at the front;
        # those slots are zero-filled, i.e. the window is left-padded.
        missing = positions < 0
        gather_idx = (
            positions.clamp(min=0).unsqueeze(-1).expand(-1, -1, hidden)
        )
        lastk = out_padded.gather(1, gather_idx)                 # [B, k, H]
        lastk = lastk.masked_fill(missing.unsqueeze(-1), 0.0)

        h_lastk = self.lastk_mlp(lastk.reshape(batch_size, -1))

        h = torch.cat([h_context, h_lastk], dim=1)
        out = self.fc(h)

        return out

# 6. 모델 학습 및 검증

In [None]:
# Model, optimizer and loss. The hyperparameter constants are used here so
# that editing them at the top of the notebook actually changes the run.
model = FinalPassLSTMWithLastK(hidden_dim=HIDDEN_DIM).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.MSELoss()

best_dist = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    # --- Train: MSE on the normalised coordinates ---
    model.train()
    total_loss = 0

    for seq, ev, rs, lengths, tgt in train_loader:
        seq, ev, rs, lengths, tgt = (
            seq.to(DEVICE), ev.to(DEVICE), rs.to(DEVICE),
            lengths.to(DEVICE), tgt.to(DEVICE)
        )

        optimizer.zero_grad()
        pred = model(seq, ev, rs, lengths)
        loss = criterion(pred, tgt)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch average is per sample.
        total_loss += loss.item() * seq.size(0)

    train_loss = total_loss / len(train_loader.dataset)

    # --- Valid: mean Euclidean distance after undoing the normalisation ---
    model.eval()
    dists = []

    with torch.no_grad():
        for seq, ev, rs, lengths, tgt in valid_loader:
            seq, ev, rs, lengths, tgt = (
                seq.to(DEVICE), ev.to(DEVICE), rs.to(DEVICE),
                lengths.to(DEVICE), tgt.to(DEVICE)
            )

            pred = model(seq, ev, rs, lengths)

            pred_np = pred.cpu().numpy()
            tgt_np = tgt.cpu().numpy()

            # Back to the original coordinate scale (x * 105, y * 68).
            px = pred_np[:, 0] * 105
            py = pred_np[:, 1] * 68
            tx = tgt_np[:, 0] * 105
            ty = tgt_np[:, 1] * 68

            dist = np.sqrt((px - tx)**2 + (py - ty)**2)
            dists.append(dist)

    mean_dist = np.concatenate(dists).mean()

    print(f"[Epoch {epoch}] train_loss={train_loss:.4f} | valid_mean_dist={mean_dist:.4f}")

    if mean_dist < best_dist:
        best_dist = mean_dist
        # state_dict() returns references to the live parameter tensors, so
        # they must be cloned; otherwise later optimizer steps overwrite the
        # "best" snapshot and reloading it restores the last epoch instead.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"--> Best updated: {best_dist:.4f}")



```
# baseline
[Epoch 18] train_loss=0.0304 | valid_mean_dist=16.6270
 --> Best model updated! (dist=16.6270)

# hidden_dim=96으로 수정 후
[Epoch 27] train_loss=0.0282 | valid_mean_dist=16.3134
 --> Best model updated! (dist=16.3134)

# hidden_dim=96으로 수정 + 컬럼 스케일링 범위 조정
[Epoch 28] train_loss=0.0287 | valid_mean_dist=16.2878
 --> Best model updated! (dist=16.2878)

# Event Embedding 추가한 후
[Epoch 30] train_loss=0.0267 | valid_mean_dist=15.8609
 --> Best model updated! (dist=15.8609)

# Result Embedding 추가한 후
[Epoch 26] train_loss=0.0262 | valid_mean_dist=15.6801
--> Best updated: 15.6801

# Last-k Event 추가한 후
[Epoch 19] train_loss=0.0263 | valid_mean_dist=15.6266
--> Best updated: 15.6266
```



# 7. 평가 데이터셋 추론

In [None]:
# ---------------------------------------------------------
# 0. Setup: restore the weights stored in best_state
# ---------------------------------------------------------
model.load_state_dict(best_state)
model.eval()

# ---------------------------------------------------------
# 1. Load the test metadata
# ---------------------------------------------------------
test_meta = pd.read_csv("Data/test.csv")
# Left-join so every row of the sample submission keeps its place and gains
# the metadata columns of its episode.
submission = pd.read_csv("Data/sample_submission.csv").merge(
    test_meta, on="game_episode", how="left"
)

# ---------------------------------------------------------
# 2. test 파일 전체 캐싱 — 속도 10~20배 빨라짐
# ---------------------------------------------------------
def load_all_test_files(test_meta, base_dir="Data"):
    """Read every distinct test episode file once and cache it in memory.

    Args:
        test_meta: DataFrame with a "path" column naming the episode files.
        base_dir: prefix put in front of each path after its first
            character has been dropped.

    Returns:
        dict mapping the original "path" value to the loaded DataFrame.
    """
    # NOTE(review): the first character of each path is discarded and the
    # remainder is appended directly to base_dir, so paths presumably look
    # like "./sub/file.csv" -- confirm against the contents of test.csv.
    return {
        path: pd.read_csv(base_dir + path[1:])
        for path in tqdm(test_meta["path"].unique(), desc="Loading test files")
    }

file_cache = load_all_test_files(test_meta)

# ---------------------------------------------------------
# 3. Inference: one forward pass per submission row
# ---------------------------------------------------------
preds_x, preds_y = [], []

with torch.no_grad():
    for _, row in tqdm(submission.iterrows(), total=len(submission), desc="Inference"):
        # Copy so the cached frame is not modified by the new columns.
        g = file_cache[row["path"]].copy()

        # Simplified event / result label columns
        g["event_s"] = g["type_name"].astype(str).apply(simplify_event)
        g["result_s"] = g["result_name"].astype(str).apply(simplify_result)

        seq, ev, rs, _ = build_episode_sequence(g, for_train=False)

        if seq is None:
            # build_episode_sequence rejects episodes with fewer than two
            # rows, and torch.tensor(None) would abort the whole run. Every
            # submission row still needs a value, so fall back to the last
            # known start position, or the centre of the 105 x 68
            # coordinate range when the file has no rows at all.
            if len(g) > 0:
                preds_x.append(float(g["start_x"].iloc[-1]))
                preds_y.append(float(g["start_y"].iloc[-1]))
            else:
                preds_x.append(105 / 2)
                preds_y.append(68 / 2)
            continue

        # Add the batch dimension (batch of one episode).
        seq = torch.tensor(seq).unsqueeze(0).to(DEVICE)
        ev = torch.tensor(ev).unsqueeze(0).to(DEVICE)
        rs = torch.tensor(rs).unsqueeze(0).to(DEVICE)
        length = torch.tensor([seq.shape[1]]).to(DEVICE)

        pred = model(seq, ev, rs, length)[0].cpu().numpy()

        # Undo the target normalisation.
        preds_x.append(pred[0] * 105)
        preds_y.append(pred[1] * 68)



```
Loading test files: 100%|██████████| 2414/2414 [08:54<00:00,  4.51it/s]
Inference: 100%|██████████| 2414/2414 [00:05<00:00, 470.82it/s]
```



# 8. 제출 Submission 생성

In [None]:
# Attach the predictions and write only the three output columns.
out_path = "Data/step4_submit.csv"

submission["end_x"] = preds_x
submission["end_y"] = preds_y
submission.loc[:, ["game_episode", "end_x", "end_y"]].to_csv(out_path, index=False)

print(f"Saved: {out_path}")