# 1. Import

In [None]:
import warnings
warnings.filterwarnings('ignore')

%cd "/content/drive/MyDrive/데이터 분석/projects/ML_portfolio/10_kleague_final_pass_prediction"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

# 2. Hyperparameter Setting

In [None]:
# Location of the training CSV read by the data-loading cell.
TRAIN_PATH = "Data/train.csv"

# Training hyperparameters. EPOCHS bounds the training loop; BATCH_SIZE, LR and
# HIDDEN_DIM are the intended mini-batch size, learning rate and LSTM width.
BATCH_SIZE = 64
EPOCHS = 30
LR = 1e-3
HIDDEN_DIM = 96

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

# 3. Data Load & Preprocessing

In [None]:
# Simplified event category -> raw event type names that collapse into it.
_EVENT_GROUPS = {
    "Pass": ("Pass", "Pass_Freekick", "Pass_Corner"),
    "Carry": ("Carry",),
    "Duel_Turnover": ("Duel", "Tackle", "Interception", "Recovery"),
    "Cross": ("Cross",),
    "Shot": ("Penalty Kick",),
    "Clearance": ("Clearance", "Aerial Clearance"),
    "GK_Action": ("Catch", "Parry", "Goal Kick", "Keeper Rush-Out"),
    "Deflect_Block": ("Block", "Deflection", "Intervention", "Hit"),
    "SetPiece": ("Throw-In",),
    "Goal_Event": ("Goal", "Own Goal"),
    "Error_Out": ("Error", "Out", "Foul", "Foul_Throw", "Handball_Foul", "Offside"),
}

# Flattened reverse index: raw event type name -> simplified category.
_EVENT_TO_GROUP = {
    raw_name: group
    for group, raw_names in _EVENT_GROUPS.items()
    for raw_name in raw_names
}


def simplify_event(t: str) -> str:
    """Map a raw event type name to its simplified event category.

    Exact names are resolved through the lookup table; any other name that
    starts with "Shot" becomes "Shot", and everything else becomes "Misc".
    """
    group = _EVENT_TO_GROUP.get(t)
    if group is not None:
        return group
    # Shot variants are matched by prefix rather than listed one by one.
    return "Shot" if t.startswith("Shot") else "Misc"

def build_episode_sequence(g: pd.DataFrame, for_train: bool = True):
    """Convert the rows of one game_episode into model inputs.

    Rows are used in the order given; this function does not sort them.

    Args:
        g: Events of a single episode. Reads the columns ``start_x``,
            ``start_y``, ``time_seconds`` and ``event_s``; ``end_x`` and
            ``end_y`` are read only when ``for_train`` is true.
        for_train: When true, also build the regression target from the
            last row of ``g``.

    Returns:
        A ``(feats, event_idx, target)`` tuple:

        * ``feats``: float32 array of shape [T, 12] with the scaled numeric
          features, one row per event.
        * ``event_idx``: int64 array of shape [T] holding ``event2idx`` ids of
          the ``event_s`` column.
        * ``target``: float32 array ``[end_x / 105, end_y / 68]`` taken from the
          last row, or ``None`` when ``for_train`` is false.

        ``(None, None, None)`` is returned when ``g`` has fewer than two rows.

    Raises:
        KeyError: If an ``event_s`` value is not a key of the module-level
            ``event2idx`` mapping.
    """
    # Episodes with fewer than two events are rejected; callers must handle None.
    if len(g) < 2:
        return None, None, None

    # Raw coordinates and timestamps. Only the underlying arrays are read, so
    # no index reset or defensive copy of the frame is needed.
    sx = g["start_x"].values
    sy = g["start_y"].values
    t = g["time_seconds"].values

    # Step-to-step displacement; prepending the first value makes step 0 a zero move.
    dx = np.diff(sx, prepend=sx[0])
    dy = np.diff(sy, prepend=sy[0])
    dist = np.sqrt(dx**2 + dy**2)
    angle = np.arctan2(dy, dx)  # in [-pi, pi]

    # Time gap to the previous event; negative gaps (time going backwards) become 0.
    dt = np.diff(t, prepend=t[0])
    dt[dt < 0] = 0

    # Net displacement from the first event of the episode.
    cum_dx = np.cumsum(dx)
    cum_dy = np.cumsum(dy)
    move_norm = np.sqrt(cum_dx**2 + cum_dy**2)

    # Relative position inside the episode (0 at the first event, 1 at the last).
    # T >= 2 is guaranteed by the guard above, so T - 1 is never zero.
    T = len(g)
    step_idx_norm = np.arange(T) / (T - 1)

    # Relative time in [0, 1]; all zeros when every timestamp is identical.
    t_min, t_max = t.min(), t.max()
    time_rel = (t - t_min) / (t_max - t_min) if t_max > t_min else np.zeros(T)

    # -------------------------
    # Scaling (divisors noted by the author as EDA-based)
    # -------------------------
    sx_norm = sx / 105.0
    sy_norm = sy / 68.0

    dx_norm = dx / 40.0
    dy_norm = dy / 40.0
    dist_norm = dist / 40.0
    angle_norm = angle / np.pi

    # dt: gaps longer than 3 seconds are clipped to 1.
    dt_norm = np.clip(dt / 3.0, 0, 1)

    cum_dx_norm = cum_dx / 60.0
    cum_dy_norm = cum_dy / 60.0
    move_norm_norm = move_norm / 60.0

    # Feature matrix [T, F]; the column order below is what the model is trained on.
    feats = np.stack([
        sx_norm, sy_norm,         # 2
        dx_norm, dy_norm,         # 2
        dist_norm,                # 1
        angle_norm,               # 1
        dt_norm,                  # 1
        cum_dx_norm, cum_dy_norm, # 2
        move_norm_norm,           # 1
        step_idx_norm,            # 1
        time_rel,                 # 1
    ], axis=1).astype("float32")  # 12 features in total

    # Vocabulary lookup for the simplified event names (KeyError on unknown names).
    event_idx = np.array([event2idx[name] for name in g["event_s"]], dtype="int64")

    # Target: end position of the last event, scaled like start_x / start_y.
    target = None
    if for_train:
        ex = g["end_x"].values[-1] / 105.0
        ey = g["end_y"].values[-1] / 68.0
        target = np.array([ex, ey], dtype="float32")

    return feats, event_idx, target

In [None]:
# Read the training events and order them by episode, then by timestamp.
df = (
    pd.read_csv(TRAIN_PATH)
    .sort_values(["game_episode", "time_seconds"])
    .reset_index(drop=True)
)

# Collapse raw event types into the coarse categories from simplify_event.
df["event_s"] = df["type_name"].astype(str).map(simplify_event)

# Integer id per category, assigned in sorted-name order.
event_vocab = sorted(df["event_s"].unique())
event2idx = dict(zip(event_vocab, range(len(event_vocab))))

print("Event vocabulary size:", len(event2idx))
print(event2idx)

In [None]:
# Parallel lists: feature matrix, event ids and target for every usable episode.
episodes, events, targets = [], [], []

for _, episode_df in tqdm(df.groupby("game_episode")):
    seq, ev, tgt = build_episode_sequence(episode_df, for_train=True)
    # build_episode_sequence returns None for episodes it rejects; drop those.
    if seq is not None:
        episodes.append(seq)
        events.append(ev)
        targets.append(tgt)

print("Episode count:", len(episodes))
print("Example shape:", episodes[0].shape)



```
100%|██████████| 15435/15435 [00:09<00:00, 1553.36it/s]

Episode count: 15428
Example shape: (49, 12)
```



# 4. Custom Dataset / DataLoader 정의 및 Validation 분할

In [None]:
class EpisodeDataset(Dataset):
    """Dataset that serves one variable-length episode per index.

    The three constructor arguments are parallel sequences: element ``i`` of
    each belongs to the same episode.
    """

    def __init__(self, episodes, events, targets):
        self.episodes, self.events, self.targets = episodes, events, targets

    def __len__(self):
        return len(self.episodes)

    def __getitem__(self, idx):
        """Return ``(features, event_ids, length, target)`` for episode ``idx``.

        ``length`` is the size of the first dimension of ``features``.
        """
        # Shapes noted by the author: features [T, 12], event ids [T], target [2].
        features = torch.tensor(self.episodes[idx])
        event_ids = torch.tensor(self.events[idx])
        target = torch.tensor(self.targets[idx])
        return features, event_ids, features.size(0), target

def collate_fn(batch):
    """Merge ``(seq, ev, length, target)`` samples into one padded batch.

    Returns ``(padded_seq, padded_ev, lengths, targets)``: sequences and event
    ids are zero-padded along the time axis to the longest sample in the batch,
    ``lengths`` is a long tensor of the unpadded lengths and ``targets`` stacks
    the per-sample targets along a new first dimension.
    """
    features, event_ids, sizes, labels = map(list, zip(*batch))

    return (
        pad_sequence(features, batch_first=True),                    # [B, T, F]
        pad_sequence(event_ids, batch_first=True, padding_value=0),  # [B, T]
        torch.tensor(sizes, dtype=torch.long),
        torch.stack(labels, dim=0),
    )

# Episode-level train / validation split (80 / 20, fixed seed for reproducibility).
idx_train, idx_valid = train_test_split(
    np.arange(len(episodes)), test_size=0.2, random_state=42
)

episodes_train = [episodes[i] for i in idx_train]
events_train   = [events[i]   for i in idx_train]
targets_train  = [targets[i]  for i in idx_train]

episodes_valid = [episodes[i] for i in idx_valid]
events_valid   = [events[i]   for i in idx_valid]
targets_valid  = [targets[i]  for i in idx_valid]

# Both loaders take their batch size from the BATCH_SIZE hyperparameter so that
# changing it in the settings cell actually takes effect.
train_loader = DataLoader(
    EpisodeDataset(episodes_train, events_train, targets_train),
    batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

# Validation order is kept fixed (no shuffling).
valid_loader = DataLoader(
    EpisodeDataset(episodes_valid, events_valid, targets_valid),
    batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)

print("train episodes:", len(episodes_train), "valid episodes:", len(episodes_valid))



```
train episodes: 12342 valid episodes: 3086
```



# 5. LSTM 베이스라인 + 이동량 모델 정의

In [None]:
class LSTMWithEventEmbedding(nn.Module):
    """Single-layer LSTM regressor over numeric features plus an event embedding.

    Each time step concatenates the numeric feature vector with a learned
    embedding of the event id; the final hidden state of the LSTM is projected
    to two outputs.

    Args:
        num_feats: Size of the numeric feature vector per time step.
        hidden_dim: LSTM hidden size.
        emb_dim: Size of the event embedding.
        num_events: Number of distinct event ids. When ``None`` (the default)
            the size of the module-level ``event2idx`` vocabulary is used.
    """

    def __init__(self, num_feats=12, hidden_dim=96, emb_dim=6, num_events=None):
        super().__init__()

        # Fall back to the notebook-level vocabulary only when no size is given,
        # so the model can also be built without that global.
        if num_events is None:
            num_events = len(event2idx)

        self.event_emb = nn.Embedding(num_events, emb_dim)

        input_dim = num_feats + emb_dim   # 12 + 6 = 18 with the defaults

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True
        )

        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, seq, ev, lengths):
        """Predict two values per sequence.

        Args:
            seq: Padded numeric features, [B, T, num_feats].
            ev: Padded event ids, [B, T].
            lengths: Unpadded length of every sequence, [B].

        Returns:
            Tensor of shape [B, 2].
        """
        ev_e = self.event_emb(ev)               # [B, T, emb_dim]
        x = torch.cat([seq, ev_e], dim=2)       # [B, T, num_feats + emb_dim]

        # Packing makes the LSTM stop at each sequence's true length, so padded
        # steps never reach the final hidden state. Lengths are moved to the CPU
        # before packing; the batch does not need to be sorted by length.
        packed = pack_padded_sequence(
            x, lengths.cpu(),
            batch_first=True,
            enforce_sorted=False
        )
        _, (h, _) = self.lstm(packed)
        h = h[-1]                               # [B, hidden_dim]

        return self.fc(h)

# 6. 모델 학습 및 검증

In [None]:
import copy  # used to snapshot the best weights; see the note at the bottom of the loop

# Model, optimizer and loss are built from the hyperparameters of the settings cell.
model = LSTMWithEventEmbedding(hidden_dim=HIDDEN_DIM).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.MSELoss()

best_dist = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    # --- Train ---
    model.train()
    total_loss = 0.0

    for X, EV, L, y in tqdm(train_loader):
        # Lengths stay on the CPU: the model moves them there before packing anyway.
        X, EV, y = X.to(DEVICE), EV.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad()
        pred = model(X, EV, L)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch loss is a per-sample mean.
        total_loss += loss.item() * X.size(0)

    train_loss = total_loss / len(train_loader.dataset)

    # --- Valid: mean Euclidean distance in pitch units ---
    model.eval()
    dists = []

    with torch.no_grad():
        for X, EV, L, y in tqdm(valid_loader):
            X, EV, y = X.to(DEVICE), EV.to(DEVICE), y.to(DEVICE)

            pred = model(X, EV, L)

            pred_np = pred.cpu().numpy()
            true_np = y.cpu().numpy()

            # Undo the target scaling (x / 105, y / 68) before measuring distance.
            px = pred_np[:,0] * 105
            py = pred_np[:,1] * 68
            tx = true_np[:,0] * 105
            ty = true_np[:,1] * 68

            dist = np.sqrt((px - tx)**2 + (py - ty)**2)
            dists.append(dist)

    mean_dist = np.concatenate(dists).mean()

    print(f"[Epoch {epoch}] train={train_loss:.4f} | valid_dist={mean_dist:.4f}")

    if mean_dist < best_dist:
        best_dist = mean_dist
        # state_dict() returns references to the live parameter tensors, which
        # later optimizer steps keep overwriting. Deep-copy so best_state really
        # holds the weights of the best epoch and not those of the last one.
        best_state = copy.deepcopy(model.state_dict())
        print(f" → Best updated: {best_dist:.4f}")



```
# baseline
[Epoch 18] train_loss=0.0304 | valid_mean_dist=16.6270
 --> Best model updated! (dist=16.6270)

# hidden_dim=96으로 수정 후
[Epoch 27] train_loss=0.0282 | valid_mean_dist=16.3134
 --> Best model updated! (dist=16.3134)

# hidden_dim=96으로 수정 + 컬럼 스케일링 범위 조정
[Epoch 28] train_loss=0.0287 | valid_mean_dist=16.2878
 --> Best model updated! (dist=16.2878)

# Event Embedding 추가한 후
[Epoch 30] train_loss=0.0267 | valid_mean_dist=15.8609
 --> Best model updated! (dist=15.8609)
```



# 7. 평가 데이터셋 추론

In [None]:
# ---------------------------------------------------------
# 0. Setup: restore the saved best weights and switch to eval mode
# ---------------------------------------------------------
model.load_state_dict(best_state)
model.eval()

# ---------------------------------------------------------
# 1. Test metadata, left-joined onto the sample submission by game_episode
# ---------------------------------------------------------
test_meta = pd.read_csv("Data/test.csv")
submission = pd.read_csv("Data/sample_submission.csv").merge(
    test_meta, on="game_episode", how="left"
)

# ---------------------------------------------------------
# 2. Cache every test file up front (the author reports a 10-20x speed-up)
# ---------------------------------------------------------
def load_all_test_files(test_meta, base_dir="Data"):
    """Read each distinct test episode file once and cache it.

    Args:
        test_meta: Frame with a ``path`` column of file paths relative to
            ``base_dir``. The paths presumably look like ``./sub/file.csv``
            (TODO confirm against test.csv); a leading ``./`` or ``/`` is
            stripped before joining.
        base_dir: Directory the relative paths are resolved against.

    Returns:
        Dict mapping the original ``path`` value to the loaded DataFrame.
    """
    import os  # local import so the function does not depend on another cell

    cache = {}
    for path in tqdm(test_meta["path"].unique(), desc="Loading test files"):
        # Strip an explicit "./" prefix and any leading "/" so the join below
        # always stays inside base_dir, e.g. "./a/b.csv" -> "Data/a/b.csv".
        relative = path[2:] if path.startswith("./") else path
        full_path = os.path.join(base_dir, relative.lstrip("/"))
        cache[path] = pd.read_csv(full_path)
    return cache

file_cache = load_all_test_files(test_meta)

# ---------------------------------------------------------
# 3. Inference: one forward pass per submission row
# ---------------------------------------------------------
preds_x, preds_y = [], []

with torch.no_grad():
    for _, row in tqdm(submission.iterrows(), total=len(submission), desc="Inference"):

        # Copy the cached frame so adding a column never touches the cache.
        g = file_cache[row["path"]].copy()

        # Same event simplification as applied to the training data.
        g["event_s"] = g["type_name"].astype(str).apply(simplify_event)

        # Features only; no target is built for test episodes.
        seq, ev, _ = build_episode_sequence(g, for_train=False)

        if seq is not None:
            # Add a batch dimension of 1 around the single episode.
            X = torch.tensor(seq, dtype=torch.float32).unsqueeze(0).to(DEVICE)   # [1, T, 12]
            EV = torch.tensor(ev, dtype=torch.long).unsqueeze(0).to(DEVICE)      # [1, T]
            L  = torch.tensor([seq.shape[0]], dtype=torch.long).to(DEVICE)       # [1]

            pred_norm = model(X, EV, L)[0].cpu().numpy()
        else:
            # Fallback for episodes the feature builder rejects: predict the
            # scaled start position of the last event.
            pred_norm = np.array(
                [
                    g["start_x"].values[-1] / 105.0,
                    g["start_y"].values[-1] / 68.0,
                ],
                dtype="float32",
            )

        # Scaled coordinates -> pitch coordinates.
        preds_x.append(pred_norm[0] * 105.0)
        preds_y.append(pred_norm[1] * 68.0)



```
Loading test files: 100%|██████████| 2414/2414 [08:54<00:00,  4.51it/s]
Inference: 100%|██████████| 2414/2414 [00:05<00:00, 470.82it/s]
```



# 8. 제출 Submission 생성

In [None]:
save_path = "Data/step2_submit.csv"

# Attach the predictions and write only the columns kept in the output file.
submission["end_x"], submission["end_y"] = preds_x, preds_y
output_columns = ["game_episode", "end_x", "end_y"]
submission.loc[:, output_columns].to_csv(save_path, index=False)

print("Saved:", save_path)