# 1. Import

In [None]:
# Silence every Python warning for the rest of the session. This keeps cell
# output short, but it also hides deprecation warnings from the libraries used.
import warnings
warnings.filterwarnings('ignore')

# Switch the kernel's working directory to the project folder; the relative
# "Data/..." paths used by later cells resolve against it.
# NOTE(review): absolute Google Drive path — presumably Drive has been mounted
# at /content/drive beforehand; confirm before running on a fresh runtime.
%cd "/content/drive/MyDrive/데이터 분석/projects/ML_portfolio/10_kleague_final_pass_prediction"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

# 2. Hyperparameter Setting

In [None]:
# Paths and training hyperparameters shared by the cells below.
TRAIN_PATH = "Data/train.csv"  # training CSV, relative to the working directory
BATCH_SIZE = 64                # episodes per mini-batch
EPOCHS = 30                    # number of passes over the training split
LR = 1e-3                      # learning rate handed to the optimizer
HIDDEN_DIM = 96                # LSTM hidden-state size
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the global RNGs so weight initialisation and DataLoader shuffling are
# repeatable across "Restart & Run All".
# NOTE(review): some CUDA kernels may still be non-deterministic; this makes
# runs repeatable on CPU and much closer to repeatable on GPU.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Using device:", DEVICE)

# 3. Data Load & Preprocessing

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Read the training events and order them by episode, then by time within each
# episode; the index is renumbered from 0 afterwards.
df = (
    pd.read_csv(TRAIN_PATH)
    .sort_values(["game_episode", "time_seconds"])
    .reset_index(drop=True)
)

def build_episode_sequence(g: pd.DataFrame, for_train: bool = True):
    """
    Turn the rows of one game_episode into a numeric feature sequence.

    The rows are put in chronological order here (stable sort on
    ``time_seconds``), so the result does not depend on the row order of the
    frame the caller passes in. Input that is already time-sorted is left in
    exactly the same order.

    Parameters
    ----------
    g : pd.DataFrame
        Rows of a single episode. Reads ``start_x``, ``start_y`` and
        ``time_seconds``; also ``end_x`` / ``end_y`` when ``for_train`` is True.
    for_train : bool, default True
        When True, also build the regression target from the last row.

    Returns
    -------
    feats : np.ndarray or None
        float32 array of shape [T, 12]. Columns, in order: start_x/105,
        start_y/68, dx/40, dy/40, step distance/40, step angle/pi,
        clip(dt/3, 0, 1), cumulative dx/60, cumulative dy/60, net
        displacement/60, step index in [0, 1], relative time in [0, 1].
    target : np.ndarray or None
        float32 array [2] holding the last row's (end_x/105, end_y/68);
        None when ``for_train`` is False.

    Both values are None when the episode has fewer than 2 rows.
    """
    if len(g) < 2:
        return None, None

    # mergesort is stable: ties in time_seconds keep their incoming order.
    g = g.sort_values("time_seconds", kind="mergesort").reset_index(drop=True)

    # Raw coordinates and timestamps.
    sx = g["start_x"].values
    sy = g["start_y"].values
    t = g["time_seconds"].values

    # Per-step displacement; prepending the first value makes step 0 a zero move.
    dx = np.diff(sx, prepend=sx[0])
    dy = np.diff(sy, prepend=sy[0])
    dist = np.sqrt(dx**2 + dy**2)
    angle = np.arctan2(dy, dx)  # in [-pi, pi]

    # Time gap to the previous row; any negative gap is removed by the clip below.
    dt = np.diff(t, prepend=t[0])

    # Cumulative displacement from the first row, and its magnitude.
    cum_dx = np.cumsum(dx)
    cum_dy = np.cumsum(dy)
    move_norm = np.sqrt(cum_dx**2 + cum_dy**2)

    # Position of each row inside the episode, scaled to [0, 1].
    # T >= 2 is guaranteed by the guard above, so T - 1 is never zero.
    T = len(g)
    step_idx_norm = np.arange(T) / (T - 1)

    # Elapsed time scaled to [0, 1]; all zeros when every timestamp is equal.
    t_min, t_max = t.min(), t.max()
    time_rel = (t - t_min) / (t_max - t_min) if t_max > t_min else np.zeros(T)

    # Fixed scaling constants (the original notes say they were picked from EDA).
    sx_norm = sx / 105.0
    sy_norm = sy / 68.0

    dx_norm = dx / 40.0
    dy_norm = dy / 40.0
    dist_norm = dist / 40.0
    angle_norm = angle / np.pi

    # Gaps longer than 3 seconds saturate at 1; negative gaps floor at 0.
    dt_norm = np.clip(dt / 3.0, 0, 1)

    cum_dx_norm = cum_dx / 60.0
    cum_dy_norm = cum_dy / 60.0
    move_norm_norm = move_norm / 60.0

    # Feature matrix [T, F] with F = 12.
    feats = np.stack([
        sx_norm, sy_norm,
        dx_norm, dy_norm,
        dist_norm,
        angle_norm,
        dt_norm,
        cum_dx_norm, cum_dy_norm,
        move_norm_norm,
        step_idx_norm,
        time_rel,
    ], axis=1).astype("float32")

    # Target: normalised end coordinates of the chronologically last row.
    target = None
    if for_train:
        ex = g["end_x"].values[-1] / 105.0
        ey = g["end_y"].values[-1] / 68.0
        target = np.array([ex, ey], dtype="float32")

    return feats, target

# Per-episode feature sequences and their targets, kept index-aligned.
episodes, targets = [], []

for _, g in tqdm(df.groupby("game_episode")):
    seq, tgt = build_episode_sequence(g, for_train=True)
    # Episodes too short to yield a sequence come back as None and are dropped.
    if seq is not None:
        episodes.append(seq)
        targets.append(tgt)

print("에피소드 수:", len(episodes))
print("예시 seq shape:", episodes[0].shape)



```
100%|██████████| 15435/15435 [00:07<00:00, 2065.18it/s]

에피소드 수: 15428
예시 seq shape: (49, 12)
```



# 4. Custom Dataset / DataLoader 정의 및 Validation 분할

In [None]:
class EpisodeDataset(Dataset):
    """
    Map-style dataset over pre-built episode arrays.

    ``episodes[i]`` and ``targets[i]`` are expected to belong together; item
    ``i`` is returned as ``(sequence tensor, sequence length, target tensor)``.
    """

    def __init__(self, episodes, targets):
        # Both containers are stored by reference; nothing is copied here.
        self.episodes = episodes
        self.targets = targets

    def __len__(self):
        """Number of episodes held by the dataset."""
        return len(self.episodes)

    def __getitem__(self, idx):
        """Convert the idx-th episode and its target to tensors."""
        features = torch.tensor(self.episodes[idx])  # [T, F]
        label = torch.tensor(self.targets[idx])      # [2]
        n_steps = features.size(0)                   # unpadded length T
        return features, n_steps, label

def collate_fn(batch):
    """
    Merge a list of ``(sequence, length, target)`` items into batch tensors.

    Returns ``(padded, lengths, targets)``: the sequences zero-padded at the
    end to the longest one in the batch ([B, T_max, F]), the original lengths
    as a long tensor ([B]), and the stacked targets ([B, 2]).
    """
    sequences, seq_lengths, labels = map(list, zip(*batch))
    padded_batch = pad_sequence(sequences, batch_first=True)
    length_tensor = torch.tensor(seq_lengths, dtype=torch.long)
    label_batch = torch.stack(labels)
    return padded_batch, length_tensor, label_batch

# Split at episode level: 20% of the episode indices are held out for validation.
all_idx = np.arange(len(episodes))
idx_train, idx_valid = train_test_split(all_idx, test_size=0.2, random_state=42)

episodes_train = [episodes[i] for i in idx_train]
targets_train = [targets[i] for i in idx_train]
episodes_valid = [episodes[i] for i in idx_valid]
targets_valid = [targets[i] for i in idx_valid]

train_dataset = EpisodeDataset(episodes_train, targets_train)
valid_dataset = EpisodeDataset(episodes_valid, targets_valid)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)
valid_loader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)

print("train episodes:", len(episodes_train), "valid episodes:", len(episodes_valid))



```
train episodes: 12342 valid episodes: 3086
```



# 5. LSTM 베이스라인 + 이동량 모델 정의

In [None]:
# Feature width F of the episode sequences, read from the first built episode.
NUM_FEATS = episodes[0].shape[1]  # 12

class LSTMBaseline(nn.Module):
    """
    Single-layer LSTM regressor.

    Encodes a padded batch of variable-length sequences and maps the final
    hidden state of each sequence to two outputs (x_norm, y_norm).
    """

    def __init__(self, input_dim=NUM_FEATS, hidden_dim=64):
        super().__init__()
        # Attribute names are kept as-is: they are the keys of the state_dict.
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=2)

    def forward(self, x, lengths):
        """
        x: padded batch [B, T, F]; lengths: unpadded length of each row [B].
        Returns a [B, 2] tensor.
        """
        # pack_padded_sequence needs the lengths on the CPU; enforce_sorted=False
        # lets the batch arrive in any length order.
        cpu_lengths = lengths.cpu()
        packed_batch = pack_padded_sequence(
            x, cpu_lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _cell) = self.lstm(packed_batch)
        # hidden[-1] is the last layer's final hidden state, shape [B, H].
        return self.fc(hidden[-1])

# Build the network on the selected device, with a mean-squared-error loss and
# an Adam optimizer over all of its parameters.
model = LSTMBaseline(input_dim=NUM_FEATS, hidden_dim=HIDDEN_DIM)
model = model.to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

# 6. 모델 학습 및 검증

In [None]:
# Train for EPOCHS epochs. After each epoch, score the validation split by mean
# Euclidean distance in de-normalised coordinates and keep a snapshot of the
# weights from the best-scoring epoch.
best_dist = float("inf")
best_model_state = None

for epoch in range(1, EPOCHS + 1):
    # --- Train ---
    model.train()
    total_loss = 0.0

    for X, lengths, y in tqdm(train_loader):
        # lengths stays on the CPU: the model converts it to a CPU tensor for
        # pack_padded_sequence anyway, so a device round-trip would be wasted.
        X, y = X.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad()
        pred = model(X, lengths)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch value is a per-sample mean.
        total_loss += loss.item() * X.size(0)

    train_loss = total_loss / len(train_loader.dataset)

    # --- Valid: mean Euclidean distance ---
    model.eval()
    dists = []

    with torch.no_grad():
        for X, lengths, y in tqdm(valid_loader):
            X, y = X.to(DEVICE), y.to(DEVICE)
            pred = model(X, lengths)

            pred_np = pred.cpu().numpy()
            true_np = y.cpu().numpy()

            # Undo the /105 and /68 target scaling before measuring distance.
            pred_x = pred_np[:, 0] * 105.0
            pred_y = pred_np[:, 1] * 68.0
            true_x = true_np[:, 0] * 105.0
            true_y = true_np[:, 1] * 68.0

            dist = np.sqrt((pred_x - true_x) ** 2 + (pred_y - true_y) ** 2)
            dists.append(dist)

    mean_dist = np.concatenate(dists).mean()

    print(
        f"[Epoch {epoch}] "
        f"train_loss={train_loss:.4f} | "
        f"valid_mean_dist={mean_dist:.4f}"
    )

    if mean_dist < best_dist:
        best_dist = mean_dist
        # state_dict() hands back tensors that share storage with the live
        # parameters, and dict.copy() is shallow. Without cloning, later
        # optimizer steps would overwrite this "best" snapshot in place.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f" --> Best model updated! (dist={best_dist:.4f})")



```
# baseline
[Epoch 18] train_loss=0.0304 | valid_mean_dist=16.6270
 --> Best model updated! (dist=16.6270)

# hidden_dim=96으로 수정 후
 [Epoch 27] train_loss=0.0282 | valid_mean_dist=16.3134
 --> Best model updated! (dist=16.3134)

# hidden_dim=96으로 수정 + 컬럼 스케일링 범위 조정
 [Epoch 28] train_loss=0.0287 | valid_mean_dist=16.2878
 --> Best model updated! (dist=16.2878)
```



# 7. 평가 데이터셋 추론

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

# ---------------------------------------------------------
# 1) Test 파일 전체를 메모리에 미리 적재 (속도 핵심)
# ---------------------------------------------------------
def load_all_test_files(test_meta, base_dir="Data"):
    """
    Read every distinct CSV listed in ``test_meta["path"]`` once and keep the
    frames in memory.

    Parameters
    ----------
    test_meta : pd.DataFrame
        Must have a ``path`` column holding file locations relative to
        ``base_dir``. A leading "./" or "/" is tolerated, so "./test/a.csv",
        "/test/a.csv" and "test/a.csv" all resolve to ``<base_dir>/test/a.csv``.
    base_dir : str, default "Data"
        Directory the paths are resolved against.

    Returns
    -------
    dict
        Maps each original, unmodified ``path`` value to its DataFrame.
    """
    cache = {}
    for path in tqdm(test_meta["path"].unique(), desc="Loading test files"):
        # Normalise the path and drop leading separators so os.path.join treats
        # it as relative. Slicing off the first character only worked for the
        # "./x" form and glued base_dir to the file name otherwise.
        relative = os.path.normpath(path).lstrip(os.sep)
        cache[path] = pd.read_csv(os.path.join(base_dir, relative))
    return cache


# ---------------------------------------------------------
# 2) Inference
# ---------------------------------------------------------
# Restore the weights saved during training and switch to evaluation mode.
model.load_state_dict(best_model_state)
model.eval()

# Attach each submission row's file path through the test metadata.
test_meta = pd.read_csv("Data/test.csv")
submission = pd.read_csv("Data/sample_submission.csv").merge(
    test_meta, on="game_episode", how="left"
)

# Read every test file once up front so the loop below only does dict lookups.
file_cache = load_all_test_files(test_meta)

preds_x, preds_y = [], []

with torch.no_grad():
    for episode_path in tqdm(submission["path"], total=len(submission), desc="Inference"):
        g = file_cache[episode_path]
        seq, _ = build_episode_sequence(g, for_train=False)

        if seq is not None:
            # Batch of one: [1, T, F] plus its length.
            x = torch.tensor(seq, dtype=torch.float32).unsqueeze(0).to(DEVICE)
            length = torch.tensor([seq.shape[0]]).to(DEVICE)
            pred_norm = model(x, length)[0].cpu().numpy()
        else:
            # No sequence could be built: fall back to the last start position.
            pred_norm = np.array(
                [g["start_x"].values[-1] / 105.0, g["start_y"].values[-1] / 68.0],
                dtype="float32",
            )

        # Back to original coordinate units.
        preds_x.append(pred_norm[0] * 105.0)
        preds_y.append(pred_norm[1] * 68.0)

# 8. 제출 파일(Submission) 생성

In [None]:
# Attach the predictions and write only the three submission columns.
submission["end_x"], submission["end_y"] = preds_x, preds_y

output_columns = ["game_episode", "end_x", "end_y"]
submission[output_columns].to_csv("Data/baseline_step1_numeric_fe_fast.csv", index=False)

print("Saved: baseline_step1_numeric_fe_fast.csv")