# LSTM 은퇴연도 예측 모델 (개선된 버전)

이 노트북은 각 선수의 마지막 N시즌(기본값 5) 타격 기록을 LSTM에 입력하여, 해당 선수의 은퇴 연도(기록상 마지막 시즌)를 예측합니다.

In [None]:
# Colab 환경에서는 필요 시 아래 명령어 실행
# !pip install torch pandas scikit-learn

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# CSV 파일 업로드 (Colab 환경)
# from google.colab import files
# uploaded = files.upload()

In [None]:
# Load the raw season-by-season records and prepare the model features.
df = pd.read_csv('kbo_retired.csv')  # On Colab, confirm the uploaded file name
features = ['AVG', 'SLG', 'OBP', 'G', 'AB', 'R', 'H', 'HR', 'RBI']

# Coerce every stat (and the season) to a real numeric dtype. Missing stats
# are marked with '-', which typically makes pandas read those columns as
# strings; errors='coerce' turns '-' and any other non-numeric token into NaN
# so those rows are dropped below instead of breaking the scaler or the sort.
df[features] = df[features].apply(pd.to_numeric, errors='coerce')
df['season'] = pd.to_numeric(df['season'], errors='coerce')
df = df.dropna(subset=features + ['season', 'name'])

# NOTE(review): players are keyed by name only, so two different players who
# share a name would be merged into one career - confirm names are unique.
df['player_id'] = df['name']
df = df.sort_values(['player_id', 'season'])

# Scale each feature to [0, 1]. The fitted scaler is kept so the same
# transform can be re-applied to new data.
# NOTE(review): the scaler is fit on every player, including those later held
# out for evaluation; fit on the training players only if that leak matters.
feature_scaler = MinMaxScaler()
df[features] = feature_scaler.fit_transform(df[features])

# Retirement season = the last season each player appears in the data.
df['retire_season'] = df.groupby('player_id')['season'].transform('max')

In [None]:
# Build one fixed-length sequence per player from their final N seasons.
sequence_length = 5
sequences, targets = [], []
for _, history in df.groupby('player_id'):
    # Players with fewer than N recorded seasons cannot fill a sequence.
    if len(history) >= sequence_length:
        ordered = history.sort_values('season')
        window = ordered[features].iloc[-sequence_length:]
        sequences.append(window.to_numpy())
        # The label is the retire_season stored on the player's latest row.
        targets.append(ordered['retire_season'].iloc[-1])
X = np.asarray(sequences, dtype=np.float32)
y = np.asarray(targets, dtype=np.float32)

In [None]:
# 학습/검증 분할 및 Dataset 정의
from torch.utils.data import Dataset, DataLoader
class PlayerDataset(Dataset):
    """Pairs each player's feature sequence with its regression target.

    ``X`` and ``y`` are converted to tensors once at construction; ``y`` gains
    a trailing axis at position 1 so each item's target has one extra
    dimension (a 1-D ``y`` yields targets of shape ``(1,)``).
    """

    def __init__(self, X, y):
        """Store ``X`` as a tensor and ``y`` as a tensor with a new axis 1."""
        inputs = torch.tensor(X)
        labels = torch.tensor(y)
        self.X = inputs
        self.y = labels[:, None]

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(sequence, target)`` pair stored at ``idx``."""
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

# Hold out 20% of the players for evaluation. The sequences were built in
# groupby order (sorted by player_id), so a plain head/tail slice would always
# evaluate on the alphabetically last players; shuffle the indices with a
# fixed seed first so the split is random but reproducible.
split_idx = int(len(X) * 0.8)
shuffled_idx = np.random.default_rng(42).permutation(len(X))
train_idx, test_idx = shuffled_idx[:split_idx], shuffled_idx[split_idx:]
train_ds = PlayerDataset(X[train_idx], y[train_idx])
test_ds = PlayerDataset(X[test_idx], y[test_idx])
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

In [None]:
# LSTM 모델 정의
class LSTMModel(nn.Module):
    """LSTM regressor that maps a feature sequence to a single value.

    The input is run through a batch-first LSTM; the final hidden state of
    the last LSTM layer is projected to one output by a linear layer.
    """

    def __init__(self, input_size, hidden_size=32, num_layers=1):
        """Create the LSTM encoder and the one-unit linear head."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the linear projection of the last layer's final hidden state."""
        lstm_state = self.lstm(x)[1]
        final_hidden = lstm_state[0]
        # final_hidden stacks one state per layer; keep only the last layer's.
        last_layer_hidden = final_hidden[-1]
        return self.fc(last_layer_hidden)

In [None]:
# Train the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size=X.shape[2]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# The targets are raw calendar years, while a freshly initialised output
# layer predicts values near 0. Adam moves each parameter by roughly `lr` per
# step, so 20 epochs at lr=0.001 cannot close that gap. Start the output bias
# at the mean training target so the network only has to learn each player's
# deviation from the average retirement year.
if len(train_ds) > 0:
    with torch.no_grad():
        model.fc.bias.fill_(train_ds.y.mean().item())

for epoch in range(20):
    model.train()
    total_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample MSE even when the last batch is
        # smaller than the others.
        total_loss += loss.item() * xb.size(0)
    epoch_loss = total_loss / len(train_ds)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

In [None]:
# Run the trained model over the held-out set and report regression metrics.
model.eval()
pred_batches, target_batches = [], []
with torch.no_grad():  # inference only, no gradients needed
    for batch_inputs, batch_targets in test_loader:
        batch_outputs = model(batch_inputs.to(device))
        pred_batches.append(batch_outputs.cpu().numpy())
        target_batches.append(batch_targets.numpy())

# Stitch the per-batch arrays together and flatten them to 1-D vectors.
preds = np.concatenate(pred_batches, axis=0).reshape(-1)
actuals = np.concatenate(target_batches, axis=0).reshape(-1)

mse, mae, r2 = (
    metric(actuals, preds)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print({"MSE": mse, "MAE": mae, "R2": r2})