# 📘 LSTM 은퇴 시기 예측 모델 (은퇴까지 남은 연도 수)
이 노트북은 선수의 최근 시즌 데이터를 기반으로 **은퇴까지 남은 연도 수**를 예측하는 LSTM 모델을 학습합니다.
- 입력: 선수별 최근 5개 시즌의 성적 및 파생 피처 시퀀스
- 출력: 해당 시즌 기준으로 은퇴까지 남은 연도

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load the retired-player season table from the Excel workbook.
# When running on Google Colab, upload the CSV/XLSX file first:
#   from google.colab import files
#   uploaded = files.upload()
df = pd.read_excel(io='kbo_retired_with_retire_season.xlsx')

In [None]:
# Raw per-season batting columns read from the spreadsheet.
features = ['AVG', 'SLG', 'OBP', 'G', 'AB', 'R', 'H', 'HR', 'RBI']

# '-' marks a missing stat in the sheet; convert it to NaN so the row can be dropped.
df[features] = df[features].replace('-', np.nan)
# .copy() detaches the filtered frame so the column assignments below do not
# trigger SettingWithCopyWarning.
df = df.dropna(subset=features + ['season', 'name', 'birth_year']).copy()
# Cast every feature column, not only the rate stats: any column that held '-'
# may have been read with object dtype.
df[features] = df[features].astype(float)

# Target: number of years left until retirement, as of each season.
df['years_until_retirement'] = df['retire_season'] - df['season']
# Drop rows dated after the recorded retirement season (rows with a missing
# retire_season compare False and are dropped as well).
df = df[df['years_until_retirement'] >= 0].copy()

# Feature expansion.
# NOTE(review): players are keyed by 'name' only; two different players sharing
# a name would be merged into one career - confirm names are unique in the data.
df['age'] = df['season'] - df['birth_year']
df = df.sort_values(['name', 'season'])
# Seasons played so far (1, 2, 3, ...). A per-player total count would only be
# known after retirement and would leak the target into the inputs.
df['career_length'] = df.groupby('name').cumcount() + 1
# Season-over-season change of each rate stat within a player's career.
for stat in ('AVG', 'SLG', 'OBP'):
    df[f'{stat.lower()}_diff'] = df.groupby('name')[stat].diff()
# Each player's first remaining season has no previous season to diff against.
df = df.dropna(subset=['avg_diff', 'slg_diff', 'obp_diff'])

# Normalisation: scale every model input to [0, 1].
# NOTE(review): the scaler is fitted on all rows, including those that later end
# up in the test split - consider fitting on the training portion only.
full_features = features + ['age', 'career_length', 'avg_diff', 'slg_diff', 'obp_diff']
scaler = MinMaxScaler()
df[full_features] = scaler.fit_transform(df[full_features])

In [None]:
# Build fixed-length sliding windows of consecutive season rows per player.
# Each sample is `sequence_length` rows of `full_features`; its label is the
# years-until-retirement value of the LAST season inside the window, so the
# window that ends at a player's final recorded season is included and a player
# with exactly `sequence_length` seasons contributes one sample.
sequence_length = 5
sequences, targets = [], []
for _player, group in df.groupby('name'):
    if len(group) < sequence_length:
        continue  # not enough seasons to fill a single window
    group = group.sort_values('season')
    # Extract the arrays once per player instead of calling iloc for every window.
    season_features = group[full_features].to_numpy(dtype=np.float32)
    season_labels = group['years_until_retirement'].to_numpy(dtype=np.float32)
    for end in range(sequence_length, len(group) + 1):
        sequences.append(season_features[end - sequence_length:end])
        targets.append(season_labels[end - 1])
# The reshape is a no-op when windows exist; when none do, it keeps X
# three-dimensional so downstream code can still read X.shape[2].
X = np.array(sequences, dtype=np.float32).reshape(-1, sequence_length, len(full_features))
y = np.array(targets, dtype=np.float32)

In [None]:
class PlayerDataset(Dataset):
    """Pair feature sequences with regression targets for use in a DataLoader.

    Args:
        X: Array-like of feature sequences, indexed by sample along axis 0.
        y: One-dimensional array-like with one target value per sample.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Force float32 so float64 or integer inputs still match the dtype of
        # the model weights and of the MSE loss.
        self.X = torch.tensor(X, dtype=torch.float32)
        # Add a trailing dimension so each target has shape (1,), matching the
        # single-output regression head.
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

class LSTMModel(nn.Module):
    """LSTM encoder followed by a linear head that emits one value per sequence."""

    def __init__(self, input_size, hidden_size=64):
        """Create the recurrent encoder and the scalar regression head.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of the LSTM hidden state.
        """
        super().__init__()
        # batch_first=True: batched inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Run the sequence through the LSTM and regress from its final hidden state."""
        _step_outputs, final_state = self.lstm(x)
        hidden, _cell = final_state
        # hidden stacks one final state per layer; the last entry is the top layer.
        top_layer_hidden = hidden[-1]
        return self.fc(top_layer_hidden)

In [None]:
# Hold-out split: the first 80% of the windows train the model, the rest is
# kept for evaluation.
split = int(len(X) * 0.8)
train_ds, test_ds = PlayerDataset(X[:split], y[:split]), PlayerDataset(X[split:], y[split:])
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The feature count is the size of the last axis of X.
model = LSTMModel(input_size=X.shape[2]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Train for 20 epochs and print the summed batch loss after each one.
for epoch in range(20):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

In [None]:
# Score the trained model on the held-out windows.
model.eval()
pred_chunks, actual_chunks = [], []
with torch.no_grad():  # inference only, no gradients needed
    for batch_x, batch_y in test_loader:
        batch_out = model(batch_x.to(device))
        # Move predictions back to the CPU before converting them to NumPy.
        pred_chunks.append(batch_out.cpu().numpy())
        actual_chunks.append(batch_y.numpy())

# Stack the per-batch (batch, 1) arrays and flatten them to one-dimensional vectors.
preds = np.vstack(pred_chunks).reshape(-1)
actuals = np.vstack(actual_chunks).reshape(-1)

# Regression metrics, each called as metric(y_true, y_pred).
mse, mae, r2 = (
    metric(actuals, preds)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print({'MSE': mse, 'MAE': mae, 'R2': r2})