# Neural Network - v2

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lg-aimers-v2/sample_submission.csv
/kaggle/input/lg-aimers-v2/train_v2.csv
/kaggle/input/lg-aimers-v2/test_v2.csv


In [2]:
submission = pd.read_csv('/kaggle/input/lg-aimers-v2/sample_submission.csv', index_col= 'ID')
train_v2 = pd.read_csv('/kaggle/input/lg-aimers-v2/train_v2.csv', index_col=None)
test_v2 = pd.read_csv('/kaggle/input/lg-aimers-v2/test_v2.csv', index_col = None)

## 1. 데이터 나누기

In [3]:
X = train_v2.drop('임신 성공 여부', axis= 1)
X_test = test_v2

y = train_v2['임신 성공 여부'].values

## 2. Neural Network 훈련

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset

In [5]:
# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
# PyTorch Dataset 생성
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

In [7]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNetwork, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),  # Dropout 증가
            
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            
            nn.Linear(128, 1),  # 이진 분류: 출력층은 1개 뉴런
            nn.Sigmoid()        # 이진 분류 확률 출력
        )
        
    def forward(self, x):  
        return self.model(x)

In [8]:
# 데이터 변환
X_tensor = np.array(X)  # CSR Matrix → Dense 변환 (필요 시)
y_tensor = np.array(y)

X_test_tensor = np.array(X_test)
y_tensor = y_tensor.astype(int)

# K-Fold 설정
n_splits = 10
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1991)

# OOF 예측 저장용
oof_val_preds = np.zeros(X_tensor.shape[0])
oof_test_preds = np.zeros(X_test_tensor.shape[0])

In [9]:
from torch.optim.lr_scheduler import OneCycleLR

# K-Fold 훈련
for fold_idx, (train_idx, valid_idx) in enumerate(folds.split(X_tensor, y_tensor)):
    print(f"\n#### Fold {fold_idx+1}/{n_splits} ####")

    # 훈련/검증 데이터 설정
    X_train, y_train = X_tensor[train_idx], y_tensor[train_idx]
    X_valid, y_valid = X_tensor[valid_idx], y_tensor[valid_idx]

    # NaN 값 0으로 채우기 (필요하면 평균값으로 대체 가능)
    X_train = np.nan_to_num(X_train, nan=0.0)
    X_valid = np.nan_to_num(X_valid, nan=0.0)

    train_dataset = CustomDataset(X_train, y_train)
    valid_dataset = CustomDataset(X_valid, y_valid)
    
    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=1024, shuffle=False)

    # 모델 초기화
    model = NeuralNetwork(input_dim=X_train.shape[1]).to(device)

    # 최적화 알고리즘 & 스케줄러 설정
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

    # `total_steps`를 정확하게 설정해야 ValueError 방지 가능!
    total_steps = len(train_loader) * 1000
    scheduler = OneCycleLR(optimizer, max_lr=0.003, total_steps=total_steps, pct_start=0.1)

    criterion = nn.BCELoss()  
    best_auc = 0.0  # AUC 기준 Early Stopping

    # Early Stopping 설정
    best_valid_loss = np.inf
    patience, patience_counter = 20, 0

    # 학습 진행
    for epoch in range(1000):
        model.train()
        train_loss = 0.0

        for X_batch, y_batch in train_loader:  
            X_batch, y_batch = X_batch.to(device), y_batch.to(device).unsqueeze(1)
    
            optimizer.zero_grad()
            preds = model(X_batch)
    
            loss = criterion(preds, y_batch)  # Loss 계산
            loss.backward()  # 역전파
            optimizer.step()  # 가중치 업데이트
    
            train_loss += loss.item()
            scheduler.step() # lr update

        # 검증 데이터 평가
        model.eval()
        valid_loss, valid_preds = 0.0, []
        with torch.no_grad():
            for X_batch, y_batch in valid_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device).unsqueeze(1)
                preds = model(X_batch)
                loss = criterion(preds, y_batch)
                valid_loss += loss.item()
                valid_preds.extend(preds.cpu().numpy())

        valid_loss /= len(valid_loader)
        train_loss /= len(train_loader)
        roc_auc = roc_auc_score(y_valid, np.array(valid_preds).flatten())

        # 10 에폭마다 출력 (첫 에폭 포함)
        if (epoch + 1) % 10 == 0 or epoch == 0:
            current_lr = optimizer.param_groups[0]['lr']  # 현재 Learning Rate 확인
            print(f"Epoch {epoch+1}: Train Loss = {train_loss}, Valid Loss = {valid_loss:}, AUC = {roc_auc}, LR = {current_lr}")

        # AUC & Loss 기준 Best Model 저장
        if roc_auc > best_auc or valid_loss < best_valid_loss:
            best_auc = max(best_auc, roc_auc)
            best_valid_loss = min(best_valid_loss, valid_loss)
            patience_counter = 0
            best_model_state = model.state_dict()
            best_iteration = epoch + 1  # Best Iteration 저장
        else:
            patience_counter += 1

        # Early Stopping
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            print(f"Best iteration: [{best_iteration}]  Valid Loss: {best_valid_loss}  AUC: {best_auc}")
            break

    # 최적 모델 로드
    model.load_state_dict(best_model_state)

    # 검증 데이터 예측 저장
    model.eval()
    valid_preds = []
    with torch.no_grad():
        for X_batch in valid_loader:
            X_batch = X_batch[0].to(device)  
            preds = model(X_batch)
            valid_preds.extend(preds.cpu().numpy())

    valid_preds = np.array(valid_preds).flatten()
    
    print(f"Fold {fold_idx+1} Valid Preds Min/Max: {valid_preds.min()}, {valid_preds.max()}")
    print(f"Fold {fold_idx+1} Valid AUC: {roc_auc_score(y_valid, valid_preds)}")

    # OOF 예측 저장
    oof_val_preds[valid_idx] = valid_preds

    # 테스트 데이터 예측 저장
    test_dataset = CustomDataset(X_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

    test_preds = []
    with torch.no_grad():
        for X_batch in test_loader:
            X_batch = X_batch.to(device)
            preds = model(X_batch)
            test_preds.extend(preds.cpu().numpy())

    oof_test_preds += np.array(test_preds).flatten() / n_splits


#### Fold 1/10 ####
Epoch 1: Train Loss = 0.5837600711172661, Valid Loss = 0.532318305510741, AUC = 0.7065160300732434, LR = 0.00012071061595315047
Epoch 10: Train Loss = 0.49394247278702996, Valid Loss = 0.4916591059703093, AUC = 0.7329372230586543, LR = 0.0001904848026045092
Epoch 20: Train Loss = 0.4901525478447433, Valid Loss = 0.4888721475234398, AUC = 0.7366561356626078, LR = 0.0003950390612538755
Epoch 30: Train Loss = 0.48892385835668684, Valid Loss = 0.4887977930215689, AUC = 0.7366427941866328, LR = 0.000713637822416415
Epoch 40: Train Loss = 0.4879334380405139, Valid Loss = 0.48921283047932845, AUC = 0.7373853227675091, LR = 0.0011150916822163179
Epoch 50: Train Loss = 0.4873905531336776, Valid Loss = 0.4885863077182036, AUC = 0.7374043383414507, LR = 0.0015601000905663418
Early stopping at epoch 55
Best iteration: [35]  Valid Loss: 0.48814702607118166  AUC: 0.7382028296492291
Fold 1 Valid Preds Min/Max: 4.823295540745676e-08, 0.6226266026496887
Fold 1 Valid AUC: 0.73805898

## 3. 최종 결과

In [10]:
# 최종 성능 평가
final_auc = roc_auc_score(y_tensor, oof_val_preds)
print(f"NN OOF 검증 데이터 ROC-AUC: {final_auc}")

NN OOF 검증 데이터 ROC-AUC: 0.7376636074816705


In [11]:
oof_test_preds

array([2.01244410e-04, 1.18649388e-04, 1.58065354e-01, ...,
       4.76878986e-01, 2.94966999e-01, 1.86868522e-04])

In [12]:
# OOF 검증 예측 결과 저장
oof_val_preds_df = pd.DataFrame({'ID': train_v2.index, 'oof_val_preds': oof_val_preds})
oof_val_preds_df.to_csv("nn_v2_oof_val_predictions.csv", index=False)

# OOF 테스트 예측 결과 저장
oof_test_preds_df = pd.DataFrame({'ID': test_v2.index, 'oof_test_preds': oof_test_preds})
oof_test_preds_df.to_csv("nn_v2_oof_test_predictions.csv", index=False)

In [13]:
val = pd.read_csv('/kaggle/working/nn_v2_oof_val_predictions.csv', index_col = 0)
val

Unnamed: 0_level_0,oof_val_preds
ID,Unnamed: 1_level_1
0,0.486373
1,0.000001
2,0.325161
3,0.000125
4,0.288371
...,...
256346,0.357125
256347,0.276496
256348,0.326369
256349,0.157279


In [14]:
te = pd.read_csv('/kaggle/working/nn_v2_oof_test_predictions.csv', index_col = 0)
te

Unnamed: 0_level_0,oof_test_preds
ID,Unnamed: 1_level_1
0,0.000201
1,0.000119
2,0.158065
3,0.106845
4,0.529263
...,...
90062,0.000609
90063,0.321394
90064,0.476879
90065,0.294967


In [15]:
# 결과 저장
submission["probability"] = oof_test_preds
submission.to_csv("v2_nn_submission.csv", index=False)