In [1]:
import pandas as pd
df = pd.read_csv('train_set.csv')

In [2]:
# module directory 생성
import os
os.makedirs("module", exist_ok=True)

In [3]:
%%writefile module/train.py

import time
import torch
from sklearn.metrics import classification_report, confusion_matrix

def test_binary_classification(dataloader, model, loss_fn, device="cpu") -> tuple:
    """
    이진 분류 검증/평가 함수
    
    [parameter]
        dataloader: DataLoader - 검증할 대상 데이터로더
        model: 검증할 모델
        loss_fn: 모델 추정값과 정답의 차이를 계산할 loss 함수.
        device: str - 연산을 처리할 장치. default-"cpu", gpu-"cuda"
    [return]
        tuple: (loss, accuracy)
    """
    model.to(device)
    model.eval()  # 모델을 평가모드로 변환
    size = len(dataloader.dataset)
    num_steps = len(dataloader)
    
    test_loss = 0.0
    correct_predictions = 0
    
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            
            # 정확도 계산을 위한 예측값 비교
            pred_label = (pred >= 0.5).type(torch.int32)
            correct_predictions += (pred_label == y).sum().item()
        
        test_loss /= num_steps
        accuracy = correct_predictions / size
    
    return test_loss, accuracy


def train(dataloader, model, loss_fn, optimizer, device="cpu"):
    """
    모델을 1 epoch 학습시키는 함수

    [parameter]
        dataloader: DataLoader - 학습데이터셋을 제공하는 DataLoader
        model - 학습대상 모델
        loss_fn: 모델 추정값과 정답의 차이를 계산할 loss 함수.
        optimizer - 최적화 함수
        device: str - 연산을 처리할 장치. default-"cpu", gpu-"cuda"
    [return]
        float: 학습 후 계산한 Train set에 대한 train_loss
    """
    model.train()
    train_loss = 0.0
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        pred = model(X)

        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(dataloader)
    return train_loss


def fit(train_loader, val_loader, model, loss_fn, optimizer, epochs, save_best_model=True, save_model_path=None, device='cpu', mode='binary', lr_scheduler=None):
   
    best_val_loss = float('inf') if save_best_model else None
    best_epoch = 0
    best_model_state = None
    
    model = model.to(device)
    s = time.time()
    for epoch in range(epochs):
        train_loss = train(train_loader, model, loss_fn, optimizer, device=device)
        
        if lr_scheduler is not None:
            lr_scheduler.step()

        # 검증 손실 평가
        val_loss, val_accuracy = test_binary_classification(val_loader, model, loss_fn, device=device)

        # 최적의 모델 저장
        if save_best_model and val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch + 1  # 에포크는 1부터 시작하므로 +1
            best_model_state = model.state_dict()  # 최적 모델의 가중치 저장
            torch.save(model.state_dict(), save_model_path)
            print(f"Best Model 저장: Epoch {best_epoch} - Validation Loss: {best_val_loss:.5f}")

    e = time.time()
    print(f"총 소요 시간: {e-s:.2f}초")
    
    # 최적의 모델을 로드하여 최종 성능 평가
    model.load_state_dict(best_model_state)
    train_loss, train_accuracy = test_binary_classification(train_loader, model, loss_fn, device=device)
    test_loss, test_accuracy = test_binary_classification(val_loader, model, loss_fn, device=device)
    
    # 최적 모델 성능 출력
    print("\n최적의 체크포인트 모델:")
    print(f"Best Epoch: {best_epoch}")
    print("="*100)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print("="*100)
    
    return best_epoch, best_val_loss


Overwriting module/train.py


In [4]:
%%writefile module/utils.py
# 학습 결과를 시각화하는 함수.
import matplotlib.pyplot as plt

def plot_fit_result(train_loss_list, train_accuracy_list, valid_loss_list, valid_accuracy_list):
    epoch = len(train_loss_list)
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(range(epoch), train_loss_list, label="train loss")
    plt.plot(range(epoch), valid_loss_list, label="validation loss")
    plt.title("Loss")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.grid(True, linestyle=':')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(range(epoch), train_accuracy_list, label="train accuracy")
    plt.plot(range(epoch), valid_accuracy_list, label="validation accuracy")
    plt.title("Accuracy")
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.grid(True, linestyle=':')
    plt.legend()

    plt.tight_layout()
    plt.show()

Overwriting module/utils.py


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"

# X, y 데이터 분리
X = df.drop(columns=['Churn', 'customerID']).values
y = df['Churn'].values
y = y.reshape(-1, 1)
print(X.shape, y.shape)

# 데이터 train, test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE 적용 (소수 클래스 샘플을 다수 클래스와 동일하게 맞춤)
smote = SMOTE(sampling_strategy=1.0, random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Tensor로 변환
X_resampled = torch.tensor(X_resampled, dtype=torch.float32)
y_resampled = torch.tensor(y_resampled, dtype=torch.float32).view(-1,1)

# TensorDataset과 DataLoader 생성
trainset = TensorDataset(X_resampled, y_resampled)
testset = TensorDataset(
    torch.tensor(X_test_scaled, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32).view(-1,1)
)

train_loader = DataLoader(trainset, batch_size=128, shuffle=True, drop_last=True)
test_loader = DataLoader(testset, batch_size=128)
len(train_loader), len(test_loader)


(7043, 40) (7043, 1)


(64, 12)

In [6]:
epochs = 100
lr = 0.01

######### 모델 정의
class SmallModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.lr1 = nn.Linear(40, 32)
        self.lr2 = nn.Linear(32, 8)
        self.lr3 = nn.Linear(8, 1) 
        self.relu = nn.ReLU()
        self.logistic = nn.Sigmoid()  

    def forward(self, X):
        X = self.lr1(X)
        X = self.relu(X)
        X = self.lr2(X)
        X = self.relu(X)
        # 출력 Layer
        output = self.lr3(X)
        output = self.logistic(output)
        return output
        
small_model = SmallModel().to(device)

In [7]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(small_model.parameters(), lr=lr)
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=5,
    gamma=0.5
)

In [8]:
from module.train import fit, test_binary_classification, train
from module.utils import plot_fit_result

# 모델 학습
best_epoch, best_val_loss = fit(
    train_loader, test_loader,  
    small_model, loss_fn, optimizer,
    epochs,
    save_best_model=True,
    save_model_path="best_model.pth",
    device=device,
    mode="binary",
    lr_scheduler=lr_scheduler
)


Best Model 저장: Epoch 1 - Validation Loss: 0.44345
Best Model 저장: Epoch 4 - Validation Loss: 0.44035
총 소요 시간: 19.48초

최적의 체크포인트 모델:
Best Epoch: 4
Train Loss: 0.3514, Train Accuracy: 0.8372
Test Loss: 0.5077, Test Accuracy: 0.7480


In [9]:
# 모델 구조를 먼저 정의한 후
model = SmallModel()  # 모델의 구조를 정의합니다.
model.load_state_dict(torch.load("best_model.pth"))  # 저장된 모델 가중치를 불러옵니다.
model.to(device)  # GPU나 CPU로 이동
model.eval()  # 모델을 평가 모드로 설정


SmallModel(
  (lr1): Linear(in_features=40, out_features=32, bias=True)
  (lr2): Linear(in_features=32, out_features=8, bias=True)
  (lr3): Linear(in_features=8, out_features=1, bias=True)
  (relu): ReLU()
  (logistic): Sigmoid()
)

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"

# X, y 데이터 분리
X = df.drop(columns=['Churn', 'customerID']).values
y = df['Churn'].values
y = y.reshape(-1, 1)
X.shape, y.shape

# 데이터 train, test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=0)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

trainset = TensorDataset(
    torch.tensor(X_train_scaled, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)
testset = TensorDataset(
    torch.tensor(X_test_scaled, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32)
)

train_loader = DataLoader(trainset, batch_size=128, shuffle=True, drop_last=True)
test_loader = DataLoader(testset, batch_size=128)
len(train_loader), len(test_loader)


(44, 12)

In [11]:
import torch
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_model(model, dataloader, loss_fn, device="cpu"):
    """
    모델을 평가하고 Confusion Matrix 및 Classification Report를 생성하는 함수

    [Parameters]
        model: nn.Module - 평가할 모델
        dataloader: DataLoader - 평가할 데이터셋의 DataLoader
        loss_fn: 손실 함수
        device: str - 연산을 처리할 장치 (default: "cpu")

    [Returns]
        tuple: (평균 손실, 정확도, Confusion Matrix, Classification Report)
    """
    model.to(device)
    model.eval()
    
    all_preds = []
    all_labels = []
    total_loss = 0.0
    num_batches = len(dataloader)
    
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = loss_fn(outputs, y)
            total_loss += loss.item()
            
            # 이진 분류일 경우 0.5 기준으로 클래스 분류
            pred_labels = (outputs >= 0.5).type(torch.int)
            all_preds.extend(pred_labels.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
    
    # 평균 손실 계산
    avg_loss = float(total_loss / num_batches)
    accuracy = float(sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels))  # 정확도를 스칼라 값으로 계산
    
    # Confusion Matrix와 Classification Report 생성
    cm = confusion_matrix(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=['Class 0', 'Class 1'])
    
    # Confusion Matrix와 Classification Report 출력
    print("Confusion Matrix:\n", cm)
    print("\nClassification Report:\n", report)
    print(f"\nAverage Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
    
    return avg_loss, accuracy, cm, report


In [12]:
# 모델 불러오기
model = SmallModel()  # 모델 구조를 다시 정의
model.load_state_dict(torch.load("best_model.pth"))
model.to(device)

# 손실 함수 정의 (예: 이진 교차 엔트로피)
loss_fn = torch.nn.BCELoss()

# 모델 평가
test_loss, test_accuracy, cm, report = evaluate_model(
    model=model,
    dataloader=test_loader,
    loss_fn=loss_fn,
    device=device
)


Confusion Matrix:
 [[923 112]
 [177 197]]

Classification Report:
               precision    recall  f1-score   support

     Class 0       0.84      0.89      0.86      1035
     Class 1       0.64      0.53      0.58       374

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409


Average Loss: 0.3997, Accuracy: 0.7949


  accuracy = float(sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels))  # 정확도를 스칼라 값으로 계산
