In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from torch import optim, nn

# Hyperparameters and runtime environment
epochs, batch_size = 5, 16

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the survey responses from the Excel workbook
excel_path = "대학생의 디지털 구독 서비스 관련 인식조사 데이터 정리_2024.04.18(구분).xlsx"
df = pd.read_excel(excel_path, sheet_name="응답정보(항목)")

# Text preprocessing, done as one chain:
#   1. keep only the free-text answer column and drop missing answers,
#   2. drop answers that are empty once surrounding whitespace is stripped,
#   3. rename the column to "text",
#   4. attach a placeholder label (every row gets class 1).
text_df = (
    df[["다양한 구독서비스 이용 이유"]]
    .dropna()
    .loc[lambda frame: frame["다양한 구독서비스 이용 이유"].str.strip() != ""]
    .rename(columns={"다양한 구독서비스 이용 이유": "text"})
    .assign(label=1)
)

# Data split: shuffle reproducibly, then cut 60% / 20% / 20% into
# train / validation / test.
text_df = text_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Positional slicing replaces np.split(DataFrame, ...): that call is routed
# through DataFrame.swapaxes, which is deprecated and emits a FutureWarning on
# recent pandas releases. The cut points are the same as before.
train_end = int(0.6 * len(text_df))
valid_end = int(0.8 * len(text_df))
train_df = text_df.iloc[:train_end]
valid_df = text_df.iloc[train_end:valid_end]
test_df = text_df.iloc[valid_end:]

# Initialize the BERT tokenizer (cased checkpoint, so lower-casing is disabled)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)

# Build a tensor dataset
def make_dataset(df, tokenizer, device):
    """Tokenize the ``text`` column and bundle it with ``label`` as a TensorDataset.

    Args:
        df: pandas DataFrame containing a ``text`` and a ``label`` column.
        tokenizer: Callable invoked with ``text=``, ``padding="longest"``,
            ``truncation=True`` and ``return_tensors="pt"``; its result must
            provide ``"input_ids"`` and ``"attention_mask"`` entries.
        device: Device that all three tensors are moved to.

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_mask, labels)`` where
        ``labels`` has dtype ``torch.long``.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
        ValueError: If the ``text`` or ``label`` column is missing.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("입력은 pandas DataFrame이어야 합니다.")
    if any(required not in df.columns for required in ("text", "label")):
        raise ValueError("DataFrame에 'text' 또는 'label' 열이 없습니다.")

    # Pad every sequence to the longest one in this frame.
    encoded = tokenizer(
        text=df["text"].tolist(),
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )
    label_tensor = torch.tensor(df["label"].values, dtype=torch.long)

    # Move all three tensors to the target device before wrapping them.
    columns = (encoded["input_ids"], encoded["attention_mask"], label_tensor)
    return TensorDataset(*(column.to(device) for column in columns))

# Build a data loader
def get_dataloader(dataset, sampler, batch_size):
    """Wrap ``dataset`` in a DataLoader driven by a freshly built sampler.

    Args:
        dataset: Dataset to iterate over.
        sampler: Sampler class (or factory); it is called with ``dataset`` to
            create the sampler instance.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``DataLoader``.
    """
    sampler_instance = sampler(dataset)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler_instance)

# Accuracy computation
def calc_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: Score tensor; the predicted class is the arg-max along dim 1.
        labels: Tensor of target class indices.

    Returns:
        The mean of the element-wise matches, computed with NumPy.
    """
    predicted = preds.argmax(dim=1).reshape(-1)
    matches = predicted.eq(labels.reshape(-1))
    return matches.cpu().numpy().mean()

# Training function
def train(model, optimizer, dataloader):
    """Run one optimization pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        optimizer: Optimizer that updates the model parameters.
        dataloader: Sized iterable yielding
            ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        ).loss
        batch_losses.append(loss.item())

        # Clear stale gradients, back-propagate, then apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return sum(batch_losses) / len(dataloader)

# Evaluation function
def evaluation(model, dataloader):
    """Compute the mean loss and accuracy of ``model`` over ``dataloader``.

    Both metrics are averaged per sample rather than per batch, so a smaller
    final batch is not over-weighted. The loss is computed once, from the
    logits; ``labels`` is deliberately not passed to the model, which avoids
    having the model compute a second, unused loss.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose ``.logits``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        A ``(mean_loss, accuracy)`` tuple of floats.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    model.eval()
    # Summing per batch and dividing by the sample count gives a true mean.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    total_loss, total_correct, total_samples = 0.0, 0, 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            total_loss += criterion(logits, labels).item()
            predicted = torch.argmax(logits, dim=1).flatten()
            total_correct += (predicted == labels.flatten()).sum().item()
            total_samples += labels.numel()
    return total_loss / total_samples, total_correct / total_samples

# Build the datasets and data loaders
train_dataset, valid_dataset, test_dataset = (
    make_dataset(split, tokenizer, device)
    for split in (train_df, valid_df, test_df)
)

# Only the training loader uses RandomSampler; validation and test are read
# sequentially.
train_loader = get_dataloader(dataset=train_dataset, sampler=RandomSampler, batch_size=batch_size)
valid_loader = get_dataloader(dataset=valid_dataset, sampler=SequentialSampler, batch_size=batch_size)
test_loader = get_dataloader(dataset=test_dataset, sampler=SequentialSampler, batch_size=batch_size)

# Declare the model: pretrained multilingual BERT with a two-class head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2,
)
model = model.to(device)

optimizer = optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-8)

# Training loop: train for `epochs` epochs and checkpoint the weights whenever
# the validation loss improves on the best value seen so far.
best_loss = float("inf")
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_loader)
    val_loss, val_acc = evaluation(model, valid_loader)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "best_bert_model.pt")
        print("Best model saved.")

# Test evaluation using the best checkpoint.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU can still be restored on a CPU-only runtime.
model.load_state_dict(torch.load("best_bert_model.pt", map_location=device))
test_loss, test_acc = evaluation(model, test_loader)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train Loss = 0.2790, Val Loss = 0.0505, Val Acc = 1.0000
Best model saved.
Epoch 2: Train Loss = 0.0334, Val Loss = 0.0138, Val Acc = 1.0000
Best model saved.
Epoch 3: Train Loss = 0.0171, Val Loss = 0.0075, Val Acc = 1.0000
Best model saved.
Epoch 4: Train Loss = 0.0101, Val Loss = 0.0052, Val Acc = 1.0000
Best model saved.
Epoch 5: Train Loss = 0.0062, Val Loss = 0.0040, Val Acc = 1.0000
Best model saved.
Test Loss: 0.0040
Test Accuracy: 1.0000
