# Train

In [1]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from transformers import BertForSequenceClassification
from kobert_tokenizer import KoBERTTokenizer
from tqdm import tqdm

base_model = 'skt/kobert-base-v1'
MODEL_NAME = "./save/model_v1.pth"  # 불러올 모델 가중치
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터셋 클래스 정의
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 예제 학습 데이터프레임
df = pd.read_excel('./data/train_set.xlsx')

texts = df['question'].tolist()
df.label = df.label.map({'yes': 1, 'no': 0})
labels = df['label'].tolist()

max_len = 128
batch_size = 2

tokenizer = KoBERTTokenizer.from_pretrained(base_model)
dataset = SentimentDataset(texts, labels, tokenizer, max_len)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 모델 로드 및 가중치 초기화
model = BertForSequenceClassification.from_pretrained(base_model, num_labels=2)  # 모델의 아키텍처 설정
model_save_path = MODEL_NAME  # 가중치 불러오기
model.load_state_dict(torch.load(model_save_path))

# 학습 설정
model = model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss().to(device)

# 학습 함수 정의
def train_epoch(model, data_loader, optimizer, device, n_examples):
    model = model.train()
    losses = 0
    correct_predictions = 0

    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, losses / n_examples

# 학습 실행
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        model,
        data_loader,
        optimizer,
        device,
        len(dataset)
    )
    print(f'Train loss {train_loss:.3f} accuracy {train_acc:.3f}')
    
# 모델 저장 경로 설정
def get_model_save_path(base_path, base_filename, ext):
    version = 1
    while True:
        model_save_path = f"{base_path}/{base_filename}_v{version}{ext}"
        if not os.path.exists(model_save_path):
            return model_save_path
        version += 1

# 모델 저장
base_path = './save'
base_filename = 'model'
ext = '.pth'

model_save_path = get_model_save_path(base_path, base_filename, ext)
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------


100%|██████████| 379/379 [01:09<00:00,  5.43it/s]


Train loss 0.367 accuracy 0.475
Epoch 2/3
----------


100%|██████████| 379/379 [01:09<00:00,  5.45it/s]


Train loss 0.354 accuracy 0.515
Epoch 3/3
----------


100%|██████████| 379/379 [01:09<00:00,  5.45it/s]


Train loss 0.357 accuracy 0.484
Model saved to ./save/model_v3.pth


# Train-Valid

In [2]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from transformers import BertForSequenceClassification
from kobert_tokenizer import KoBERTTokenizer
from tqdm import tqdm

base_model = 'skt/kobert-base-v1'
MODEL_NAME = "./save/model_v1.pth"  # 불러올 모델 가중치
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터셋 클래스 정의
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 예제 학습 데이터프레임
train_df = pd.read_excel('./data/train_set.xlsx')
valid_df = pd.read_excel('./data/valid_label_set.xlsx')

train_texts = train_df['question'].tolist()
train_df.label = train_df.label.map({'yes': 1, 'no': 0})
train_labels = train_df['label'].tolist()

valid_texts = valid_df['question'].tolist()
valid_df.label = valid_df.label.map({'yes': 1, 'no': 0})
valid_labels = valid_df['label'].tolist()

max_len = 128
batch_size = 2

tokenizer = KoBERTTokenizer.from_pretrained(base_model)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_len)
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = SentimentDataset(valid_texts, valid_labels, tokenizer, max_len)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# 모델 로드 및 가중치 초기화
model = BertForSequenceClassification.from_pretrained(base_model, num_labels=2)  # 모델의 아키텍처 설정
model_save_path = MODEL_NAME  # 가중치 불러오기
model.load_state_dict(torch.load(model_save_path))

# 학습 설정
model = model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss().to(device)

# 학습 함수 정의
def train_epoch(model, data_loader, optimizer, device, n_examples):
    model = model.train()
    losses = 0
    correct_predictions = 0

    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, losses / n_examples
  
# 검증 함수 정의
def evaluate(model, data_loader, device):
    model = model.eval()
    correct_predictions = 0
    losses = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            loss = loss_fn(logits, labels)

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses += loss.item()

    return correct_predictions.double() / len(data_loader.dataset), losses / len(data_loader.dataset)  
    
# 모델 저장 경로 설정
def get_model_save_path(base_path, base_filename, ext):
    version = 1
    while True:
        model_save_path = f"{base_path}/{base_filename}_v{version}{ext}"
        if not os.path.exists(model_save_path):
            return model_save_path
        version += 1

# 조기 종료 설정
early_stopping_patience = 3
best_accuracy = 0
epochs_without_improvement = 0

# 모델 저장 위치 설정
base_path = './save'
base_filename = 'model'
ext = '.pth'

# 학습 실행
epochs = 20
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        model,
        data_loader,
        optimizer,
        device,
        len(dataset)
    )
    print(f'Train loss {train_loss:.3f} Train accuracy {train_acc:.3f}')
    
    val_acc, val_loss = evaluate(model, valid_loader, device)
    print(f'Validation loss: {val_loss:.3f}, Validation accuracy: {val_acc:.3f}')

    # 모델 저장
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        epochs_without_improvement = 0
        model_save_path = get_model_save_path(base_path, base_filename, ext)
        torch.save(model.state_dict(), model_save_path)
        print(f'Model saved to {model_save_path}')
        
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= early_stopping_patience:
        print("Early stopping")
        break

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
----------


100%|██████████| 379/379 [01:09<00:00,  5.44it/s]


Train loss 0.361 accuracy 0.497
Validation loss: 0.352, accuracy: 0.500
Model saved to ./save/model_v4.pth
Epoch 2/20
----------


100%|██████████| 379/379 [01:09<00:00,  5.47it/s]


Train loss 0.358 accuracy 0.496
Validation loss: 0.347, accuracy: 0.500
Epoch 3/20
----------


100%|██████████| 379/379 [01:09<00:00,  5.43it/s]


Train loss 0.358 accuracy 0.489
Validation loss: 0.347, accuracy: 0.500
Epoch 4/20
----------


100%|██████████| 379/379 [01:09<00:00,  5.45it/s]


Train loss 0.355 accuracy 0.511
Validation loss: 0.361, accuracy: 0.500
Early stopping


# Valid - label 없는 것 붙이기

In [8]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import pandas as pd
from transformers import BertForSequenceClassification
from kobert_tokenizer import KoBERTTokenizer
from tqdm import tqdm

base_model = 'skt/kobert-base-v1'
MODEL_NAME = "./save/model_v4.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 검증할 데이터 정의
class QuestionDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        
# 모델 불러오기
model = BertForSequenceClassification.from_pretrained(base_model, num_labels=2)  # 모델의 아키텍처 설정
model_save_path = MODEL_NAME  # 가중치 불러오기
model.load_state_dict(torch.load(model_save_path))
model = model.to(device)

# 검증 데이터 준비
validation_df = pd.read_excel('./data/valid_set.xlsx')
validation_texts = validation_df['question'].tolist()

max_len = 128
batch_size = 2

tokenizer = KoBERTTokenizer.from_pretrained(base_model)
validation_dataset = QuestionDataset(validation_texts, tokenizer, max_len)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

# 예측 함수 정의
def predict(model, data_loader, device):
    model = model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
            predictions.extend(preds.cpu().numpy())

    return predictions

# 예측 수행 및 결과 출력
predictions = predict(model, validation_loader, device)

for text, pred in zip(validation_texts, predictions):
    label = "yes" if pred == 1 else "no"
    print(f"문장: {text} -> 예측된 레이블: {label}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
100%|██████████| 25/25 [00:01<00:00, 24.42it/s]

문장: 파이썬에서 리스트와 튜플의 차이는 무엇인가요? -> 예측된 레이블: no
문장: 파이썬에서 딕셔너리를 어떻게 정의하나요? -> 예측된 레이블: no
문장: 파이썬에서 클래스는 어떻게 생성하나요? -> 예측된 레이블: no
문장: 파이썬에서 파일을 읽고 쓰는 방법은 무엇인가요? -> 예측된 레이블: no
문장: 파이썬의 주요 내장 함수들은 무엇이 있나요? -> 예측된 레이블: no
문장: 파이썬에서 for 루프의 기본 구조는 어떻게 되나요? -> 예측된 레이블: no
문장: 파이썬에서 예외 처리는 어떻게 하나요? -> 예측된 레이블: no
문장: 파이썬에서 패키지를 설치하는 명령어는 무엇인가요? -> 예측된 레이블: no
문장: 파이썬의 주요 데이터 타입에는 무엇이 있나요? -> 예측된 레이블: no
문장: 파이썬에서 함수는 어떻게 정의하나요? -> 예측된 레이블: no
문장: 파이썬에서 리스트의 요소를 정렬하는 방법은 무엇인가요? -> 예측된 레이블: no
문장: 파이썬에서 문자열 포매팅은 어떻게 하나요? -> 예측된 레이블: no
문장: 파이썬에서 lambda 함수는 무엇인가요? -> 예측된 레이블: no
문장: 파이썬에서 모듈과 패키지의 차이는 무엇인가요? -> 예측된 레이블: no
문장: 파이썬에서 글로벌 변수와 로컬 변수의 차이는 무엇인가요? -> 예측된 레이블: no
문장: 파이썬의 주요 표준 라이브러리에는 무엇이 있나요? -> 예측된 레이블: no
문장: 파이썬에서 set 자료형은 어떻게 사용하나요? -> 예측된 레이블: no
문장: 파이썬에서 map 함수는 어떻게 사용하나요? -> 예측된 레이블: no
문장: 파이썬에서 데코레이터는 무엇인가요? -> 예측된 레이블: no
문장: 파이썬에서 제너레이터는 무엇인가요? -> 예측된 레이블: no
문장: 파이썬에서 리스트 컴프리헨션은 어떻게 사용하나요? -> 예측된 레이블: no
문장: 파이썬에서 가비지 컬렉션은 어떻게 동작하나요? -> 예측된 레이블: no
문장: 파이썬에서 멀티스레딩을 구현하는 


