In [1]:
!git clone https://github.com/e9t/nsmc.git

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Counting objects: 100% (14762/14762), done.[K
remote: Compressing objects: 100% (13012/13012), done.[K
remote: Total 14763 (delta 1748), reused 14762 (delta 1748), pack-reused 1[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 17.03 MiB/s, done.
Resolving deltas: 100% (1748/1748), done.
Updating files: 100% (14737/14737), done.


In [2]:
import pandas as pd

df = pd.read_csv("./nsmc/ratings.txt", delimiter='\t', quoting=3); df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        200000 non-null  int64 
 1   document  199992 non-null  object
 2   label     200000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


In [3]:
df.dropna(inplace = True)

결측치 존재, 결측치 제거하는 eda 진행

In [4]:
!pip install konlpy

from konlpy.tag import Okt
okt = Okt()
df['documnet'] = df['document'].map(lambda x: ' '.join(okt.morphs(x, stem = True)))

Collecting konlpy
  Using cached konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
Collecting lxml>=4.1.0
  Using cached lxml-5.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.0 MB)
Collecting JPype1>=0.7.0
  Using cached JPype1-1.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
Installing collected packages: lxml, JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0 lxml-5.1.0


위 코드에서 ‘ ‘.join을 하여 다시 하나의 문자열로 반환하는 이유-> 텍스트 정규화를 진행하기 위해서

온라인 텍스트 데이터 특성상 오탈자, 비문 등이 많이 존재합니다. 따라서 우선 토큰화 이후 띄어쓰기를 하면서 정규화 작업을 진행합니다. 해당 작업만으로도 1-2%의 성능 개선이 있다.

In [5]:
# 데이터 양이 많아 2만개의 데이터만 사용하자
df = pd.concat([df.iloc[:10000], df.iloc[-10000:]])

## 모델학습

In [6]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_texts = train_df['document'].astype(str).tolist() # 문자열 데이터로 명시 후 리스트 화
train_labels = train_df['label'].tolist()
test_texts = test_df['document'].astype(str).tolist()
test_labels = test_df['label'].tolist()

## BERT 토크나이저 불러오기

In [7]:
!pip install transformers

from transformers import BertTokenizer, BertForSequenceClassification

model_name = 'monologg/kobert'
tokenizer = BertTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
# truncation=True 너무 긴 입력을 모델에 맞게 자르겠다는 의미
# padding=True는 모든 입력이 같은 길이를 갖도록 패딩을 추가하겠다는 의미
test_encodings = tokenizer(test_texts, truncation=True, padding=True)



  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 51.0/51.0 [00:00<00:00, 4.40kB/s]
vocab.txt: 100%|██████████| 77.8k/77.8k [00:00<00:00, 446kB/s]
config.json: 100%|██████████| 426/426 [00:00<00:00, 209kB/s]


In [8]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

bert 모델 불러오기
분류를 위한 모델

In [9]:
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2) # 0, 1로 분류하기 때문에 레이블은 2개로 지정합니다.

model.safetensors: 100%|██████████| 369M/369M [00:06<00:00, 53.8MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 모델 훈련

In [10]:
from tqdm.auto import tqdm # 반복문이 얼마나 진행되었는지 알 수 있도록 프로그레스바를 표시합니다.

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # GPU 사용이 가능한 경우 설정

num_epochs = 10
learning_rate = 2e-5 #2e-5는 0.00002
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()
model.to(device) # GPU 사용이 가능한 경우

for epoch in range(num_epochs):
    model.train() # 훈련 모드 지정
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")

100%|██████████| 250/250 [01:40<00:00,  2.49it/s]


Epoch 1/10 - Average Loss: 0.6729


100%|██████████| 250/250 [01:40<00:00,  2.50it/s]


Epoch 2/10 - Average Loss: 0.6453


100%|██████████| 250/250 [01:40<00:00,  2.49it/s]


Epoch 3/10 - Average Loss: 0.6264


100%|██████████| 250/250 [01:39<00:00,  2.50it/s]


Epoch 4/10 - Average Loss: 0.6092


100%|██████████| 250/250 [01:40<00:00,  2.49it/s]


Epoch 5/10 - Average Loss: 0.5877


100%|██████████| 250/250 [01:40<00:00,  2.49it/s]


Epoch 6/10 - Average Loss: 0.5658


100%|██████████| 250/250 [01:41<00:00,  2.47it/s]


Epoch 7/10 - Average Loss: 0.5626


100%|██████████| 250/250 [01:40<00:00,  2.49it/s]


Epoch 8/10 - Average Loss: 0.5281


100%|██████████| 250/250 [01:41<00:00,  2.47it/s]


Epoch 9/10 - Average Loss: 0.5077


100%|██████████| 250/250 [01:40<00:00,  2.49it/s]

Epoch 10/10 - Average Loss: 0.4860





## 모델 테스트

In [11]:
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted_labels = torch.max(outputs.logits, dim=1)

        correct_predictions += torch.sum(predicted_labels == labels).item()
        total_predictions += labels.size(0)

accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.6212


## 모델추론

In [12]:
input_text = '이 영화 진짜 재밌다'
input_encoding = tokenizer.encode_plus(
    input_text,
    truncation=True,
    padding=True,
    return_tensors='pt'
)

input_ids = input_encoding['input_ids'].to(device)
attention_mask = input_encoding['attention_mask'].to(device)

model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    _, predicted_labels = torch.max(outputs.logits, dim=1)
predicted_labels = predicted_labels.item()

print(predicted_labels)

1
