# 1. 라이브러리 설치

In [None]:
'''!pip install torch
!pip install transformers
!pip install numpy'''

In [20]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from transformers import AutoModel, AutoTokenizer, AdamW
from transformers import get_linear_schedule_with_warmup
import warnings
import os
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 2. 데이터 로드

Task 1. 과제 Task 1에서 만든 파일을 본인의 구글 드라이브에 올린 후 파일의 URL을 Data URLs에 각각 맞춰 업데이트 해주세요.
기존에 적혀진 URL은 테스트를 위해 적은 양의 데이터만을 가진 것입니다.

In [21]:
'''# Task 1
# Data URLs
training_data_url = "https://drive.google.com/file/d/1qCQg3_4ZCAKy0duN51sKCNdcdvmYNnTo/view?usp=sharing"
valid_data_url = "https://drive.google.com/file/d/1rrQkVPkP2XSrGAlljhWs9VKAlSizDc3O/view?usp=sharing"
test_data_url = "https://drive.google.com/file/d/1z4GSfOABgyyX7Lp1oPQp2G33T3F6Majx/view?usp=sharing"

# Training Data
training_data_file_id = training_data_url.split("/")[-2]
!gdown $training_data_file_id

# Validation Data
valid_data_file_id = valid_data_url.split("/")[-2]
!gdown $valid_data_file_id

# Test Data
test_data_file_id = test_data_url.split("/")[-2]
!gdown $test_data_file_id'''

'# Task 1\n# Data URLs\ntraining_data_url = "https://drive.google.com/file/d/1qCQg3_4ZCAKy0duN51sKCNdcdvmYNnTo/view?usp=sharing"\nvalid_data_url = "https://drive.google.com/file/d/1rrQkVPkP2XSrGAlljhWs9VKAlSizDc3O/view?usp=sharing"\ntest_data_url = "https://drive.google.com/file/d/1z4GSfOABgyyX7Lp1oPQp2G33T3F6Majx/view?usp=sharing"\n\n# Training Data\ntraining_data_file_id = training_data_url.split("/")[-2]\n!gdown $training_data_file_id\n\n# Validation Data\nvalid_data_file_id = valid_data_url.split("/")[-2]\n!gdown $valid_data_file_id\n\n# Test Data\ntest_data_file_id = test_data_url.split("/")[-2]\n!gdown $test_data_file_id'

In [22]:
train = pd.read_csv("./processed_data/train.csv")
valid = pd.read_csv("./processed_data/valid.csv")
test = pd.read_csv("./processed_data/test.csv")

In [23]:
print(train.shape)
print(valid.shape)
print(test.shape)

(36744, 2)
(4083, 2)
(5122, 2)


In [24]:
train.head()

Unnamed: 0,sentence,label
0,직장에 새로운 신입사원이 입사를 했는데 알려줄게 너무 많아.,0
1,집 앞에 슈퍼를 갈 때도 나를 자꾸 데려가.,0
2,취직한 줄 알았던 아들이 알고 보니 백수였어. 오늘 피시방에서 놀고 있는 걸 발견했어.,3
3,내 악성 빈혈이 우리 손녀한테 유전되는 게 아닐까 걱정되네.,4
4,내 건강에 관련하여 고민이 되는 일이 있어.,2


In [25]:
device = torch.device("cuda:0")
print(device)

cuda:0


In [44]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060'

# 3. 모델 정의

In [26]:
roberta = AutoModel.from_pretrained("klue/roberta-small")
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-small")

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

Task 2. classifier에 맞는 내용을 코딩해주세요.

Hint: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html 

In [27]:
class RoBERTaClassifier(nn.Module):
    def __init__(self, roberta, hidden_size=768, num_classes=2):
        super(RoBERTaClassifier, self).__init__()
        self.roberta = roberta
        self.classifier = nn.Linear(hidden_size, num_classes) # Task 2

    def forward(self, input_ids, attention_masks):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_masks)
        return self.classifier(outputs.pooler_output)

In [28]:
model = RoBERTaClassifier(roberta=roberta).to(device)

# 4. 데이터셋 정의

Task 3. ``document`` 변수에 적합한 코드를 작성하세요.

Hint: ``self.labels`` 코드를 참고하세요.

In [29]:
class KERDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        self.dataset = dataset
        document = self.dataset['sentence'].tolist() # Task 3
        inputs = tokenizer(document, padding=True)
        self.input_ids = inputs['input_ids']
        self.attention_masks = inputs['attention_mask']
        self.labels = self.dataset['label'].tolist()

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_masks[idx], self.labels[idx])       

In [30]:
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]
    labels = [item[2] for item in batch]
    return torch.LongTensor(input_ids), torch.LongTensor(attention_masks), torch.FloatTensor(labels)

In [31]:
train_ds = KERDataset(train, tokenizer)
valid_ds = KERDataset(valid, tokenizer)
test_ds = KERDataset(test, tokenizer)

In [32]:
batch_size = 32
warmup_ratio = 0.1
num_epochs = 5
log_interval = 400
learning_rate =  5e-5

In [39]:
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)
valid_dataloader = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

In [40]:
total_steps = len(train_dataloader) * num_epochs
warmup_step = int(total_steps * warmup_ratio)

In [41]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_step,
                                            num_training_steps = total_steps)
loss_fn = nn.CrossEntropyLoss()

# 5. Training

In [42]:
def calc_accuracy(X,Y):
    correct = 0
    X, Y = X.tolist(), Y.tolist()
    for pred, label in zip(X, Y):
        if pred.index(max(pred)) == label.index(max(label)):
            correct += 1
    train_acc = correct/len(X)
    return train_acc

In [45]:
for e in range(num_epochs):
    train_acc = 0.0
    valid_acc = 0.0
    model.train()
    for batch_id, (input_ids, attention_masks, labels) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        out = model(input_ids=input_ids, attention_masks=attention_masks)
        labels = labels.tolist()
        labels = torch.FloatTensor([[0, 1] if l == 0 else [1, 0] for l in labels]).to(device)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, labels)
        
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    for batch_id, (input_ids, attention_masks, labels) in enumerate(tqdm_notebook(valid_dataloader)):
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = torch.FloatTensor([[0, 1] if l == 0 else [1, 0] for l in labels]).to(device)
        out = model(input_ids=input_ids, attention_masks=attention_masks)
        valid_acc += calc_accuracy(out, labels)
    print("epoch {} validation acc {}".format(e+1, valid_acc / (batch_id+1)))

  0%|          | 1/1149 [00:00<10:52,  1.76it/s]

epoch 1 batch id 1 loss 0.4483993351459503 train acc 0.75


 35%|███▍      | 401/1149 [02:20<04:11,  2.97it/s]

epoch 1 batch id 401 loss 0.1895814836025238 train acc 0.8890274314214464


 70%|██████▉   | 801/1149 [04:34<01:57,  2.96it/s]

epoch 1 batch id 801 loss 0.2103106677532196 train acc 0.8901763420724095


100%|██████████| 1149/1149 [06:31<00:00,  2.94it/s]

epoch 1 train acc 0.8902850304612707





  0%|          | 0/128 [00:00<?, ?it/s]

epoch 1 validation acc 0.890856291118421


  0%|          | 1/1149 [00:00<06:24,  2.99it/s]

epoch 2 batch id 1 loss 0.37695440649986267 train acc 0.78125


 35%|███▍      | 401/1149 [02:15<04:15,  2.93it/s]

epoch 2 batch id 401 loss 0.12329535186290741 train acc 0.9089775561097256


 70%|██████▉   | 801/1149 [04:32<01:57,  2.97it/s]

epoch 2 batch id 801 loss 0.17675809562206268 train acc 0.9079666042446941


100%|██████████| 1149/1149 [06:28<00:00,  2.96it/s]

epoch 2 train acc 0.9082354221061792





  0%|          | 0/128 [00:00<?, ?it/s]

epoch 2 validation acc 0.884752775493421


  0%|          | 1/1149 [00:00<06:23,  2.99it/s]

epoch 3 batch id 1 loss 0.22549401223659515 train acc 0.875


 35%|███▍      | 401/1149 [02:14<04:10,  2.98it/s]

epoch 3 batch id 401 loss 0.1414034217596054 train acc 0.9377337905236908


 70%|██████▉   | 801/1149 [04:29<01:56,  2.98it/s]

epoch 3 batch id 801 loss 0.08058863878250122 train acc 0.9342618601747815


100%|██████████| 1149/1149 [06:26<00:00,  2.97it/s]

epoch 3 train acc 0.9337467362924282





  0%|          | 0/128 [00:00<?, ?it/s]

epoch 3 validation acc 0.879381681743421


  0%|          | 1/1149 [00:00<06:23,  3.00it/s]

epoch 4 batch id 1 loss 0.15516173839569092 train acc 0.9375


 35%|███▍      | 401/1149 [02:14<04:10,  2.98it/s]

epoch 4 batch id 401 loss 0.08782632648944855 train acc 0.9569825436408977


 70%|██████▉   | 801/1149 [04:29<01:58,  2.94it/s]

epoch 4 batch id 801 loss 0.18519358336925507 train acc 0.954939138576779


100%|██████████| 1149/1149 [06:28<00:00,  2.96it/s]

epoch 4 train acc 0.9550424281984334





  0%|          | 0/128 [00:00<?, ?it/s]

epoch 4 validation acc 0.882555509868421


  0%|          | 1/1149 [00:00<06:28,  2.96it/s]

epoch 5 batch id 1 loss 0.16212239861488342 train acc 0.9375


 35%|███▍      | 401/1149 [02:14<04:09,  3.00it/s]

epoch 5 batch id 401 loss 0.0511215403676033 train acc 0.9680486284289277


 70%|██████▉   | 801/1149 [04:28<01:58,  2.94it/s]

epoch 5 batch id 801 loss 0.18836916983127594 train acc 0.9657069288389513


100%|██████████| 1149/1149 [06:25<00:00,  2.98it/s]

epoch 5 train acc 0.9645343777197564





  0%|          | 0/128 [00:00<?, ?it/s]

epoch 5 validation acc 0.8870271381578947


# 6. Test

In [46]:
test_acc = 0.0
model.eval()
for batch_id, (input_ids, attention_masks, labels) in enumerate(tqdm_notebook(test_dataloader)):
    input_ids = input_ids.to(device)
    attention_masks = attention_masks.to(device)
    labels = torch.FloatTensor([[0, 1] if l == 0 else [1, 0] for l in labels]).to(device)
    out = model(input_ids=input_ids, attention_masks=attention_masks)
    test_acc += calc_accuracy(out, labels)
print("Test acc : {}".format(test_acc / (batch_id+1)))

  0%|          | 0/161 [00:00<?, ?it/s]

Test acc : 0.8852872670807453


Task 4. 기존 코드에서 문제점/아쉬운 점들에 대해서 논하세요. 

해당 내용은 여기 코드 및 결과와 함께 보고서로 작성해서 제출하세요.

해당 문제점에 대한 해결 방안 및 이를 실제로 적용한 코드 작성 및 결과를 첨부한 경우 가산점이 있습니다.