# 1. 라이브러리 설치

In [None]:
'''!pip install torch
!pip install transformers
!pip install numpy'''

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from transformers import AutoModel, AutoTokenizer, AdamW
from transformers import get_linear_schedule_with_warmup
import warnings
import os
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


# 2. 데이터 로드

Task 1. 과제 Task 1에서 만든 파일을 본인의 구글 드라이브에 올린 후 파일의 URL을 Data URLs에 각각 맞춰 업데이트 해주세요.
기존에 적혀진 URL은 테스트를 위해 적은 양의 데이터만을 가진 것입니다.

In [15]:
'''# Task 1
# Data URLs
training_data_url = "https://drive.google.com/file/d/1qCQg3_4ZCAKy0duN51sKCNdcdvmYNnTo/view?usp=sharing"
valid_data_url = "https://drive.google.com/file/d/1rrQkVPkP2XSrGAlljhWs9VKAlSizDc3O/view?usp=sharing"
test_data_url = "https://drive.google.com/file/d/1z4GSfOABgyyX7Lp1oPQp2G33T3F6Majx/view?usp=sharing"

# Training Data
training_data_file_id = training_data_url.split("/")[-2]
!gdown $training_data_file_id

# Validation Data
valid_data_file_id = valid_data_url.split("/")[-2]
!gdown $valid_data_file_id

# Test Data
test_data_file_id = test_data_url.split("/")[-2]
!gdown $test_data_file_id'''

'# Task 1\n# Data URLs\ntraining_data_url = "https://drive.google.com/file/d/1qCQg3_4ZCAKy0duN51sKCNdcdvmYNnTo/view?usp=sharing"\nvalid_data_url = "https://drive.google.com/file/d/1rrQkVPkP2XSrGAlljhWs9VKAlSizDc3O/view?usp=sharing"\ntest_data_url = "https://drive.google.com/file/d/1z4GSfOABgyyX7Lp1oPQp2G33T3F6Majx/view?usp=sharing"\n\n# Training Data\ntraining_data_file_id = training_data_url.split("/")[-2]\n!gdown $training_data_file_id\n\n# Validation Data\nvalid_data_file_id = valid_data_url.split("/")[-2]\n!gdown $valid_data_file_id\n\n# Test Data\ntest_data_file_id = test_data_url.split("/")[-2]\n!gdown $test_data_file_id'

In [7]:
train = pd.read_csv("./processed_data/train.csv")
valid = pd.read_csv("./processed_data/valid.csv")
test = pd.read_csv("./processed_data/test.csv")

In [8]:
print(train.shape)
print(valid.shape)
print(test.shape)

(36744, 2)
(4083, 2)
(5122, 2)


In [9]:
train.head()

Unnamed: 0,sentence,label
0,직장에 새로운 신입사원이 입사를 했는데 알려줄게 너무 많아.,0
1,집 앞에 슈퍼를 갈 때도 나를 자꾸 데려가.,0
2,취직한 줄 알았던 아들이 알고 보니 백수였어. 오늘 피시방에서 놀고 있는 걸 발견했어.,3
3,내 악성 빈혈이 우리 손녀한테 유전되는 게 아닐까 걱정되네.,4
4,내 건강에 관련하여 고민이 되는 일이 있어.,2


In [13]:
device = torch.device("cuda:0")
print(device)

cuda:0


# 3. 모델 정의

In [16]:
roberta = AutoModel.from_pretrained("klue/roberta-small")
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-small")

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

Task 2. classifier에 맞는 내용을 코딩해주세요.

Hint: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html 

In [17]:
class RoBERTaClassifier(nn.Module):
    def __init__(self, roberta, hidden_size=768, num_classes=2):
        super(RoBERTaClassifier, self).__init__()
        self.roberta = roberta
        self.classifier = None # Task 2

    def forward(self, input_ids, attention_masks):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_masks)
        return self.classifier(outputs.pooler_output)

In [18]:
model = RoBERTaClassifier(roberta=roberta).to(device)

# 4. 데이터셋 정의

Task 3. ``document`` 변수에 적합한 코드를 작성하세요.

Hint: ``self.labels`` 코드를 참고하세요.

In [None]:
class KERDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        self.dataset = dataset
        document = None # Task 3
        inputs = tokenizer(document, padding=True)
        self.input_ids = inputs['input_ids']
        self.attention_masks = inputs['attention_mask']
        self.labels = self.dataset['label'].tolist()

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_masks[idx], self.labels[idx])       

In [None]:
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]
    labels = [item[2] for item in batch]
    return torch.LongTensor(input_ids), torch.LongTensor(attention_masks), torch.FloatTensor(labels)

In [None]:
train_ds = KERDataset(train, tokenizer)
valid_ds = KERDataset(valid, tokenizer)
test_ds = KERDataset(test, tokenizer)

In [None]:
batch_size = 32
warmup_ratio = 0.1
num_epochs = 5
log_interval = 400
learning_rate =  5e-5

In [None]:
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, num_workers=5, collate_fn=collate_fn)
valid_dataloader = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, num_workers=5, collate_fn=collate_fn)
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, num_workers=5, collate_fn=collate_fn)

In [None]:
total_steps = len(train_dataloader) * num_epochs
warmup_step = int(total_steps * warmup_ratio)

In [None]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_step,
                                            num_training_steps = total_steps)
loss_fn = nn.CrossEntropyLoss()

# 5. Training

In [None]:
def calc_accuracy(X,Y):
    correct = 0
    X, Y = X.tolist(), Y.tolist()
    for pred, label in zip(X, Y):
        if pred.index(max(pred)) == label.index(max(label)):
            correct += 1
    train_acc = correct/len(X)
    return train_acc

In [None]:
for e in range(num_epochs):
    train_acc = 0.0
    valid_acc = 0.0
    model.train()
    for batch_id, (input_ids, attention_masks, labels) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        out = model(input_ids=input_ids, attention_masks=attention_masks)
        labels = labels.tolist()
        labels = torch.FloatTensor([[0, 1] if l == 0 else [1, 0] for l in labels]).to(device)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, labels)
        
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    for batch_id, (input_ids, attention_masks, labels) in enumerate(tqdm_notebook(valid_dataloader)):
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = torch.FloatTensor([[0, 1] if l == 0 else [1, 0] for l in labels]).to(device)
        out = model(input_ids=input_ids, attention_masks=attention_masks)
        valid_acc += calc_accuracy(out, labels)
    print("epoch {} validation acc {}".format(e+1, valid_acc / (batch_id+1)))

# 6. Test

In [None]:
test_acc = 0.0
model.eval()
for batch_id, (input_ids, attention_masks, labels) in enumerate(tqdm_notebook(test_dataloader)):
    input_ids = input_ids.to(device)
    attention_masks = attention_masks.to(device)
    labels = torch.FloatTensor([[0, 1] if l == 0 else [1, 0] for l in labels]).to(device)
    out = model(input_ids=input_ids, attention_masks=attention_masks)
    test_acc += calc_accuracy(out, labels)
print("Test acc : {}".format(test_acc / (batch_id+1)))

Task 4. 기존 코드에서 문제점/아쉬운 점들에 대해서 논하세요. 

해당 내용은 여기 코드 및 결과와 함께 보고서로 작성해서 제출하세요.

해당 문제점에 대한 해결 방안 및 이를 실제로 적용한 코드 작성 및 결과를 첨부한 경우 가산점이 있습니다.