<a href="https://colab.research.google.com/github/Oh-Seokjin/Text_Classifier_goorm/blob/main/BERT_wandb%EC%82%AC%EC%9A%A9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import requirements

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 15.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.5 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 85.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [None]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW
)

In [None]:
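# The tokenizer and the model must be swappable together.
# The tokenizer is trained along with BERT during pre-training.
# Each model's tokenizer works in a slightly different way.
# They move as a pair (always use the tokenizer that matches the model).
# Look this up in the Hugging Face documentation.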

In [None]:
from transformers import AutoTokenizer

# 1. Preprocess

In [None]:
def make_id_file(task, tokenizer):
    """Encode the four sentiment split files into space-joined token-id strings.

    The files 'sentiment.train.1', 'sentiment.train.0', 'sentiment.dev.1' and
    'sentiment.dev.0' are read, in that order, from the current working directory.

    Args:
        task: Not used by this function; kept so existing callers keep working.
        tokenizer: Object exposing ``encode(text)`` that returns an iterable of token ids.

    Returns:
        Tuple ``(train_pos, train_neg, dev_pos, dev_neg)``. Each element is a list
        holding one string per input line, with that line's ids joined by single spaces.
    """
    def _encode_file(path):
        # One output string per line; every line is lower-cased before it is encoded.
        with open(os.path.join(path), 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()
        return [' '.join(map(str, tokenizer.encode(raw.lower()))) for raw in raw_lines]

    print('it will take some times...')
    split_files = ('sentiment.train.1', 'sentiment.train.0',
                   'sentiment.dev.1', 'sentiment.dev.0')
    train_pos, train_neg, dev_pos, dev_neg = [_encode_file(name) for name in split_files]

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [None]:
def make_data_strings(file_name):
    """Encode each line of ``file_name`` into a space-joined string of token ids.

    Uses the module-level ``tokenizer`` (looked up when the function is called),
    which must provide ``encode(text)``. Each line is lower-cased before encoding.

    Args:
        file_name: Path of a UTF-8 text file with one sentence per line.

    Returns:
        List with one string per line of the file.
    """
    with open(os.path.join(file_name), 'r', encoding='utf-8') as handle:
        encoded_lines = [tokenizer.encode(raw.lower()) for raw in handle.readlines()]
    return [' '.join(str(token_id) for token_id in ids) for ids in encoded_lines]

In [None]:
# Is data preprocessing really meaningful here?
# Visualize the attention values.
# Search for "huggingface visualize attention".
# Tokens that are not decisive do not receive attention.

In [None]:
# Exploring the data before modeling has both pros and cons.

In [None]:
# for i, encoded in enumerate(train_pos):
#     encoded = encoded.split(' ')
#     line = ""
#     for word in encoded:
#         line += tokenizer.decode(int(word))
#         line += ' '
#     print(line)
#     if i == 10:
#         break

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint used for the model.
# `num_labels` is a model/config option, not a tokenizer option, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Upload the data files into the Colab session; `uploaded` holds what files.upload() returns.
from google.colab import files
uploaded = files.upload()

Saving sentiment.dev.0 to sentiment.dev.0
Saving sentiment.dev.1 to sentiment.dev.1
Saving sentiment.train.0 to sentiment.train.0
Saving sentiment.train.1 to sentiment.train.1
Saving test_no_label.csv to test_no_label.csv


In [None]:
# !ls

In [None]:
# Tokenize the train/dev splits once up front. 'yelp' is passed as the `task`
# argument, which make_id_file does not use.
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

it will take some times...
make id file finished!


In [None]:
# train_pos[:10]

In [None]:
class SentimentDataset(object):
    """In-memory sentiment dataset built from space-separated token-id strings.

    Positive sentences are stored first with label ``[1]``, followed by the
    negative sentences with label ``[0]``.
    """

    def __init__(self, tokenizer, pos, neg):
        """Parse the encoded sentences and attach their labels.

        Args:
            tokenizer: Stored on the instance; not otherwise used by this class.
            pos: Iterable of positive examples, each a string of whitespace-separated ids.
            neg: Iterable of negative examples in the same format.
        """
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        for sentences, target in ((pos, 1), (neg, 0)):
            for sentence in sentences:
                self.data.append(self._cast_to_int(sentence.strip().split()))
                self.label.append([target])

    def _cast_to_int(self, sample):
        """Convert a sequence of id strings into a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(ids, label)`` for ``index`` as two NumPy arrays."""
        return np.array(self.data[index]), np.array(self.label[index])

In [None]:
# Build the labeled train and dev datasets from the encoded positive/negative splits.
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [None]:
# for i, item in enumerate(train_dataset):
#     print(item)
#     if i == 10:
#         break

In [None]:
def collate_fn_style(samples):
    """Collate ``(input_ids, label)`` samples into padded batch tensors.

    Sequences are ordered from longest to shortest and right-padded with 0 up to
    the longest length in the batch. Labels are reordered the same way so they
    stay aligned with their sequences.

    Args:
        samples: Sequence of ``(input_ids, label)`` pairs, where ``input_ids`` is a
            1-D array of token ids and ``label`` is an array holding the label.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``.
        The first four are ``(batch, max_len)`` integer tensors. ``attention_mask``
        is 1 on real tokens and 0 on padding, ``token_type_ids`` is all zeros and
        ``position_ids`` counts ``0..max_len-1`` in every row.
    """
    sequences, labels = zip(*samples)
    # Take the lengths from the raw sequences. Once padded, every row has length
    # max_len, so measuring the padded tensor would yield an all-ones mask.
    lengths = [len(sequence) for sequence in sequences]
    max_len = max(lengths)
    sorted_indices = np.argsort(lengths)[::-1]

    input_ids = pad_sequence([torch.tensor(sequences[index]) for index in sorted_indices],
                             batch_first=True)
    attention_mask = torch.tensor(
        [[1] * lengths[index] + [0] * (max_len - lengths[index]) for index in sorted_indices])

    token_type_ids = torch.tensor([[0] * max_len for _ in sorted_indices])
    position_ids = torch.tensor([list(range(max_len)) for _ in sorted_indices])
    labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids, labels

In [None]:
# from tensorflow.python.client import device_lib
# device_lib.list_local_devices() # check which GPU is available

In [None]:
# !cat /proc/meminfo   # Check memory; after checking both (GPU and memory), increase batch_size if there is room.

In [None]:
!pip install wandb

In [None]:
# Import Weights & Biases and log in before any run is started.
import wandb
wandb.login()

In [None]:
# Studying the code and the hyperparameters; experiments, analysis results, and trial and error.

In [None]:
# Seed the NumPy and PyTorch random number generators (an earlier run used 42).
# random_seed=42
random_seed=33
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained 'bert-base-uncased' weights into a sequence-classification
# model and move it to `device`.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)

In [None]:
def compute_acc(predictions, target_labels):
    """Return the fraction of positions where the prediction equals the target.

    Args:
        predictions: Sequence of predicted labels.
        target_labels: Sequence of reference labels, same length as ``predictions``.

    Returns:
        Mean of the element-wise equality between the two sequences.
    """
    matches = np.array(predictions) == np.array(target_labels)
    return matches.mean()

In [None]:
model.train()
# Hyperparameter grid: every (learning rate, batch size) pair is logged as its own wandb run.
# Earlier note: learning_rate = 5e-5 -> might be too small; try raising it a bit.
learning_rates = [2e-5, 3e-5]
batch_sizes = [256, 64, 128]

# NOTE(review): `model` is created once, outside this loop, so each configuration
# continues from the weights left by the previous one. Confirm that is intended
# before comparing the wandb runs against each other.
for learning_rate in learning_rates:
    for batch_size in batch_sizes:

        optimizer = AdamW(model.parameters(), lr=learning_rate)

        train_epoch = 3
        lowest_valid_loss = 9999.

        # The same batch size is used for training and evaluation (32 was used earlier).
        train_batch_size = batch_size
        eval_batch_size = batch_size

        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=train_batch_size,
                                                   shuffle=True, collate_fn=collate_fn_style,
                                                   pin_memory=True, num_workers=2)
        dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                                 shuffle=False, collate_fn=collate_fn_style,
                                                 num_workers=2)

        # Evaluate roughly five times per epoch. Clamp to 1 so a loader with fewer
        # than five batches does not lead to a modulo-by-zero below.
        eval_interval = max(1, len(train_loader) // 5)

        wandb.init()
        wandb.run.name = 'batch_'+str(batch_size)+'_lr_'+str(learning_rate)
        for epoch in range(train_epoch):
            with tqdm(train_loader, unit="batch") as tepoch:
                for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
                    tepoch.set_description(f"Epoch {epoch}")

                    input_ids = input_ids.to(device)
                    attention_mask = attention_mask.to(device)
                    token_type_ids = token_type_ids.to(device)
                    position_ids = position_ids.to(device)
                    labels = labels.to(device, dtype=torch.long)

                    optimizer.zero_grad()

                    output = model(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   token_type_ids=token_type_ids,
                                   position_ids=position_ids,
                                   labels=labels)

                    loss = output.loss
                    # wandb.log({"train_loss":loss.item()})

                    loss.backward()

                    # wandb logging is added inside this loop.
                    optimizer.step()

                    # It is also important to understand the role, purpose and behaviour
                    # of each tracked value: training loss, [validation loss], accuracy.

                    tepoch.set_postfix(loss=loss.item())
                    if iteration != 0 and iteration % eval_interval == 0:
                        # Periodic evaluation on the dev set.
                        model.eval()
                        valid_losses = []
                        predictions = []
                        target_labels = []
                        with torch.no_grad():
                            for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                        desc='Eval',
                                                                                                        position=1,
                                                                                                        leave=None):
                                input_ids = input_ids.to(device)
                                attention_mask = attention_mask.to(device)
                                token_type_ids = token_type_ids.to(device)
                                position_ids = position_ids.to(device)
                                labels = labels.to(device, dtype=torch.long)

                                output = model(input_ids=input_ids,
                                               attention_mask=attention_mask,
                                               token_type_ids=token_type_ids,
                                               position_ids=position_ids,
                                               labels=labels)

                                logits = output.logits
                                loss = output.loss
                                valid_losses.append(loss.item())

                                # Class 0 only when its logit is strictly larger; ties go to class 1.
                                batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                                batch_labels = [int(example) for example in labels]

                                predictions += batch_predictions
                                target_labels += batch_labels

                        # Switch back to training mode; otherwise every step after the
                        # first evaluation would run with dropout disabled.
                        model.train()

                        acc = compute_acc(predictions, target_labels)
                        valid_loss = sum(valid_losses) / len(valid_losses)
                        if lowest_valid_loss > valid_loss:
                            # Remember the new best so the checkpoint is only overwritten
                            # when the validation loss actually improves.
                            lowest_valid_loss = valid_loss
                            print('Acc for model which have lower valid loss: ', acc)
                            torch.save(model.state_dict(), "./pytorch_model.bin")
                        # Validation loss can be considered more valuable than training loss.

                        wandb.log({"accuracy":acc, "valid_loss":valid_loss, "learning_rate":learning_rate, "batch_size":batch_size})
        wandb.finish()

            

In [None]:
# Load the unlabeled test set into a DataFrame.
import pandas as pd
test_df = pd.read_csv('test_no_label.csv')

In [None]:
# The 'Id' column is what gets lower-cased and tokenized as the input sentences.
# NOTE(review): the column name suggests an identifier - confirm it holds the text.
test_dataset = test_df['Id']

In [None]:
def make_id_file_test(tokenizer, test_dataset):
    """Encode every test sentence into a space-joined string of token ids.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns an iterable of token ids.
        test_dataset: Iterable of sentence strings; each is lower-cased before encoding.

    Returns:
        List of strings, one per sentence, in the same order as ``test_dataset``.
    """
    encoded_sentences = [tokenizer.encode(sentence.lower()) for sentence in test_dataset]
    return [' '.join(map(str, ids)) for ids in encoded_sentences]

In [None]:
# Encode every test sentence into a space-joined token-id string.
test = make_id_file_test(tokenizer, test_dataset)

In [None]:
# Preview the first ten encoded test sentences.
test[:10]

['101 2009 1005 1055 1037 2878 2047 3325 1998 2047 26389 2169 2051 2017 2175 1012 102',
 '101 2061 15640 2013 2019 2214 5440 1012 102',
 '101 2009 2003 1996 2087 14469 7273 1999 1996 3028 1012 102',
 '101 2079 2025 3696 1037 10084 2007 2122 2111 1012 102',
 '101 1045 2001 6091 1998 2016 2081 2033 2514 2061 6625 1998 6160 1012 102',
 '101 1996 2069 2518 2057 2363 2008 2001 2980 2001 1996 4157 1012 102',
 '101 2053 1010 2025 1996 3924 2012 2004 2226 1010 1996 3924 1999 3502 2152 1012 102',
 '101 2027 3288 2009 2041 2392 2005 2017 1998 2024 2200 14044 1012 102',
 '101 4606 1996 12043 2106 1050 1005 1056 2130 2113 2129 2000 2147 1996 3274 1012 102',
 '101 2027 2031 2019 6581 4989 1997 25025 2015 2000 5454 2013 1012 102']

In [None]:
class SentimentTestDataset(object):
    """Unlabeled dataset built from space-separated token-id strings."""

    def __init__(self, tokenizer, test):
        """Parse the encoded sentences.

        Args:
            tokenizer: Stored on the instance; not otherwise used by this class.
            test: Iterable of examples, each a string of whitespace-separated ids.
        """
        self.tokenizer = tokenizer
        self.data = []

        for sentence in test:
            self.data.append(self._cast_to_int(sentence.strip().split()))

    def _cast_to_int(self, sample):
        """Convert a sequence of id strings into a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids at ``index`` as a NumPy array."""
        return np.array(self.data[index])

In [None]:
# Wrap the encoded sentences for the DataLoader. Note that this rebinds
# `test_dataset`, which previously held the raw 'Id' column.
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    """Collate unlabeled test samples into padded batch tensors, keeping their order.

    Args:
        samples: Sequence of 1-D arrays of token ids.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, position_ids)`` of
        ``(batch, max_len)`` integer tensors. ``input_ids`` is right-padded with 0,
        ``attention_mask`` is 1 on real tokens and 0 on padding, ``token_type_ids``
        is all zeros and ``position_ids`` counts ``0..max_len-1`` in every row.
    """
    sequences = samples
    # Take the lengths from the raw sequences. Once padded, every row has length
    # max_len, so measuring the padded tensor would yield an all-ones mask.
    lengths = [len(sequence) for sequence in sequences]
    max_len = max(lengths)
    # The test set has no labels; Kaggle matches predictions to its answer key by
    # row order, so the samples must not be reordered here (sorting by length, as
    # the training collate function does, caused misaligned predictions).
    # sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]
    sorted_indices = range(len(sequences))

    input_ids = pad_sequence([torch.tensor(sequences[index]) for index in sorted_indices], batch_first=True)

    attention_mask = torch.tensor(
        [[1] * lengths[index] + [0] * (max_len - lengths[index]) for index in sorted_indices])

    token_type_ids = torch.tensor([[0] * max_len for _ in sorted_indices])

    position_ids = torch.tensor([list(range(max_len)) for _ in sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
# Batch size for inference (32 was used earlier).
# test_batch_size = 32
test_batch_size = 64
# shuffle=False keeps the batches in dataset order; the predictions are later
# assigned back to test_df row by row.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, collate_fn=collate_fn_style_test, num_workers=2)

In [None]:
# Predict a 0/1 label for every test sentence, in the order produced by test_loader.
model.eval()
predictions = []
with torch.no_grad():
    test_progress = tqdm(test_loader, desc='Test', position=1, leave=None)
    for input_ids, attention_mask, token_type_ids, position_ids in test_progress:
        # Move the four batch tensors to `device` before the forward pass.
        input_ids, attention_mask, token_type_ids, position_ids = [
            tensor.to(device)
            for tensor in (input_ids, attention_mask, token_type_ids, position_ids)
        ]

        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )

        # Class 0 only when its logit is strictly larger; ties go to class 1.
        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions.extend(batch_predictions)


Test:   0%|          | 0/16 [00:00<?, ?it/s][A
Test:   6%|▋         | 1/16 [00:00<00:01,  8.38it/s][A
Test:  31%|███▏      | 5/16 [00:00<00:00, 22.22it/s][A
Test:  56%|█████▋    | 9/16 [00:00<00:00, 28.41it/s][A
Test:  81%|████████▏ | 13/16 [00:00<00:00, 31.59it/s][A
                                                     [A

In [None]:
# Attach the predictions as the 'Category' column; this relies on `predictions`
# being in the same order as the rows of test_df.
test_df['Category'] = predictions

In [None]:
# Write the submission file without the DataFrame index column.
test_df.to_csv('submission.csv', index=False)