<a href="https://colab.research.google.com/github/Oh-Seokjin/Goorm/blob/main/pj01_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import requirements

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 15.2 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 100.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [None]:
pip install sentencepiece==0.1.91

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece==0.1.91
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 15.4 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91


In [None]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    T5Model,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoConfig,
    AdamW
)

from pprint import pprint

# 1. Preprocess

In [None]:
# Colab only: mount Google Drive at /content/drive. The dataset paths used later in
# this notebook point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def make_id_file(task, tokenizer, data_dir='../content/drive/MyDrive/Goorm/project01/data'):
    """Tokenize the four sentiment split files into space-separated id strings.

    Args:
        task: Name of the task. Currently unused; kept so existing callers keep working.
        tokenizer: Object exposing ``encode(text)`` that returns an iterable of token ids.
        data_dir: Directory containing ``sentiment.train.1``, ``sentiment.train.0``,
            ``sentiment.dev.1`` and ``sentiment.dev.0``. Defaults to the Drive location
            this notebook originally hard-coded.

    Returns:
        Tuple ``(train_pos, train_neg, dev_pos, dev_neg)``. Each element is a list with
        one string per input line: that line's token ids joined by single spaces.
    """
    def make_data_strings(file_name):
        # Iterate the file lazily instead of materialising every line with readlines().
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
            return [' '.join(str(token_id) for token_id in tokenizer.encode(line.lower()))
                    for line in f]

    print('it will take some times...')
    # Files ending in ".1" are loaded as the positive split, ".0" as the negative split.
    train_pos = make_data_strings('sentiment.train.1')
    train_neg = make_data_strings('sentiment.train.0')
    dev_pos = make_data_strings('sentiment.dev.1')
    dev_neg = make_data_strings('sentiment.dev.0')

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [None]:
# Load the pretrained tokenizer published with the "t5-small" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Sanity check: list the working directory to confirm the Drive mount is visible.
!ls

drive  sample_data


In [None]:
# Tokenize the train/dev positive and negative files into lists of id strings.
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

it will take some times...
make id file finished!


In [None]:
class SentimentDataset(object):
    """Map-style dataset of tokenized sentences paired with a binary sentiment label.

    Every input sentence is a string of whitespace-separated token ids. Positive
    sentences are stored first with label ``[1]``, followed by the negative
    sentences with label ``[0]``.
    """

    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        # Parse each id string into a list of ints, keeping the input order.
        pos_ids = [self._cast_to_int(line.strip().split()) for line in pos]
        neg_ids = [self._cast_to_int(line.strip().split()) for line in neg]
        self.data = pos_ids + neg_ids
        self.label = [[1] for _ in pos_ids] + [[0] for _ in neg_ids]

    def _cast_to_int(self, sample):
        """Convert a sequence of id strings into a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        """Number of stored sentences (positive plus negative)."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for ``index``, each wrapped in ``np.array``."""
        token_ids = np.array(self.data[index])
        label = np.array(self.label[index])
        return token_ids, label

In [None]:
# Wrap the tokenized id strings in dataset objects for the DataLoaders.
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [None]:
# Print the first 11 (token_ids, label) pairs, or fewer if the dataset is shorter.
for i in range(min(11, len(train_dataset))):
    item = train_dataset[i]
    print(item)

(array([1287,  542,    3,    5,    1]), array([1]))
(array([7857,  884,  313,    3,    5,    1]), array([1]))
(array([  79,   92,   43, 1444,  534,    7,   11,    3,  867, 3022,   84,
         19,  310,  207,    3,    5,    1]), array([1]))
(array([   34,     3,    31,     7,     3,     9,   207,     3, 28458,
        3534,     9,  4044,     3,     5,     1]), array([1]))
(array([   8,  871,   19, 2609,    3,    5,    1]), array([1]))
(array([ 207, 1207,  542,    3,    5,    1]), array([1]))
(array([207, 313,   3,   5,   1]), array([1]))
(array([ 5759,    13,   239,    19, 13178,    11,  1995,    13,   534,
           7,     3,     5,     1]), array([1]))
(array([  248,   286,    21,  3074,    42,  1207, 12751,    11,  6061,
           3,     5,     1]), array([1]))
(array([   8,  126,  620, 1416, 1237,    3,    5,    1]), array([1]))
(array([ 48, 286,  47, 182, 207,   3,   5,   1]), array([1]))


In [None]:
def collate_fn_style(samples):
    """Collate ``(token_ids, label)`` pairs into padded batch tensors.

    Args:
        samples: Sequence of ``(token_ids, label)`` pairs, where ``token_ids`` is a
            1-D integer sequence and every ``label`` array has the same shape.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)``:
        ``input_ids`` has shape ``(batch, max_len)`` and is right-padded with 0;
        ``attention_mask`` has the same shape, 1 on real tokens and 0 on padding;
        ``labels`` stacks the per-sample labels along a new first axis.
    """
    # Split the batch into parallel tuples of id sequences and labels.
    input_ids, labels = zip(*samples)

    # Record the true lengths BEFORE padding. Measuring after padding makes every row
    # look max_len long and the mask all ones.
    lengths = torch.tensor([len(input_id) for input_id in input_ids])
    max_len = int(lengths.max())

    # Pad every sample in the batch; batch_first=True gives shape (batch, max_len).
    input_ids = pad_sequence([torch.tensor(input_id) for input_id in input_ids],
                             batch_first=True)

    # Position j of row i is a real token exactly when j < lengths[i].
    attention_mask = (torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)).long()
    labels = torch.tensor(np.stack(labels, axis=0))

    return input_ids, attention_mask, labels

In [None]:
# Scratch: slice the first 10 samples for manual inspection. With a slice,
# __getitem__ wraps the list of variable-length id lists in np.array.
elems = train_dataset[:10]



In [None]:
# Scratch: show the sliced (token_ids, labels) pair.
print(elems)

(array([list([1287, 542, 3, 5, 1]), list([7857, 884, 313, 3, 5, 1]),
       list([79, 92, 43, 1444, 534, 7, 11, 3, 867, 3022, 84, 19, 310, 207, 3, 5, 1]),
       list([34, 3, 31, 7, 3, 9, 207, 3, 28458, 3534, 9, 4044, 3, 5, 1]),
       list([8, 871, 19, 2609, 3, 5, 1]), list([207, 1207, 542, 3, 5, 1]),
       list([207, 313, 3, 5, 1]),
       list([5759, 13, 239, 19, 13178, 11, 1995, 13, 534, 7, 3, 5, 1]),
       list([248, 286, 21, 3074, 42, 1207, 12751, 11, 6061, 3, 5, 1]),
       list([8, 126, 620, 1416, 1237, 3, 5, 1])], dtype=object), array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]]))


In [None]:
# Scratch: elems[0] holds the id lists of the slice, so this indexes its samples in order.
sorted_indices2 = range(len(elems[0]))

In [None]:
# Scratch: expand the range to confirm which indices it covers.
print([i for i in sorted_indices2])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [None]:
# Scratch: pad the sliced id lists to a common length; batch_first=True puts the
# batch dimension first.
input_ids2 = pad_sequence([torch.tensor(elems[0][index]) for index in sorted_indices2],
                             batch_first=True)

In [None]:
# Scratch: inspect the padded tensor.
print(input_ids2)

tensor([[ 1287,   542,     3,     5,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [ 7857,   884,   313,     3,     5,     1,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [   79,    92,    43,  1444,   534,     7,    11,     3,   867,  3022,
            84,    19,   310,   207,     3,     5,     1],
        [   34,     3,    31,     7,     3,     9,   207,     3, 28458,  3534,
             9,  4044,     3,     5,     1,     0,     0],
        [    8,   871,    19,  2609,     3,     5,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  207,  1207,   542,     3,     5,     1,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  207,   313,     3,     5,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [ 5759,    13,   239,    1

In [None]:
# Batch sizes for training and for evaluation on the dev split.
train_batch_size = 32
eval_batch_size = 64

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collate_fn_style,
    pin_memory=True,
    num_workers=2,
)

# The dev split is iterated in its stored order.
dev_loader = torch.utils.data.DataLoader(
    dev_dataset,
    batch_size=eval_batch_size,
    shuffle=False,
    collate_fn=collate_fn_style,
    num_workers=2,
)

In [None]:
# random seed
# Seed NumPy and PyTorch so runs are repeatable.
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained t5-small weights.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Move the model to the selected device.
model.to(device)

In [None]:
# Put the model in training mode (enables dropout).
model.train()
learning_rate = 5e-5
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values transformers.AdamW used by
# default, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate,
                              eps=1e-6, weight_decay=0.0)



In [None]:
def compute_acc(predictions, target_labels):
    """Return the fraction of positions where the prediction equals the target."""
    predicted = np.array(predictions)
    expected = np.array(target_labels)
    matches = predicted == expected
    return matches.mean()

In [None]:
train_epoch = 3
lowest_valid_loss = 9999.
# Evaluate roughly five times per epoch. max(1, ...) avoids a modulo-by-zero when the
# loader has fewer than five batches.
eval_interval = max(1, len(train_loader) // 5)
for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, lm_labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            lm_labels = lm_labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            # transformers 4.x takes the targets as `labels`; the old `lm_labels`
            # keyword is no longer accepted.
            # NOTE(review): the class labels 0/1 are used directly as target token ids.
            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           labels=lm_labels)

            loss = output.loss
            loss.backward()

            optimizer.step()

            tepoch.set_postfix(loss=loss.item())
            if iteration != 0 and iteration % eval_interval == 0:
                # Evaluate the model five times per epoch
                model.eval()
                valid_losses = []
                predictions = []
                target_labels = []
                with torch.no_grad():
                    # Separate names keep the dev batches from shadowing the training batch.
                    for dev_input_ids, dev_attention_mask, dev_labels in tqdm(dev_loader,
                                                                              desc='Eval',
                                                                              position=1,
                                                                              leave=None):
                        dev_input_ids = dev_input_ids.to(device)
                        dev_attention_mask = dev_attention_mask.to(device)
                        dev_labels = dev_labels.to(device, dtype=torch.long)

                        output = model(input_ids=dev_input_ids,
                                       attention_mask=dev_attention_mask,
                                       labels=dev_labels)

                        valid_losses.append(output.loss.item())

                        # Logits are (batch, target_len, vocab). Compare the scores of
                        # ids 0 and 1 at the first target position; predict 0 only when
                        # its score is strictly higher.
                        first_step = output.logits[:, 0, :]
                        predictions += (first_step[:, 1] >= first_step[:, 0]).long().tolist()
                        target_labels += dev_labels.reshape(-1).tolist()

                # Restore training mode; otherwise dropout stays disabled for the
                # remaining training steps.
                model.train()

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)
                if lowest_valid_loss > valid_loss:
                    # Remember the best loss so only genuine improvements overwrite
                    # the saved checkpoint.
                    lowest_valid_loss = valid_loss
                    print('Acc for model which have lower valid loss: ', acc)
                    torch.save(model.state_dict(), "./pytorch_model.bin")

In [None]:
# Load the unlabeled test split.
# NOTE(review): this path uses 'pj01' while the training files are read from
# 'project01' — confirm which directory actually holds the data.
import pandas as pd
test_df = pd.read_csv('../content/drive/MyDrive/Goorm/pj01/data/test_no_label.csv')

In [None]:
# NOTE(review): this selects the 'Id' column, but its values are later lower-cased and
# tokenized as text — confirm this is the sentence column and not a row identifier.
test_dataset = test_df['Id']

In [None]:
def make_id_file_test(tokenizer, test_dataset):
    """Encode each lower-cased sentence and return its ids as a space-separated string.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns an iterable of token ids.
        test_dataset: Iterable of sentence strings.

    Returns:
        List with one string per sentence, in input order.
    """
    return [
        ' '.join(map(str, tokenizer.encode(sentence.lower())))
        for sentence in test_dataset
    ]

In [None]:
# Tokenize every test sentence into a space-separated id string.
test = make_id_file_test(tokenizer, test_dataset)

In [None]:
# Preview the first ten id strings.
test[:10]

In [None]:
class SentimentTestDataset(object):
    """Map-style dataset of tokenized test sentences (no labels).

    Every input sentence is a string of whitespace-separated token ids.
    """

    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        # Parse each id string into a list of ints, keeping the input order.
        self.data = [self._cast_to_int(line.strip().split()) for line in test]

    def _cast_to_int(self, sample):
        """Convert a sequence of id strings into a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        """Number of stored sentences."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids at ``index`` wrapped in ``np.array``."""
        token_ids = self.data[index]
        return np.array(token_ids)

In [None]:
# Wrap the tokenized test sentences in a dataset object. This rebinds `test_dataset`,
# which previously held the raw DataFrame column.
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    """Collate unlabeled token-id sequences into padded batch tensors.

    Args:
        samples: Sequence of 1-D integer token-id sequences.

    Returns:
        Tuple ``(input_ids, attention_mask)``, both of shape ``(batch, max_len)``.
        ``input_ids`` is right-padded with 0; ``attention_mask`` is 1 on real tokens
        and 0 on padding.
    """
    # Record the true lengths BEFORE padding. Measuring after padding makes every row
    # look max_len long and the mask all ones.
    lengths = torch.tensor([len(sample) for sample in samples])
    max_len = int(lengths.max())

    input_ids = pad_sequence([torch.tensor(sample) for sample in samples],
                             batch_first=True)

    # Position j of row i is a real token exactly when j < lengths[i].
    attention_mask = (torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)).long()

    return input_ids, attention_mask

In [None]:
# Iterate the test set in its stored order (shuffle=False) so predictions line up
# with the rows of the test DataFrame.
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [None]:
with torch.no_grad():
    model.eval()
    predictions = []
    for input_ids, attention_mask in tqdm(test_loader,
                                          desc='Test',
                                          position=1,
                                          leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # T5 is an encoder-decoder model: with no labels to derive decoder inputs
        # from, it must be given them explicitly. A single start token reproduces the
        # first decoding step the model was trained on.
        decoder_input_ids = torch.full((input_ids.size(0), 1),
                                       model.config.decoder_start_token_id,
                                       dtype=torch.long,
                                       device=device)

        output = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       decoder_input_ids=decoder_input_ids,
                       )

        # Logits are (batch, target_len, vocab). Compare the scores of ids 0 and 1 at
        # the first target position; predict 0 only when its score is strictly higher.
        first_step = output.logits[:, 0, :]
        batch_predictions = (first_step[:, 1] >= first_step[:, 0]).long().tolist()
        predictions += batch_predictions

In [None]:
# Attach the predicted class to each test row.
test_df['Category'] = predictions

In [None]:
# Write the submission file without the DataFrame index column.
test_df.to_csv('submission.csv', index=False)