In [1]:
# Detect whether the notebook runs on Google Colab by probing for its package.
try:
  import google.colab  # imported only to test availability
  IN_COLAB = True
except ImportError:
  # Only a missing package means "not Colab". A bare `except:` would also
  # swallow KeyboardInterrupt/SystemExit and hide unrelated failures.
  IN_COLAB = False

'Process in Colab' if IN_COLAB else 'Process in Local'

'Process in Local'

In [2]:
# On Colab, mount Google Drive at /content/drive/; skipped entirely for local runs.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

In [3]:
# 프로젝트 디렉토리로 이동: 경우에 맞게 설정
%cd drive/MyDrive/projects/ClauseSummary

[Errno 2] No such file or directory: 'drive/MyDrive/projects/ClauseSummary'
/home/thesol1/projects/ClauseSummary


In [4]:
import os
if IN_COLAB:
    !pip install git+https://github.com/CarperAI/trlx
    !pip install transformers
    !pip install datasets
    !pip install torchtyping
    !pip install wandb

In [5]:
import datetime
from typing import List

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as Fs

from tqdm.notebook import tqdm
from datasets import load_from_disk, load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModel, AutoModelForSeq2SeqLM

import trlx
from trlx.trlx import train
from trlx.data.default_configs import (
    ModelConfig,
    OptimizerConfig,
    PPOConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)

[2023-07-07 14:17:51,662] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-07-07 14:17:52.716585: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-07 14:17:53.752650: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/wsl/lib:
2023-07-07 14:17:53.752781: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/wsl/lib:


In [6]:
class TokenizeMapWrapper:
    """Callable that tokenizes one feature of a dataset row (for use with ``map``).

    Args:
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, **option)``.
        feature: Key of the row entry to tokenize.
        option: Keyword arguments forwarded to the tokenizer. Defaults to
            truncating and padding to 4096 tokens.
    """

    def __init__(self, tokenizer, feature, option=None):
        if option is None:
            option = {
                'max_length': 4096,
                'truncation': True,
                'padding': 'max_length',
            }

        # __call__ reads self.option, so it has to be stored here.
        self.option = option
        self.feature = feature
        self.tokenizer = tokenizer

    def __call__(self, row):
        """Return the tokenizer output for ``row[self.feature]``."""
        return self.tokenizer(row[self.feature], **self.option)

    def __repr__(self):
        return f'{self.__class__.__name__}(tokenizer={self.tokenizer})'


class RewardTokenizeMapWrapper(TokenizeMapWrapper):
    """Tokenizes a (text, summary) row into one fixed-length reward-model input.

    Text and summary are tokenized separately and concatenated. When the pair
    does not fit into ``max_token`` positions, the tail of the text is dropped
    so the summary is kept whole; the result is padded to ``max_token``.

    Args:
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, **option)``.
        text_feature: Row key holding the source text.
        summary_feature: Row key holding the summary.
        max_token: Target sequence length when ``option`` does not define
            ``'max_length'``.
        prompt: Task prefix; stored on the instance, not used by ``__call__``.
        option: Keyword arguments forwarded to the tokenizer. Defaults to
            truncation at ``max_token`` without padding.
    """

    def __init__(self, tokenizer, text_feature, summary_feature, max_token=4096, prompt='summarization-num_lines-1: ', option=None):
        if option is None:
            option = {
                'max_length': max_token,
                'truncation': True,
            }
        super().__init__(tokenizer, text_feature, option)

        self.prompt = prompt
        # A custom option without 'max_length' falls back to the max_token argument.
        self.max_token = option.get('max_length', max_token)
        self.text_feature = text_feature
        self.summary_feature = summary_feature

    @staticmethod
    def _merge_and_pad(tokenized_text, tokenized_summary, max_token, pad_token_id):
        """Concatenate both encodings key by key and pad each to ``max_token``."""
        summary_len = len(tokenized_summary['input_ids'])
        # Positions left for the text once the whole summary is accounted for.
        text_budget = max(max_token - summary_len, 0)

        merged = dict()
        for key in tokenized_text:
            joined = list(tokenized_text[key][:text_budget]) + list(tokenized_summary[key])
            joined = joined[:max_token]  # guard: the summary alone may exceed the limit
            # Only input_ids are padded with the pad token; masks are padded
            # with 0 so the model does not attend to the padding.
            pad_value = pad_token_id if key == 'input_ids' else 0
            merged[key] = joined + [pad_value] * (max_token - len(joined))
        return merged

    def __call__(self, row):
        """Return a dict of lists, each exactly ``self.max_token`` long."""
        tokenized_text = self.tokenizer(row[self.text_feature], **self.option)
        tokenized_summary = self.tokenizer(row[self.summary_feature], **self.option)

        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 1  # value the original code hard-coded

        return self._merge_and_pad(tokenized_text, tokenized_summary, self.max_token, pad_token_id)

In [7]:
def tokenize_text_summary(text: str, summary: str, tokenizer, option=None,
                          prompt: str = 'summarization-num_lines-1: '):
    """Tokenize a (text, summary) pair into one fixed-length reward-model input.

    The task prefix is stripped from ``text`` if present. Text and summary are
    tokenized separately and concatenated; when they do not fit into
    ``option['max_length']`` positions, the tail of the text is dropped so the
    summary is kept whole. Every returned list is padded to that length.

    Args:
        text: Source document, optionally starting with ``prompt``.
        summary: Candidate summary.
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, **option)``.
        option: Keyword arguments for the tokenizer; must contain
            ``'max_length'``. Defaults to truncation at 4096 tokens.
        prompt: Task prefix to remove from ``text``.

    Returns:
        dict mapping each tokenizer output key to a list of length
        ``option['max_length']``.
    """
    if option is None:
        option = {
            'max_length': 4096,
            'truncation': True,
        }
    max_token = option['max_length']

    if prompt and text.startswith(prompt):
        text = text[len(prompt):]

    tokenized_text = tokenizer(text, **option)
    tokenized_summary = tokenizer(summary, **option)

    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_token_id is None:
        pad_token_id = 1  # value the original code hard-coded

    # Positions left for the text once the whole summary is accounted for.
    text_budget = max(max_token - len(tokenized_summary['input_ids']), 0)

    tokenized_total_text = dict()
    for key in tokenized_text:
        joined = list(tokenized_text[key][:text_budget]) + list(tokenized_summary[key])
        joined = joined[:max_token]  # guard: the summary alone may exceed the limit
        # Only input_ids are padded with the pad token; masks are padded with 0
        # so the model does not attend to the padding.
        pad_value = pad_token_id if key == 'input_ids' else 0
        tokenized_total_text[key] = joined + [pad_value] * (max_token - len(joined))
    return tokenized_total_text

In [8]:
class ModelForRewardGeneration(nn.Module):
    """Encoder plus MLP head that maps a tokenized sequence to one scalar score.

    Args:
        encoder_path: Name or path passed to ``AutoModel.from_pretrained``.
        hidden_size: Width of the representation produced by ``head1``.
    """

    def __init__(self, encoder_path, hidden_size=256):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(encoder_path)
        self.hidden_size = hidden_size

        # Width of the pooled encoder output; falls back to the previously
        # hard-coded 768 when the encoder config does not expose it.
        encoder_dim = getattr(getattr(self.encoder, 'config', None), 'hidden_size', 768)

        # nn.Dropout instead of nn.Dropout1d: the head sees 2-D (batch, features)
        # input, which Dropout1d treats as unbatched (channels, length) and
        # therefore zeroes whole samples. Dropout has no parameters, so the layer
        # indices and state_dict keys of saved heads are unchanged.
        self.head1 = nn.Sequential(
            nn.Linear(encoder_dim, 1024, bias=False),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 1024, bias=False),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 512, bias=False),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(512, hidden_size, bias=False),
            nn.BatchNorm1d(hidden_size),
            nn.GELU(),
        )
        self.head2 = nn.Sequential(
            nn.Linear(hidden_size, 1),
        )

    def forward(self, input_ids=None, attention_mask=None):
        """Return the score produced by ``head2`` on top of the representation."""
        return self.head2(self.representation_forward(input_ids, attention_mask))

    def representation_forward(self, input_ids=None, attention_mask=None):
        """Return the ``head1`` representation of the encoder's pooled output."""
        pooled = self.encoder(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        return self.head1(pooled)

    def load(self, model_path):
        """Load weights saved as ``<model_path>-encoder``, ``-head1.pt`` and ``-head2.pt``.

        Args:
            model_path: Common path prefix of the three saved artifacts.
        """
        # Keep the freshly loaded encoder on the same device as the heads.
        target_device = next(self.head2.parameters()).device
        self.encoder = AutoModel.from_pretrained(model_path + '-encoder').to(target_device)
        # map_location='cpu' lets heads saved on a GPU load on CPU-only hosts;
        # load_state_dict then copies the values onto the heads' own device.
        # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
        self.head1.load_state_dict(torch.load(model_path + '-head1.pt', map_location='cpu'))
        self.head2.load_state_dict(torch.load(model_path + '-head2.pt', map_location='cpu'))

In [9]:
# Timestamp (yy-mm-dd-HH:MM) that tags the artifacts written by this run.
SAVE_STR = format(datetime.datetime.now(), '%y-%m-%d-%H:%M')

### Config

In [10]:
# --- Policy (summarization) model ---
checkpoint = 'KETI-AIR-Downstream/long-ke-t5-base-summarization'

# --- Reward model: base encoder checkpoint and path prefix of the trained weights ---
reward_model_checkpoint = 'psyche/kolongformer-4096'
reward_model_path = './model/230707-03 06'

# --- Data locations ---
original_dataset_path = './data/dataset-term.json'
dataset_path = './data/dataset-term'
tokenized_dataset_path = './data/' + checkpoint.replace("/", "-") + 'dataset-term-tokenized'

# --- Output location, tagged with the run timestamp ---
model_save_path = './model/' + SAVE_STR + '-summary-model'

### Loading Dataset, Tokenizers & Models

In [11]:
# Sanity check that the saved reward encoder can be loaded. The path is derived
# from reward_model_path (same '<prefix>-encoder' directory that
# ModelForRewardGeneration.load reads) so it cannot drift from the configuration.
AutoModel.from_pretrained(reward_model_path + '-encoder')

In [12]:
# Policy tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Reward tokenizer and model; trained weights are read from reward_model_path.
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_checkpoint)
reward_model = ModelForRewardGeneration(reward_model_checkpoint, 128)
reward_model.load(reward_model_path)

# Build the prompt frame: task prefix + document + separator, 'text' column only.
df = pd.read_json(original_dataset_path)
df = df.assign(text='summarization-num_lines-1: ' + df['text'] + ' </s> ')[['text']]

# Reuse the dataset cached on disk when present; otherwise create and cache it.
if os.path.exists(dataset_path):
    dataset = load_from_disk(dataset_path)
else:
    dataset = Dataset.from_pandas(df)
    dataset.save_to_disk(dataset_path)

dataset_dict = dataset.train_test_split(test_size=0.1, seed=42)

Some weights of the model checkpoint at psyche/kolongformer-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerModel were not initialized from the model checkpoint at psyche/kolongformer-4096 and are newly initialized: ['longformer.pooler.dense.weight', 'longformer.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [13]:
# Plain lists of prompt strings for the train split and the held-out split.
train_set, val_set = (
    [row["text"] for row in dataset_dict[split_name]]
    for split_name in ("train", "test")
)

### PPO

In [14]:
# trlx configuration for PPO fine-tuning of the seq2seq summarization checkpoint.
default_config = TRLConfig(
    # Training loop settings.
    train=TrainConfig(
        # NOTE(review): seq_length equals gen_kwargs["max_new_tokens"] below (both 400);
        # confirm how trlx derives the prompt length budget from these two values.
        seq_length=400,
        epochs=100,
        total_steps=100000,
        batch_size=3,
        checkpoint_interval=10000,
        eval_interval=100,
        pipeline="PromptPipeline",
        trainer="AcceleratePPOTrainer",
        save_best=False,
    ),
    # Policy model: same checkpoint as the tokenizer below.
    model=ModelConfig(
        model_path=checkpoint,
        num_layers_unfrozen=-1,  # presumably "all layers trainable" - TODO confirm against trlx docs
        model_arch_type="seq2seq",
    ),
    tokenizer=TokenizerConfig(
        tokenizer_path=checkpoint,
        padding_side="right",
        truncation_side="right",
    ),
    optimizer=OptimizerConfig(
        name="adamw",
        kwargs={
            "lr": 5.0e-5,
            "betas": [0.9, 0.999],
            "eps": 1.0e-8,
            "weight_decay": 1.0e-6,
        },
    ),
    scheduler=SchedulerConfig(
        name="cosine_annealing",
        kwargs={
            "T_max": 100000,  # same value as train.total_steps
            # NOTE(review): eta_min equals the optimizer lr (5.0e-5), so the schedule
            # looks flat - confirm this is intended.
            "eta_min": 5.0e-5,
        },
    ),
    # PPO hyperparameters.
    method=PPOConfig(
        name="PPOConfig",
        num_rollouts=128,
        chunk_size=12,
        ppo_epochs=4,
        init_kl_coef=0.05,
        target=6,
        horizon=10000,
        gamma=0.99,
        lam=0.95,
        cliprange=0.2,
        cliprange_value=0.2,
        vf_coef=1,
        scale_reward=None,
        ref_mean=None,
        ref_std=None,
        cliprange_reward=10,
        # Sampling settings used when generating rollouts.
        gen_kwargs={
            "max_new_tokens": 400,
            "do_sample": True,
            "top_k": 0,
            "top_p": 0.9,
            # NOTE(review): -1 is presumably not a real token id, which would keep
            # generation from stopping at EOS - confirm this is intended.
            "eos_token_id": -1,
        },
    ),
)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
reward_model = reward_model.to(device)
reward_model.eval()

def get_reward(samples: List[str], prompts: List[str] = None, outputs: List[str] = None, **kwargs):
    """Score generated summaries with the reward model.

    Args:
        samples: Full strings, expected as ``'<prompt> </s> <summary>'``.
        prompts: Optional prompt strings aligned with ``samples``. When given
            together with ``outputs`` they are used instead of splitting
            ``samples``.
        outputs: Optional generated summaries aligned with ``samples``.
        **kwargs: Extra keyword arguments supplied by the caller; ignored.

    Returns:
        1-D tensor with one reward per sample (empty when ``samples`` is empty).
    """
    separator = ' </s> '
    use_split_inputs = prompts is not None and outputs is not None

    reward_lt = []
    # Scoring only: no gradients are needed through the reward model.
    with torch.no_grad():
        for index, sample in enumerate(samples):
            if use_split_inputs:
                prompt = prompts[index].partition(separator)[0]
                label = outputs[index]
            else:
                # partition() leaves the whole sample as the prompt and an empty
                # summary when the separator is missing; slicing with find()
                # would use index -1 and produce garbage.
                prompt, _sep, label = sample.partition(separator)

            tokenized_total_text = tokenize_text_summary(prompt, label, reward_tokenizer)
            # The model expects batched (batch, seq_len) tensors, so add a batch axis.
            score = reward_model(
                input_ids=torch.tensor(tokenized_total_text['input_ids'], device=device).unsqueeze(0),
                attention_mask=torch.tensor(tokenized_total_text['attention_mask'], device=device).unsqueeze(0),
            )
            reward_lt.append(score.reshape(-1))

    if not reward_lt:
        return torch.empty(0, device=device)
    return torch.cat(reward_lt, dim=0)

In [None]:
# Launch PPO training: prompts come from the train split, evaluation prompts from
# the held-out split, and rewards from get_reward.
trainer = train(
    prompts=train_set,
    eval_prompts=val_set,
    reward_fn=get_reward,
    config=default_config,
)

In [None]:
# Write a model card and save the trained model under model_save_path.
# NOTE(review): create_model_card / save_model look like Hugging Face Trainer
# methods; confirm that the trainer object returned by trlx's train() provides
# them - TODO confirm.
trainer.create_model_card(
    model_name='tosan-base',
    finetuned_from=checkpoint
)

trainer.save_model(model_save_path)