<a href="https://colab.research.google.com/github/SOL1archive/KoGrammar/blob/main/distil_train_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Detect whether the notebook is running inside Google Colab by probing for
# the `google.colab` package.
try:
  import google.colab  # imported only to test for its presence
  IN_COLAB = True
except ImportError:
  # Only a missing package means "not Colab"; any other failure should surface
  # instead of being silently swallowed.
  IN_COLAB = False

IN_COLAB

In [None]:
# Install the third-party packages this notebook uses. No versions are pinned,
# so each run resolves whatever release is current at that time.
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install torchmetrics
# accelerate is upgraded rather than just installed.
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1
Looking in i

In [None]:
# Mount Google Drive at /content/drive/. Later cells read the config,
# tokenizer, checkpoint and dataset through relative ./drive/MyDrive/... paths.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import warnings
warnings.filterwarnings('ignore')
import datetime
import os
import gc
from collections import namedtuple
from pprint import pprint
from tqdm import tqdm

import numpy as np
import pandas as pd

import tensorboard
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import torchmetrics

from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq
from transformers import BartConfig, T5Config
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

## Settings

In [None]:
# Run-control flags and the epoch count.
MANUAL_TRAINING, MANUAL_VALIDATION = True, True
NUM_EPOCHS = 1

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Loading Tokenizer & Model Checkpoint

In [None]:
# Read the model config, tokenizer and raw weights from the Drive folder, then
# build two identically configured seq2seq models (weights are loaded in the
# next cell).
config = BartConfig.from_json_file('./drive/MyDrive/text_processing/config.json')
tokenizer = AutoTokenizer.from_pretrained("./drive/MyDrive/text_processing/")
# map_location='cpu' lets a checkpoint that was saved from GPU tensors load on a
# CPU-only runtime; load_state_dict copies the values into the models, which are
# moved to `device` later.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load('./drive/MyDrive/text_processing/pytorch_model.bin',
                        map_location='cpu')
model = AutoModelForSeq2SeqLM.from_config(config)
model_distil = AutoModelForSeq2SeqLM.from_config(config)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
# Load the same checkpoint weights into both models so they start identical.
model.load_state_dict(checkpoint)
# Last expression of the cell: its return value is what the notebook displays.
model_distil.load_state_dict(checkpoint)

<All keys matched successfully>

In [None]:
# Shrink the decoder of `model_distil` to the layers at these original indices.
KEPT_DECODER_LAYERS = (0, 2, 5)

distil_decoder = model_distil.model.decoder
# Guard so that re-running the cell does not index into an already pruned list.
if len(distil_decoder.layers) != len(KEPT_DECODER_LAYERS):
    # Take the layers from `model_distil` itself (it holds the same checkpoint
    # weights as `model`). Reusing `model`'s layer objects would share their
    # parameters, so training `model_distil` would also change `model`.
    # A ModuleList keeps the container type the decoder was built with.
    distil_decoder.layers = nn.ModuleList(
        [distil_decoder.layers[idx] for idx in KEPT_DECODER_LAYERS]
    )

# Give `model_distil` its own config copy that records the new layer count, so a
# saved copy is rebuilt with a matching decoder; `model`'s config is untouched.
distil_config = BartConfig.from_dict(model_distil.config.to_dict())
distil_config.decoder_layers = len(KEPT_DECODER_LAYERS)
model_distil.config = distil_config

## Loading Datasets

In [None]:
# Location of the dataset on the mounted Drive; read by load_from_disk below.
tokenized_dataset_path = './drive/MyDrive/text_processing/dataset'

In [None]:
# Load the saved dataset from disk (the recorded output shows a DatasetDict with
# train / train_baseline / train_distil / valid / test splits).
tokenized_dataset = load_from_disk(tokenized_dataset_path)

In [None]:
# Display the splits, their columns and row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1016426
    })
    train_baseline: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 508213
    })
    train_distil: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 508212
    })
    valid: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 56468
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 56469
    })
})

## Training

In [None]:
# Seq2seq collator returning PyTorch tensors. It is passed to the Seq2SeqTrainer
# further down; the manual DataLoader loops in this notebook do not use it.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='pt')

In [None]:
# Run the Python garbage collector and release PyTorch's cached CUDA memory
# before training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:

# Manual fine-tuning loop for `model_distil` on the `train_distil` split.
total_loss_lt = []   # per-batch losses over all epochs, in training order
batch_loss_lt = []   # per-batch losses of the epoch currently running

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
optimizer = AdamW(model_distil.parameters(), lr=2e-5)
trainset = tokenized_dataset['train_distil'].with_format("torch", device=device)
# No collate_fn is given, so the DataLoader uses its default collation.
# NOTE(review): this presumably relies on every example already being padded to
# a common length -- confirm against the dataset preparation.
dataloader = DataLoader(trainset, batch_size=10, shuffle=True)
# `.to` does nothing when the model is already on `device`, so no guard is needed.
model_distil.to(device)

model_distil.train()
for epoch in range(NUM_EPOCHS):
    # Start a fresh list each epoch. Otherwise earlier epochs' losses would be
    # appended to `total_loss_lt` again and would skew this epoch's mean.
    batch_loss_lt = []
    for batch in tqdm(dataloader):
        X = {
                'input_ids': batch['input_ids'],
                'attention_mask': batch['attention_mask'],
            }
        y = batch['labels']
        outputs = model_distil(**X, labels=y)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        batch_loss_lt.append(loss.item())

    # Collect garbage and release cached CUDA memory once per epoch; doing this
    # on every step stalls the loop without changing the results.
    gc.collect()
    torch.cuda.empty_cache()

    total_loss_lt += batch_loss_lt
    batch_loss_series = pd.Series(batch_loss_lt)
    # `loss` is the last batch of the epoch; the mean covers this epoch only.
    print(f'epoch {epoch + 1} loss: {loss.item()} mean: {batch_loss_series.mean()}')


100%|██████████| 50822/50822 [7:22:42<00:00,  1.91it/s]

epoch 1 loss: 0.0020301390904933214 mean: 0.00570277881778284





In [None]:
# Line plot of every recorded per-batch training loss, in recording order.
total_loss_series = pd.Series(data=total_loss_lt)
total_loss_series.plot(kind='line')

## Validation

In [None]:

# Mean loss of `model` on the `valid` split, evaluated one example per batch.
loss_lt = []

model.eval()
validset = tokenized_dataset['valid'].with_format("torch", device=device)
# The order is shuffled, so a run that stops early has covered a random subset.
dataloader = DataLoader(validset, batch_size=1, shuffle=True)
# `.to` does nothing when the model is already on `device`.
model.to(device)

try:
    with torch.no_grad():
        for batch in dataloader:
            X = {
                    'input_ids': batch['input_ids'],
                    'attention_mask': batch['attention_mask'],
                }
            y = batch['labels']
            outputs = model(**X, labels=y)
            loss = outputs.loss
            loss_lt.append(loss.item())
except KeyboardInterrupt:
    # Stopping the cell by hand still reports the mean of what was evaluated.
    print(f'validation interrupted after {len(loss_lt)} batches; reporting the partial mean')
except Exception as err:
    # Keep the best-effort behaviour (a partial mean is still printed), but say
    # what went wrong instead of hiding it.
    print(f'validation stopped after {len(loss_lt)} batches: {err!r}')

# Release memory once, after the loop, rather than on every batch.
gc.collect()
torch.cuda.empty_cache()

loss_series = pd.Series(loss_lt)
print(f'loss: {loss_series.mean()}')

In [None]:
# Qualitative check: generate from both models for one validation example and
# print the results next to the input and the ground-truth label.
model.eval()
model_distil.eval()
model.to(device)
model_distil.to(device)
validset = tokenized_dataset['valid'].with_format("torch", device=device)
# Position of the validation example to inspect.
index_select = 100
# select() with a one-element range gives a dataset holding just that example.
test_sample = validset.select(range(index_select,index_select+1))
# Keep the labels before the column is removed from the sample.
test_sample_gt = test_sample['labels']
test_sample = test_sample.remove_columns('labels')[0]
# unsqueeze(0) adds a leading dimension to the single example before generate().
test_sample_input = dict()
test_sample_input['input_ids'] = test_sample['input_ids'].unsqueeze(0)
test_sample_input['attention_mask'] = test_sample['attention_mask'].unsqueeze(0)
output = model.generate(**test_sample_input)
output_distil = model_distil.generate(**test_sample_input)
# squeeze(0) drops that leading dimension again so a single sequence is decoded.
input_text = tokenizer.decode(test_sample_input['input_ids'].squeeze(0), skip_special_tokens=True)
output_text = tokenizer.decode(output.squeeze(0), skip_special_tokens=True)
output_distil_text = tokenizer.decode(output_distil.squeeze(0), skip_special_tokens=True)
gt_text = tokenizer.decode(test_sample_gt.squeeze(0), skip_special_tokens=True)

# Printed in order: input, `model` output, `model_distil` output, ground truth.
print(input_text)
print(output_text)
print(output_distil_text)
print(gt_text)

나두안봤어
"나도 안 봤어.". 
나도 안 봤어.
나도 안 봤어.


In [None]:
def eval(model, tokenizer, input_seq, target_seq, metric):
    """Score one generated sentence against its reference.

    Note: this function shadows Python's built-in ``eval`` within the notebook.

    Args:
        model: object whose ``generate(**input_seq)`` returns a batch of token
            id sequences; only the first sequence is used.
        tokenizer: object whose ``decode(ids, skip_special_tokens=True)`` turns
            token ids into text.
        input_seq: mapping of keyword arguments forwarded to ``model.generate``.
        target_seq: token ids of the reference sentence.
        metric: object whose ``compute(predictions=..., references=...)`` takes
            lists of strings.

    Returns:
        Whatever ``metric.compute`` returns for the single prediction/reference
        pair.
    """
    generated_ids = model.generate(**input_seq)
    generated_sentence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    # `decode`, not `decoder`: the latter is not the text-decoding method.
    target_sentence = tokenizer.decode(target_seq, skip_special_tokens=True)
    # Metrics take keyword arguments holding lists, one entry per example.
    score = metric.compute(predictions=[generated_sentence],
                           references=[target_sentence])

    return score

In [None]:
# Per-example BLEU and ROUGE of `model`'s generations against the labels of
# `validset` (defined in an earlier cell), followed by their averages.
bleu_scores = []
rouge_scores = []
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')

for example in tqdm(validset.shuffle()):
    input_sentence = {
                        'input_ids': example['input_ids'].unsqueeze(0),
                        'attention_mask': example['attention_mask'].unsqueeze(0),
                     }

    target_seq = example['labels']
    generated_ids = model.generate(**input_sentence)
    generated_sentence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    target_sentence = tokenizer.decode(target_seq, skip_special_tokens=True)

    # The scores are undefined without any reference tokens, so skip such rows.
    if not target_sentence.split():
        continue

    # The metrics expect lists with one entry per example, not bare strings
    # (bare strings are what raised the recorded ValueError).
    if generated_sentence.split():
        bleu_score = bleu.compute(predictions=[generated_sentence],
                                  references=[[target_sentence]])
    else:
        # An empty generation shares no n-grams with the reference: score it 0
        # without calling the metric.
        bleu_score = {'bleu': 0.0}
    # Whitespace tokenisation: the default ROUGE tokenizer keeps only ASCII
    # letters and digits, which would discard Korean text entirely.
    rouge_score = rouge.compute(predictions=[generated_sentence],
                                references=[target_sentence],
                                tokenizer=str.split)

    bleu_scores.append(bleu_score)
    rouge_scores.append(rouge_score)

# Each score is a dict, so average the numeric fields rather than the dicts.
average_bleu_score = pd.Series([score['bleu'] for score in bleu_scores], dtype=float).mean()
average_rouge_score = pd.DataFrame(rouge_scores).mean()
pd.concat([pd.Series({'bleu': average_bleu_score}), average_rouge_score])

  0%|          | 1/56468 [00:00<8:18:17,  1.89it/s]


ValueError: ignored

## Saving

In [None]:
# To prevent unwanted saves: raising here stops a "Run All" before the cells
# below build the trainer and write the model to Drive. Run those cells by hand
# when a save is actually wanted.
raise RuntimeError

In [None]:
# Arguments for the Seq2SeqTrainer built in the next cell.
training_args = Seq2SeqTrainingArguments(
    # where results go and how they are reported
    output_dir="./results",
    report_to="tensorboard",
    push_to_hub=False,
    # schedule
    num_train_epochs=NUM_EPOCHS,
    evaluation_strategy="epoch",
    # batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
)

ImportError: ignored

In [None]:
# Wrap `model_distil` in a Seq2SeqTrainer; the following cell uses it to write
# the model card and save the model.
# The datasets come from `tokenized_dataset`, the DatasetDict loaded earlier in
# this notebook (no `dataset_dict` is defined anywhere in it).
trainer = Seq2SeqTrainer(
    model=model_distil,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    data_collator=data_collator,
)

In [None]:
# Write a model card and save the model into a time-stamped folder on Drive.
NOW_STR = datetime.datetime.now().strftime('%y%m%d-%H:%M')
# `finetuned_from` expects the name of the base model. `checkpoint` is the
# state dict loaded with torch.load, not a name, so it is not passed here.
# TODO: pass finetuned_from='<hub id of the base model>' once it is known.
trainer.create_model_card(
    language='Korean',
    tags='Grammar',
    #model='KoGrammar',
)
trainer.save_model(f"drive/MyDrive/projects/KoGrammar/models/{NOW_STR}")