In [None]:
import torch

# Choose the device every tensor and the model will be moved to:
# the first CUDA GPU when one is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

## Load Encoded Tokens

In [None]:
import pickle as pkl

In [None]:
# The pickle file holds three objects written back-to-back; they are read in
# the same order: token ids, attention masks, labels.
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# produced by a trusted preprocessing run.
with open('./Data/encoded_token/en_tokens_50k.pkl', 'rb') as token_file:
    input_ids = pkl.load(token_file)
    attention_masks = pkl.load(token_file)
    labels = pkl.load(token_file)

## Training & Validation Split

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle the three aligned tensors so each sample is
# (input_ids, attention_mask, labels).
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the samples go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# random_split draws from torch's global RNG. The notebook only seeds that RNG
# further down (just before the training loop), so without seeding here the
# train/validation membership would change on every fresh run. 42 is the same
# value used for the later seeding.
torch.manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of samples per mini-batch, shared by both loaders.
batch_size = 20

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in dataset order.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

# 4. Train our Model

In [None]:
from transformers import BertForMaskedLM, AdamW, BertConfig


# Restore the tokenizer object that was pickled earlier.
# NOTE(review): pickle.load can execute arbitrary code -- only load a
# tokenizer file you produced yourself.
with open('./Data/tokenizer.pkl','rb') as f:
    tokenizer = pkl.load(f)

# Start from the pretrained masked-LM checkpoint. output_attentions=True is
# needed because the training loop unpacks the attention weights from the
# forward pass and saves them to disk.
model = BertForMaskedLM.from_pretrained(
    "bert-base-uncased",
    output_attentions = True,
    output_hidden_states = False,
)

# Match the embedding table to the size of the loaded tokenizer.
model.resize_token_embeddings(len(tokenizer))

# Move the model to the device selected at the top of the notebook.
# (model.cuda() would raise on the CPU fallback that the device cell supports.)
model.to(device)

In [None]:
# Materialise (name, tensor) pairs so they can be sliced by position.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs the heading to print with the slice of parameters shown
# under it: the first five, the next sixteen, and the last four.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, entries in sections:
    print(heading)
    for name, tensor in entries:
        # Left-aligned parameter name, right-aligned shape tuple.
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

## Optimizer & Learning Rate Scheduler

In [None]:
# AdamW over every model parameter, with learning rate 2e-5 and epsilon 1e-8.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)


In [None]:
from transformers import get_linear_schedule_with_warmup


# Number of full passes over the training set.
epochs = 5

# The training loop steps the scheduler once per batch, so the schedule
# spans (batches per epoch) * (epochs) steps.
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## Training Loop

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second and formatted through
    datetime.timedelta, so durations of a day or more come out as
    e.g. "1 day, 0:00:00".
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import random
import numpy as np
import os
import pickle

In [None]:
# Seed every random number generator touched by the run (Python, NumPy,
# torch CPU and all CUDA devices) so training is repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# The training loop appends one summary dict per epoch to this list.
training_stats = []

# Wall-clock start for the "Total training took" message. It is captured in
# this cell, so any pause before running the training loop is counted too.
total_t0 = time.time()

# Run directory and the sub-directory for the attention/input snapshots.
# The trailing slash matters: file names are appended by string concatenation.
out_dir = './model_save/0725_pilot9_v2/'
output_dir = out_dir + 'weight/'

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists first.
os.makedirs(output_dir, exist_ok=True)

In [None]:
def _to_cpu(obj):
    '''
    Return a copy of `obj` that is safe to pickle and reload anywhere.

    Tensors are detached from the autograd graph and moved to the CPU;
    tuples and lists are converted element by element; anything else is
    returned unchanged.
    '''
    if torch.is_tensor(obj):
        return obj.detach().cpu()
    if isinstance(obj, tuple):
        return tuple(_to_cpu(item) for item in obj)
    if isinstance(obj, list):
        return [_to_cpu(item) for item in obj]
    return obj


def _save_snapshot(split, step, attentions, input_ids):
    '''
    Pickle the attention weights and the input ids of one batch.

    Files are written to `output_dir` as attention_<split>_<step>.pickle and
    input_<split>_<step>.pickle. Everything is moved to the CPU first so the
    files can be unpickled on a machine without a GPU.
    '''
    with open(output_dir + 'attention_' + split + '_' + str(step) + '.pickle', 'wb') as out_att:
        pickle.dump(_to_cpu(attentions), out_att)

    with open(output_dir + 'input_' + split + '_' + str(step) + '.pickle', 'wb') as out_ids:
        pickle.dump(_to_cpu(input_ids), out_ids)


for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Per-epoch timer and running sum of the batch losses.
    t0 = time.time()
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels), in the order the
        # TensorDataset was built.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # NOTE(review): the `masked_lm_labels` keyword and the 3-tuple return
        # value match older transformers releases -- confirm the installed
        # version before upgrading.
        loss, logits, attention_t = model(b_input_ids,
                                         token_type_ids=None,
                                         attention_mask=b_input_mask,
                                         masked_lm_labels=b_labels)

        # During the final epoch only, keep a snapshot every 100 batches.
        if epoch_i == epochs-1 and step % 100 == 0:
            _save_snapshot('train', step, attention_t, b_input_ids)

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    # Never incremented or read in this cell; kept in case other cells use it.
    nb_eval_steps = 0

    for step, batch in enumerate(validation_dataloader):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():

            loss, logits, attention_v = model(b_input_ids,
                                                token_type_ids=None,
                                                attention_mask=b_input_mask,
                                                masked_lm_labels=b_labels)

        # During the final epoch only, keep a snapshot every 50 batches.
        if epoch_i == epochs-1 and step % 50 == 0:
            _save_snapshot('val', step, attention_v, b_input_ids)

        total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record this epoch's summary for the statistics table and the loss plot.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
import pandas as pd

# Show floats with two decimals. The fully-qualified key is used because the
# short 'precision' pattern no longer resolves to a single option in newer
# pandas releases and raises an OptionError there.
pd.set_option('display.precision', 2)

# One row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

df_stats

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Plot styling: seaborn dark grid, larger fonts, wide figure.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Learning curves; the x values come from the frame's epoch index.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per recorded epoch, taken from the data rather than a hard-coded
# list, so the axis stays correct when the number of epochs changes.
plt.xticks(list(df_stats.index))

plt.show()

# A. Saving & Loading Fine-Tuned Model

In [None]:
import os

# From here on, output_dir points at the run directory itself rather than
# its 'weight/' sub-directory.
output_dir = out_dir

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# If the model has been wrapped in an object exposing `.module`, save the
# wrapped model; otherwise save the model itself.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)