In [None]:
!pip install -q transformers torch torchvision

In [None]:
import pandas as pd

# Name of the pretrained checkpoint that is loaded for both the tokenizer and
# the model, and the batch size used by the data loaders.
model_checkpoint = "gpt2"
batch_size = 2

# Read the whole CSV into a DataFrame; pandas opens and closes the file itself.
csv_data = pd.read_csv('100KStories.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Keep only the first 500 rows so the notebook trains quickly.
train_set = csv_data[:500]
# Prompt text = sentences 1-3 of each row, target text = sentences 4-5.
train_set_input = train_set['sentence1'] + ' ' + train_set['sentence2'] + ' ' + train_set['sentence3']
train_set_output = train_set['sentence4'] + ' ' + train_set['sentence5']
# Preview a handful of prompts instead of dumping all 500 into the cell output.
print(train_set_input.head().values.tolist())

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config
# Special tokens registered on top of the pretrained vocabulary: explicit
# start / end markers and a dedicated padding token.
special_tokens = dict(
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint, **special_tokens)

class GPT2Dataset(Dataset):
  """Causal-LM fine-tuning examples built from five-sentence rows.

  Every row of ``txt_list`` is turned into ONE token sequence::

      <|startoftext|> sentence1 sentence2 sentence3 sentence4 sentence5 <|endoftext|>

  padded / truncated to ``max_length``. The labels are a copy of that same
  sequence, because GPT2LMHeadModel shifts the labels internally: feeding
  labels that come from a *different* text than ``input_ids`` (as the
  previous version did) trains the model on misaligned targets. Positions
  that must not contribute to the loss are set to -100, the index the loss
  ignores:

  * the prompt (``<|startoftext|>`` + sentences 1-3), so only the
    continuation (sentences 4-5 and ``<|endoftext|>``) is learned;
  * every padding position, so the loss is not dominated by ``<|pad|>``.

  Args:
    txt_list: DataFrame with string columns ``sentence1`` .. ``sentence5``.
    tokenizer: callable tokenizer returning ``input_ids`` and
      ``attention_mask``; it must know the ``<|startoftext|>`` and
      ``<|endoftext|>`` special tokens and have a padding token.
    gpt2_type: unused, kept for backward compatibility.
    max_length: fixed length every example is padded / truncated to.

  Each item is a tuple ``(input_ids, attention_mask, labels)`` of 1-D
  tensors of length ``max_length``.
  """

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
    prompts = (txt_list['sentence1'] + ' ' + txt_list['sentence2'] + ' ' + txt_list['sentence3']).values.tolist()
    continuations = (txt_list['sentence4'] + ' ' + txt_list['sentence5']).values.tolist()
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    self.label_ids = []
    for prompt, continuation in zip(prompts, continuations):
      prompt_text = '<|startoftext|>' + prompt
      full_text = prompt_text + ' ' + continuation + '<|endoftext|>'

      encodings = tokenizer(full_text, truncation=True, max_length=max_length, padding="max_length")
      input_ids = torch.tensor(encodings['input_ids'])
      attn_mask = torch.tensor(encodings['attention_mask'])

      # Number of leading tokens that belong to the prompt. The prompt is
      # tokenized on its own; clamp in case it alone exceeds max_length
      # (such an example then has no supervised position at all).
      prompt_len = min(len(tokenizer(prompt_text)['input_ids']), max_length)

      labels = input_ids.clone()
      labels[attn_mask == 0] = -100   # ignore padding
      labels[:prompt_len] = -100      # ignore the prompt, learn the continuation

      self.input_ids.append(input_ids)
      self.label_ids.append(labels)
      self.attn_masks.append(attn_mask)

  def __len__(self):
    """Number of examples (one per input row)."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, labels)`` for example ``idx``."""
    return self.input_ids[idx], self.attn_masks[idx], self.label_ids[idx]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Tokenize the selected rows and split them 80 / 20 into train / validation.
dataset = GPT2Dataset(train_set, tokenizer, max_length=768)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# The global RNG seeds are only set in a later cell, so the split gets its own
# seeded generator; otherwise the train/validation membership would change on
# every run. 42 matches the seed value used for training.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Order is irrelevant for validation, so those batches are read sequentially.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [None]:
import numpy as np
import random
# Seed every RNG *before* the model is built: resizing the embedding matrix
# below creates newly initialised rows for the added special tokens, so seeding
# afterwards (as before) left the starting weights non-reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# The configuration is loaded as-is; nothing is customised here except that
# hidden states are not returned.
configuration = GPT2Config.from_pretrained(model_checkpoint, output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained(model_checkpoint, config=configuration)

# this step is necessary because some tokens (bos_token, etc) were added to the
# tokenizer; otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Run on the GPU when one is available, otherwise fall back to the CPU instead
# of crashing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
import datetime
import time
# AdamW now comes from PyTorch: the copy that used to live in `transformers`
# is deprecated and no longer importable from recent releases.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 3
learning_rate = 5e-4
warmup_steps = 100
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

# weight_decay=0.0 keeps the behaviour of the previous optimizer, whose default
# was no weight decay (torch's AdamW defaults to 0.01).
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon,
                  weight_decay = 0.0
                )

# One scheduler step is taken per training batch, for every epoch.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

def format_time(elapsed):
    """Render a duration given in seconds as a ``datetime.timedelta`` string.

    ``elapsed`` is rounded to the nearest whole second first, so the result
    looks like ``h:mm:ss`` (with a leading ``N day(s), `` part for durations
    of a day or more).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)



In [None]:
# Fine-tuning driver: for each epoch, run one pass over train_dataloader with
# gradient updates, then one pass over validation_dataloader without gradients,
# and record the average losses and timings in training_stats.

# Release cached GPU memory and run the garbage collector before starting.
torch.cuda.empty_cache()
import gc
gc.collect()
total_t0 = time.time()

# One dict of summary statistics is appended here at the end of every epoch.
training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    # Sum of the per-batch losses of this epoch (averaged after the loop).
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Each batch is the (input_ids, attention_mask, labels) tuple produced
        # by the dataset's __getitem__.
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        # The first element of the model output is used as the loss.
        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        # (step 0 is skipped, so the first sample appears at step == sample_every)
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            # Temporarily switch to eval mode to sample text from the model.
            model.eval()

            # No prompt is passed: generation starts from a randomly chosen
            # token id, which is supplied as the bos token.
            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 200,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            # Back to training mode before the backward pass below.
            model.train()

        # Backpropagate, update the weights, then advance the LR schedule
        # (one scheduler step per batch).
        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    # NOTE(review): nb_eval_steps is initialised but never updated or read in
    # this loop.
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Forward pass only; no gradients are tracked during validation.
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)

            loss = outputs[0]  

        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    # Average validation loss over all validation batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    200. Loss: 0.21794334053993225.   Elapsed: 0:01:36.
0:  bipartisanIn first he she he...

  Average training loss: 0.55
  Training epoch took: 0:03:11

Running Validation...
  Validation Loss: 0.21
  Validation took: 0:00:15

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    200. Loss: 0.2036871463060379.   Elapsed: 0:01:36.
0:  increasingI, his.. the had them.. had to,. it

  Average training loss: 0.20
  Training epoch took: 0:03:11

Running Validation...
  Validation Loss: 0.21
  Validation took: 0:00:15

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of    200. Loss: 0.18419426679611206.   Elapsed: 0:01:36.
0: dayJ was it the.!. on to the the. was. it!

  Average training loss: 0.18
  Training epoch took: 0:03:15

Running Validation...
  Validation Loss: 0.21
  Validation took: 0:00:15

Training complete!
Total training took 0:10:22 (h:mm:ss)


In [None]:
import os
output_dir = './model_save/'

# Make sure the destination folder is there before anything is written to it.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# When the model is wrapped for distributed / parallel training, the real
# network lives in `.module`; otherwise the model itself is saved.
model_to_save = getattr(model, 'module', model)

# `save_pretrained()` writes the weights, configuration and tokenizer files so
# that they can be reloaded later with `from_pretrained()`.
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

Saving model to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.json',
 './model_save/merges.txt',
 './model_save/added_tokens.json')

In [None]:
!ls -l --block-size=K ./model_save/
!ls -l --block-size=M ./model_save/pytorch_model.bin

total 499796K
-rw-r--r-- 1 root root      1K Mar  2 06:24 added_tokens.json
-rw-r--r-- 1 root root      1K Mar  2 06:24 config.json
-rw-r--r-- 1 root root    446K Mar  2 06:24 merges.txt
-rw-r--r-- 1 root root 498448K Mar  2 06:24 pytorch_model.bin
-rw-r--r-- 1 root root      1K Mar  2 06:24 special_tokens_map.json
-rw-r--r-- 1 root root      1K Mar  2 06:24 tokenizer_config.json
-rw-r--r-- 1 root root    878K Mar  2 06:24 vocab.json
-rw-r--r-- 1 root root 487M Mar  2 06:24 ./model_save/pytorch_model.bin


In [None]:
# The inner single quotes are deliberate: data_dir is substituted verbatim into
# the shell command below, where the quotes keep the folder names that contain
# spaces together as a single path.
data_dir = os.path.join('/content/drive/', "'My Drive'","'Colab Notebooks'")
# Recursively copy the saved model directory to that location
# (IPython expands $data_dir before running the command).
!cp -r ./model_save/ $data_dir

In [None]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 2s (882 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155320 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [None]:
model_name = "boundmc-finetuned-rocstories"
# Push the tokenizer as well: it carries the special tokens that were added
# before training, and the model's embedding matrix was resized to match it.
# A hub repo containing only the weights cannot be used with a stock tokenizer.
tokenizer.push_to_hub(model_name)
model.push_to_hub(model_name)

Cloning https://huggingface.co/msintaha/gpt2-finetuned-rocstories into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.38k/487M [00:00<?, ?B/s]

To https://huggingface.co/msintaha/gpt2-finetuned-rocstories
   ca2bf84..3adbeee  main -> main



'https://huggingface.co/msintaha/gpt2-finetuned-rocstories/commit/3adbeeef5411c9539192d88bb5a236ae97731b0e'

In [None]:
# Save under the user-qualified repository name and upload it in the same call.
hub_repo_id = "msintaha/{}".format(model_name)
model.save_pretrained(hub_repo_id, push_to_hub=True)

Cloning https://huggingface.co/msintaha/gpt2-finetuned-rocstories into local empty directory.


Download file pytorch_model.bin:   0%|          | 1.83k/487M [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/487M [00:00<?, ?B/s]

(None,)

In [None]:
# Inference only from here on.
model.eval()

prompt = "<|startoftext|>Gloria wanted to find a special new dress for her anniversary dinner."

# Encode the prompt and add a leading batch dimension of size one, then move
# it to the same device as the model.
prompt_ids = tokenizer.encode(prompt)
generated = torch.tensor(prompt_ids).unsqueeze(0).to(device)

print(generated)

# Sampling settings: top-k / nucleus sampling, five continuations of the prompt.
generation_kwargs = dict(
    do_sample=True,
    top_k=50,
    max_length=500,
    top_p=0.95,
    num_return_sequences=5,
)
sample_outputs = model.generate(generated, **generation_kwargs)

for i, sample_output in enumerate(sample_outputs):
  decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
  print("{}: {}\n\n".format(i, decoded))