# Load the fine-tuned GPT-2 model, generate text, and measure the perplexity of a sentence

## Pieces of code were borrowed from the following public repositories and tutorials:
### https://mccormickml.com/2019/07/22/BERT-fine-tuning/
### https://snappishproductions.com/blog/2020/03/01/chapter-9-text-generation-with-gpt-2-and-only-pytorch.html.html
### Hugging Face Language model fine-tuning script https://huggingface.co/transformers/v2.0.0/examples.html#language-model-fine-tuning
### Rey Farhan: Easy GPT2 fine-tuning with Hugging Face and PyTorch https://colab.research.google.com/drive/13dZVYEOMhXhkXWfvSMVM1TTtUDrT6Aeh?usp=sharing#scrollTo=EKOTlwcmxmej
### Code for measuring perplexity is based on https://huggingface.co/transformers/perplexity.html and taken from https://github.com/huggingface/transformers/issues/4147

In [None]:
# Install the Hugging Face transformers library (the run recorded below resolved to version 4.9.1).
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 51.1 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 65.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 
import time
import datetime
from google.colab import drive

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt
% matplotlib inline

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Mount Google Drive so the training data stored there is reachable from Colab.
gdrive_dir = '/content/drive/'
# The single quotes are part of the path components themselves; presumably they
# protect the spaces when data_dir is interpolated into a shell command.
data_dir = os.path.join(
    gdrive_dir,
    "'My Drive'",
    "'Colab Notebooks'",
)

# force_remount=True remounts even when the drive is already mounted.
drive.mount(gdrive_dir, force_remount=True)

Mounted at /content/drive/


In [None]:
cd '/content/drive/MyDrive/Colab Notebooks/model_save'

/content/drive/MyDrive/Colab Notebooks/model_save


In [None]:
import os
# Make the saved-model directory the working directory, then list its contents.
os.chdir("/content/drive/MyDrive/Colab Notebooks/model_save")
!ls

added_tokens.json  merges.txt	      special_tokens_map.json  vocab.json
config.json	   pytorch_model.bin  tokenizer_config.json


In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# List the saved-model directory with file sizes reported in kilobytes.
!ls -l --block-size=K "/content/drive/MyDrive/Colab Notebooks/model_save"

total 499775K
-rw------- 1 root root      1K Jul  6 06:47 added_tokens.json
-rw------- 1 root root      1K Jul  6 06:47 config.json
-rw------- 1 root root    446K Jul  6 06:47 merges.txt
-rw------- 1 root root 498448K Jul  6 06:47 pytorch_model.bin
-rw------- 1 root root      1K Jul  6 06:47 special_tokens_map.json
-rw------- 1 root root      1K Jul  6 06:47 tokenizer_config.json
-rw------- 1 root root    878K Jul  6 06:47 vocab.json


In [None]:

!ls -l --block-size=M "/content/drive/MyDrive/Colab Notebooks/model_save/pytorch_model.bin"

# Copy the model files to a directory in your Google Drive.
#!cp -r '/content/drive/MyDrive/Colab Notebooks/model_save' $data_dir
output_dir = '/content/drive/MyDrive/Colab Notebooks/model_save'
# # Load a trained model and vocabulary that you have fine-tuned
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model.to(device)

-rw------- 1 root root 487M Jul  6 06:47 '/content/drive/MyDrive/Colab Notebooks/model_save/pytorch_model.bin'


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [None]:
# Switch to evaluation mode before generating.
model.eval()
sentence = 'CSS Dimensions'
# Encode the prompt and place it on the same device as the model. Leaving the
# ids on the CPU while the model sits on the GPU raises a device-mismatch error.
input_ids = tokenizer.encode(sentence, return_tensors='pt').to(device)
#greedy output
#output= model.generate(input_ids, max_length=1024)
#best-possible output achieved by adding more parameters
# Beam search with 5 beams, output length pinned to 1024 tokens, repeated
# 2-grams blocked; the result is moved back to the CPU for decoding.
# NOTE(review): top_k/top_p are sampling filters and do_sample is not enabled
# here, so they presumably have no effect on this call — confirm against the
# generate() documentation.
output = model.generate(input_ids, min_length=1024, max_length=1024, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, early_stopping=True).to('cpu')
# average result param
#output = model.generate(input_ids, do_sample=True, max_length=50, top_p=0.92, top_k=0)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [None]:
# Display the tensor of generated token ids.
output

tensor([[49155, 41265,   389,  ...,   340, 12229,   663]])

In [None]:
# Convert the first generated sequence back to text, dropping special tokens.
print(tokenizer.decode(output[0], skip_special_tokens=True))


CSS Dimensions are measured in pixels by using the square root of the x- and y-axis.,The, property specifies the first browser version that fully supports the 
, property.,For a complete list of all available HTML tags, visit our,.,Get certified,by completing,a course today! Exercises in,tuition,tuples, and Matplotlib are now 100% complete and you don't need a registration to use them!,Good luck!,If you have any +/- comments or questions, kindly feel free to send me an e-mail at,gives me a small percentage (at no extra cost to you) and I will try to answer them in a nice and open-ended manner.,I'm currently working as a Software Engineer at Joyent, which means I'm actively involved in the development and testing of HTML, CSS, XHTML, jQuery, Bootstrap, Webpack, VB Scripting, angular and angular-components. I do my best to keep my time and motivation to myself, so if you find me useful, I would be greatly appreciated and look forward to reading your comments and contributions to the res

# Perplexity

In [None]:
import math

In [None]:
 def calculatePerplexity(sentence,model,tokenizer):
     """Return the perplexity of ``sentence`` under ``model``.

     The sentence is tokenized, fed to the model with the same ids as labels,
     and the exponential of the returned loss is reported.

     Args:
         sentence: Text to score.
         model: Callable invoked as ``model(input_ids, labels=input_ids)`` whose
             result has the loss as its first element.
         tokenizer: Object whose ``encode(sentence)`` returns the token ids.

     Returns:
         ``exp(loss)`` as a Python float.

     Raises:
         ValueError: If the tokenizer produces no tokens for ``sentence``.

     Note:
         This function does not change the model's train/eval mode; call
         ``model.eval()`` beforehand if that matters for the score.
     """
     token_ids = tokenizer.encode(sentence)
     if len(token_ids) == 0:
         raise ValueError("calculatePerplexity: sentence produced no tokens")
     input_ids = torch.tensor(token_ids).unsqueeze(0)

     # Run on whatever device the model's weights live on instead of forcing
     # the CPU, which breaks as soon as the model has been moved to the GPU.
     try:
         target_device = next(model.parameters()).device
     except (AttributeError, StopIteration):
         # No inspectable parameters: keep the original CPU placement.
         target_device = torch.device('cpu')
     input_ids = input_ids.to(target_device)

     with torch.no_grad():
         outputs = model(input_ids, labels=input_ids)
     loss = outputs[0]
     # float() accepts both a 0-dim tensor (on any device) and a plain number.
     return math.exp(float(loss))

In [None]:
# Perplexity of the prompt string `sentence` under the loaded model.
# NOTE(review): the recorded result below (~1.19e7) is very large — worth sanity-checking.
print(calculatePerplexity(sentence, model, tokenizer))

11914676.948942969
