# GPT-2
<b>Date:</b> October 6, 2023\
<b>Author:</b> Dimitris Lymperopoulos\
<b>Description:</b> A notebook for experimenting with GPT-2 language models using the Hugging Face Transformers library

## Package Installation
Run the cell below to download and install the Python packages this notebook depends on (`transformers`, `torch` and `pylev`).

In [None]:
!pip install transformers
!pip install torch
!pip install pylev

## Imports

In [78]:
import torch
import numpy as np
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from pylev import levenshtein as lev_dist

In [1]:
# Execute the notebook at ../functions/GPT2_functions.ipynb inside this kernel.
# NOTE(review): presumably this supplies shared GPT-2 helper functions; this notebook
# also defines model_init and sent_scoring itself further down - confirm which
# definitions are meant to be in effect.
%run ../functions/GPT2_functions.ipynb

  from .autonotebook import tqdm as notebook_tqdm


## Additional Functions Definition

In [72]:
def model_init(model_string='gpt2', cuda=False):
    """
    A function that initializes a LM and a Tokenizer based on GPT2 (or on OpenAI GPT as a fallback).

    The GPT-2 classes are used whenever the name contains "gpt2" (case-insensitive), so that
    GPT-2 checkpoints whose name does not *start* with "gpt2" (e.g. "distilgpt2" or a
    namespaced name such as "openai-community/gpt2") are not loaded with the OpenAI GPT classes.
    Every other name is loaded with the OpenAI GPT classes.

    :param model_string: string representing the base model for the transformer and the tokenizer
    :param cuda: boolean value, determining whether or not to use gpu for model inference
    :return: the pretrained model (switched to evaluation mode) and tokenizer
    """
    if "gpt2" in model_string.lower():
        tokenizer_cls, model_cls = GPT2Tokenizer, GPT2LMHeadModel
    else:
        tokenizer_cls, model_cls = OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

    tokenizer = tokenizer_cls.from_pretrained(model_string)
    model = model_cls.from_pretrained(model_string)

    # Inference only: put the model in evaluation mode before it is used for scoring.
    model.eval()
    if cuda:
        model.to('cuda')
    print("Model init")
    return model, tokenizer

In [73]:
def sent_scoring(model, tokenizer, text, cuda=False):
    """
    A function that uses the given LM and Tokenizer to score a given sentence.

    The model is called as ``model(tokens, labels=tokens)`` and its first two outputs are
    taken to be the loss and the logits. In a causal LM the logits at position ``i`` score
    the token at position ``i + 1``, so the last token of the sentence is scored with the
    logits of the second-to-last position, normalised with ``log_softmax``.

    :param model: a pretrained transformer model
    :param tokenizer: a pretrained tokenizer
    :param text: a string representing the sentence whose probability will be computed
    :param cuda: boolean value, determining whether or not to use gpu for model inference
    :return: the computed loss of the sentence and the log-probability of its last token given
        the preceding tokens (``nan`` when the sentence consists of a single token, because
        there is no preceding context to condition on)
    :raises ValueError: if ``text`` produces no tokens at all
    """
    assert model is not None
    assert tokenizer is not None
    tokens = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
    n_tokens = tokens.shape[-1]
    if n_tokens == 0:
        raise ValueError("text must contain at least one token")
    if cuda:
        tokens = tokens.to('cuda')
    with torch.no_grad():
        outputs = model(tokens, labels=tokens)
    loss, logits = outputs[:2]

    if n_tokens < 2:
        # A lone token has no left context, so there is no position that predicts it.
        log_prob = float('nan')
    else:
        # Position -2 predicts the final token; log_softmax turns raw logits into log-probs.
        last_token_log_probs = torch.log_softmax(logits[0, -2], dim=-1)
        log_prob = last_token_log_probs[tokens[0, -1]].item()
    return loss.item(), log_prob

## Experiments

In [74]:
# Build the model/tokenizer pair shared by all experiments below,
# spelling out model_init's default arguments (base "gpt2" checkpoint, no GPU).
model, tokenizer = model_init(model_string='gpt2', cuda=False)

Model init


In [75]:
# Experiment 1
# Score each sentence with the language model and print its loss.
sentences = ['A dog is embraced by the woman', 'A woman is embraced by the table', 'A is embraced the woman']

for i, s in enumerate(sentences, start=1):
    # Only the loss is reported; the second value returned by sent_scoring is not used here
    # (it was previously passed to format() as an extra argument that the template ignored).
    loss = sent_scoring(model, tokenizer, s)[0]
    print("{}.\nSentence: {}\nLoss: {}".format(i, s, loss), end="\n\n")

1.
Sentence: A dog is embraced by the woman
Loss: 5.209733486175537

2.
Sentence: A woman is embraced by the table
Loss: 5.908300876617432

3.
Sentence: A is embraced the woman
Loss: 7.431522369384766



In [76]:
# Experiment 2
# Score each sentence with the language model and print its loss.
sentences = ['A table is hugging the man', 'A woman is hugging the man', 'The man is hugged by the woman']

for i, s in enumerate(sentences, start=1):
    # Only the loss is reported; the second value returned by sent_scoring is not used here
    # (it was previously passed to format() as an extra argument that the template ignored).
    loss = sent_scoring(model, tokenizer, s)[0]
    print("{}.\nSentence: {}\nLoss: {}".format(i, s, loss), end="\n\n")

1.
Sentence: A table is hugging the man
Loss: 6.892320156097412

2.
Sentence: A woman is hugging the man
Loss: 5.655217170715332

3.
Sentence: The man is hugged by the woman
Loss: 4.746158123016357



In [86]:
# Experiment 3 - Levenshtein distance as closeness metric
sentence_pair = ['A woman is hugging the man', 'The man is hugged by the woman']

# Lower-case and whitespace-split both sentences, so lev_dist receives two lists of words.
s1, s2 = (sentence.lower().split() for sentence in sentence_pair)

# TODO: perhaps lemmatize before computing the distance (?)
d = lev_dist(s1, s2)
print("Levenshtein Distance: {}".format(d))

Levenshtein Distance: 5
