In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Input text
text = "The quick brown fox jumps over the lazy dog."

inputs = tokenizer(text, return_tensors="pt")

# Get model outputs (logits)
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Compute log probabilities
log_probs = torch.log_softmax(logits, dim=-1)

# Get log probability of the actual tokens in the input text
token_ids = inputs.input_ids
token_log_probs = log_probs[0, torch.arange(len(token_ids[0])), token_ids[0]]

print("Log probabilities for each token in the input text:")
print(token_log_probs)


Log probabilities for each token in the input text:
tensor([ -7.7088,  -6.7101,  -6.1687,  -8.1749, -11.5447,  -8.3943,  -7.8854,
         -4.2600,  -7.2404, -11.6968])


In [5]:
text1 = "He married a beautiful woman."

inputs = tokenizer(text1, return_tensors="pt")
input_ids = inputs.input_ids

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits

print(logits.shape)  # (batch_size, sequence_length, vocab_size)

log_probs = torch.log_softmax(logits, dim=-1)

# Get log probs of the actual tokens
token_log_probs = log_probs[0, torch.arange(input_ids.shape[1]-1), input_ids[0, 1:]]

# Compute perplexity
avg_log_prob = token_log_probs.mean()  # Mean log probability per token
perplexity = torch.exp(-avg_log_prob)  # Perplexity formula

print(perplexity.item())

torch.Size([1, 6, 50257])
62.60416030883789


In [6]:
text1 = "He married a beautiful woman."

# Tokenize the text
tokens = tokenizer.tokenize(text1)  # Returns a list of tokenized words
token_ids = tokenizer.convert_tokens_to_ids(tokens)  # Convert tokens to IDs

# Print results
print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokens: ['He', 'Ġmarried', 'Ġa', 'Ġbeautiful', 'Ġwoman', '.']
Token IDs: [1544, 6405, 257, 4950, 2415, 13]


## Understanding GPT-2 Tokenization: The `Ġ` Symbol in Tokens

### **Why Does `Ġ` Appear in Some Tokens?**
When using GPT-2's tokenizer, you may notice that some tokens have a **`Ġ`** prefix. This is because GPT-2 uses **Byte-Pair Encoding (BPE)**, where:
- The `Ġ` symbol represents a **space before the word**.
- GPT-2 **does not tokenize spaces separately**; instead, spaces are merged with words as a prefix.
- The model learns word relationships better when spaces are included in the tokenization.

### **Example Tokenization**
Let's tokenize the sentence `"He married a beautiful woman."`:

```python
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Input text
text = "He married a beautiful woman."

# Tokenize the text
tokens = tokenizer.tokenize(text)  
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print results
print("Tokens:", tokens)
print("Token IDs:", token_ids)

```

### Output

```python
Tokens: ['He', 'Ġmarried', 'Ġa', 'Ġbeautiful', 'Ġwoman', '.']
Token IDs: [1544, 6405, 257, 4950, 2415, 13]
```

### Explanation

- "He" → No Ġ because it's the first word.
- "Ġmarried" → Ġ means it was preceded by a space.
- "Ġa", "Ġbeautiful", "Ġwoman" → Also preceded by spaces.
- "." → No Ġ because punctuation is usually attached directly.

### How to Decode Token IDs Correctly?

You can use `tokenizer.decode()` to automatically restore spaces:

```python
decoded_text = tokenizer.decode(token_ids)
print(decoded_text)
```

### Key Takeaways

✅ The Ġ symbol represents a space before a word in GPT-2's tokenization.

✅ GPT-2 tokenizes based on subwords, keeping spaces attached.

✅ tokenizer.decode() removes Ġ and reconstructs the original text correctly.

✅ You can manually clean tokens with .replace("Ġ", "") if needed.

In [7]:
text1 = "He married a beautiful woman."

inputs = tokenizer(text1, return_tensors="pt")
input_ids = inputs.input_ids

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits

log_probs = torch.log_softmax(logits, dim=-1)

print(log_probs.shape)  # (batch_size, sequence_length, vocab_size)

torch.Size([1, 6, 50257])


### Here are the probabilities of all the tokens after "He married a beautiful" (here it's woman)

In [8]:
log_probabilities = log_probs[0][4]
log_probabilities.shape

torch.Size([50257])

We find the log probability of the word " woman" following "He married a beautiful"

In [9]:
woman_token_id = 2415

# # We can find this back with this piece of code:

# tokens = tokenizer.tokenize(" woman")
# token_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(token_ids)

log_prob = log_probabilities[woman_token_id]
print(log_prob.item())

-8.62405014038086


We look if woman was the most likely word to come after "He married a beautiful".

In [10]:
# find position of max probability
print(torch.argmax(log_probabilities))

tensor(11)


In [11]:
print(log_probabilities[11])

tensor(-1.7113)


In [12]:
token_id = 11

# Convert token ID to its corresponding word
decoded_token = tokenizer.decode([token_id])

print(f"Token ID {token_id} corresponds to: '{decoded_token}'")

Token ID 11 corresponds to: ','


Actually, the most likely word/token after "He married a beautiful" is a comma. But we wrote this sentence so it makes sense. " woman" can also be likely. Let's see for the token " car".

In [13]:
car_token = tokenizer.tokenize(" car")
car_token_id = tokenizer.convert_tokens_to_ids(car_token)[0]

car_log_prob = log_probabilities[car_token_id]

print(car_log_prob.item())

-12.70089340209961


The word " car" is logically less likely (10,000 less) to follow than " woman".

In [14]:
text1 = "He married a beautiful plane."

inputs = tokenizer(text1, return_tensors="pt")
input_ids = inputs.input_ids

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
# logits.shape == (batch_size, sequence_length, vocab_size)      # here (1, 6, 50257)

log_probs = torch.log_softmax(logits, dim=-1)
print("logits and log_probs shape:", logits.shape)

# Get log probs of the actual tokens
token_log_probs = log_probs[0, torch.arange(input_ids.shape[1]-1), input_ids[0, 1:]]
print("token_log_probs:", token_log_probs)

# Compute perplexity
avg_log_prob = token_log_probs.mean()  # Mean log probability per token
print("avg_log_prob:", avg_log_prob)
perplexity = torch.exp(-avg_log_prob)  # Perplexity formula

print(perplexity.item())

logits and log_probs shape: torch.Size([1, 6, 50257])
token_log_probs: tensor([ -9.4455,  -1.9388,  -4.6002, -10.0217,  -3.0335])
avg_log_prob: tensor(-5.8080)
332.9397277832031


In [17]:
log_probs[0, torch.arange(input_ids.shape[1]), input_ids[0, :]]

tensor([ -8.8688,  -8.6120,  -7.8501,  -5.0303,  -7.6311, -11.0583])

In [16]:
input_ids[0, 1:]

tensor([6405,  257, 4950, 6614,   13])

In [23]:
text1 = "He married a beautiful plane."

# Tokenize input text
inputs = tokenizer(text1, return_tensors="pt")
input_ids = inputs.input_ids

# Compute logits (predictions)
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Convert logits to log probabilities
log_probs = torch.log_softmax(logits, dim=-1)

# 1️⃣ First token: log probability of appearing as first token
first_token_log_prob = log_probs[0, 0, input_ids[0, 0]]

# 2️⃣ Rest of the tokens: log probabilities of being predicted
predicted_log_probs = log_probs[0, torch.arange(input_ids.shape[1] - 1), input_ids[0, 1:]]

# Combine them into one tensor
all_log_probs = torch.cat([first_token_log_prob.unsqueeze(0), predicted_log_probs])

# Print results
print("Log probability of first token:", first_token_log_prob.item())
print("Log probabilities of predicted tokens:", predicted_log_probs)
print("All log probabilities:", all_log_probs)
print()


# Compute perplexity
avg_log_prob = all_log_probs.mean()  # Mean log probability per token
perplexity = torch.exp(-avg_log_prob)  # Perplexity formula
print("Perplexity:", perplexity.item())

Log probability of first token: -8.868807792663574
Log probabilities of predicted tokens: tensor([ -9.4455,  -1.9388,  -4.6002, -10.0217,  -3.0335])
All log probabilities: tensor([ -8.8688,  -9.4455,  -1.9388,  -4.6002, -10.0217,  -3.0335])

Perplexity: 554.5197143554688


# Perplexity computation function

In [85]:
def compute_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs.input_ids

    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    log_probs = torch.log_softmax(logits, dim=-1)

    # Get log probs of the actual tokens
    token_log_probs = log_probs[0, torch.arange(input_ids.shape[1]-1), input_ids[0, 1:]]

    # Compute perplexity
    avg_log_prob = token_log_probs.mean()  # Mean log probability per token
    perplexity = torch.exp(-avg_log_prob)  # Perplexity formula

    return perplexity.item()

# Example usage
text1 = "He married a beautiful woman."
text2 = "He married a beautiful car."

ppl1 = compute_perplexity(text1)
ppl2 = compute_perplexity(text2)

print(f"Perplexity of first sentence: {ppl1:.2f}")
print(f"Perplexity of second sentence: {ppl2:.2f}")


Perplexity of first sentence: 62.60
Perplexity of second sentence: 306.76
