In [None]:
from functools import partial

import torch
import transformers
from torch import Tensor
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import accelerate
import random
from pathlib import Path
from datasets import load_dataset

# SST-2

In [None]:
dataset = load_dataset("sst2")

In [None]:
# Prompt template shared by both datasets: slot {0} receives the text,
# slot {1} the sentiment word.
formatting_prompt = """Text: {0}
Classify the text into negative or positive sentiment.
Sentiment: {1}
"""

def format_text(text, label):
    """Fill `formatting_prompt` with one text and its sentiment label.

    Returns the filled template as a string (it ends with a newline).
    """
    return formatting_prompt.format(text, label)

def read_sst_split(dataset_split):
  """Convert an SST-2 split into prompt strings and sentiment words.

  Reads the 'sentence' and 'label' columns of `dataset_split`. A label equal
  to 1 becomes 'positive'; every other value becomes 'negative'.

  Returns:
    (texts, labels): `texts` holds the prompts built by `format_text` (the
    label is already filled in), `labels` the matching sentiment words.
  """
  pairs = [
      (sentence, 'positive' if raw_label == 1 else 'negative')
      for sentence, raw_label in zip(dataset_split['sentence'], dataset_split['label'])
  ]
  texts = [format_text(sentence, sentiment) for sentence, sentiment in pairs]
  labels = [sentiment for _, sentiment in pairs]
  return texts,labels

In [None]:
sst_texts, sst_labels = read_sst_split(dataset['validation'])

# Fine tuning gpt2 on IMDB for CLM objective

## Prepare dataset

In [None]:
dataset = load_dataset("imdb")

In [None]:
def read_imdb_split(dataset_split):
  """Convert an IMDB split into prompt strings and sentiment words.

  Same contract as `read_sst_split`, but the review is read from the 'text'
  column. A label equal to 1 becomes 'positive'; anything else 'negative'.

  Returns:
    (texts, labels): prompts built by `format_text` and their sentiment words.
  """
  texts, labels = [], []
  for review, raw_label in zip(dataset_split['text'], dataset_split['label']):
    sentiment = 'negative' if raw_label != 1 else 'positive'
    texts.append(format_text(review, sentiment))
    labels.append(sentiment)
  return texts,labels

In [None]:
train_texts, train_labels = read_imdb_split(dataset['train'])
test_texts, test_labels = read_imdb_split(dataset['test'])

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
train_texts[0], train_labels[0]

In [None]:
# Dump the formatted training prompts to Drive so they can be read back for
# fine-tuning. The absolute path matches the mount point ('/content/drive')
# and `train_file_path` used by the training cell, so the writer and the
# trainer agree regardless of the current working directory.
file_path_train = '/content/drive/MyDrive/train.txt'
# The context manager closes the file even if a write fails; the encoding is
# explicit so the result does not depend on the platform default.
with open(file_path_train, 'w', encoding='utf-8') as text_data_train:
  # Each prompt already ends with a newline (see formatting_prompt).
  text_data_train.writelines(train_texts)

In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a `transformers.TextDataset` from the text file at `file_path`.

    NOTE: this definition reuses the name of `datasets.load_dataset` imported
    at the top of the notebook. Once this cell has run, the earlier
    `load_dataset("sst2")` / `load_dataset("imdb")` cells can no longer be
    re-executed without restarting the kernel, because the name now points here.

    Args:
        file_path: path of the plain-text training file.
        tokenizer: tokenizer handed to TextDataset.
        block_size: forwarded to TextDataset (default 128).
    """
    return transformers.TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )

## Train

In [None]:
def load_data_collator(tokenizer, mlm = False):
    """Return a `DataCollatorForLanguageModeling` bound to `tokenizer`.

    `mlm` is forwarded unchanged; it defaults to False.
    """
    return transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,
          model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          ):
  """Fine-tune the GPT-2 checkpoint `model_name` on a plain-text file.

  The tokenizer and the not-yet-trained model are written to `output_dir`
  before training starts; the trained model is saved again once training
  has finished. `output_dir`, `overwrite_output_dir`,
  `per_device_train_batch_size` and `num_train_epochs` are forwarded to
  `transformers.TrainingArguments`.
  """
  lm_tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_name)
  # `load_dataset` is the TextDataset helper defined in this notebook.
  text_blocks = load_dataset(train_file_path, lm_tokenizer)
  collate_fn = load_data_collator(lm_tokenizer)
  lm_tokenizer.save_pretrained(output_dir)

  lm_model = transformers.GPT2LMHeadModel.from_pretrained(model_name)
  # Snapshot of the weights as loaded, written before any training step.
  lm_model.save_pretrained(output_dir)

  run_config = dict(
      output_dir=output_dir,
      overwrite_output_dir=overwrite_output_dir,
      per_device_train_batch_size=per_device_train_batch_size,
      num_train_epochs=num_train_epochs,
  )
  trainer = transformers.Trainer(
      model=lm_model,
      args=transformers.TrainingArguments(**run_config),
      data_collator=collate_fn,
      train_dataset=text_blocks,
  )

  trainer.train()
  trainer.save_model()

In [None]:
train_file_path = '/content/drive/MyDrive/train.txt'
model_name = 'gpt2'
output_dir = '/content/drive/MyDrive/fine_tuned_models'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 2.0

In [None]:
# train(
#     train_file_path=train_file_path,
#     model_name=model_name,
#     output_dir=output_dir,
#     overwrite_output_dir=overwrite_output_dir,
#     per_device_train_batch_size=per_device_train_batch_size,
#     num_train_epochs=num_train_epochs,
# )

## Inference

In [None]:
def load_model(model_path):
    """Load a `GPT2LMHeadModel` from `model_path` via `from_pretrained`."""
    return transformers.GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load the `GPT2Tokenizer` stored at `tokenizer_path`.

    NOTE(review): `truncation` and `max_length` are passed at load time here,
    while `generate_text` passes them again on every `encode` call - confirm
    that the load-time values have any effect.
    """
    load_kwargs = {"truncation": True, "max_length": 1023}
    return transformers.GPT2Tokenizer.from_pretrained(tokenizer_path, **load_kwargs)


def generate_text(sequence, max_new_tokens, model, tokenizer):
    """Sample a continuation of `sequence` and return prompt + continuation.

    Args:
        sequence: the prompt; it is formatted into a string before encoding.
        max_new_tokens: number of tokens to generate after the prompt.
        model: causal LM exposing `generate` and `config`
            (a GPT2LMHeadModel in this notebook).
        tokenizer: matching tokenizer exposing `encode` and `decode`.

    Returns:
        The first generated sequence decoded with special tokens skipped.

    Generation samples (do_sample=True, top_k=50, top_p=0.95), so repeated
    calls can return different text unless the caller seeds the random state.
    """
    # Prompt and generated tokens share one context window. A fixed prompt
    # limit of 1023 leaves room for a single new token in a 1024-position
    # window, so reserve room for every requested token instead. For
    # max_new_tokens == 1 and a 1024-position model the limit is still 1023.
    context_size = getattr(model.config, 'n_positions', None) or 1024
    prompt_budget = max(1, context_size - max_new_tokens)
    # NOTE(review): truncation happens on the tokenizer's configured side; if
    # that is the right-hand side, an over-long prompt loses its trailing
    # "Sentiment:" cue - confirm this is acceptable for long reviews.
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt', truncation=True, max_length=prompt_budget)
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_new_tokens=max_new_tokens,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [None]:
model_path = output_dir
model = load_model(model_path)
tokenizer = load_tokenizer(model_path)

### zero-shot classification

In [None]:
sequence = """Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Classify the text into negative or positive sentiment.
Sentiment:"""
max_new_tokens = 2
print(generate_text(sequence, max_new_tokens, model, tokenizer))

In [None]:
sequence = """Text: A waste of time.
Classify the text into negative or positive sentiment.
Sentiment:"""
max_new_tokens = 2
print(generate_text(sequence, max_new_tokens, model, tokenizer))

In [None]:
sequence = """Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Classify the text into negative or positive sentiment.
Sentiment:"""
max_new_tokens = 2
print(generate_text(sequence, max_new_tokens, model, tokenizer))

In [None]:
def classification(data, model, tokenizer):
  """Predict the sentiment word for every labelled prompt in `data`.

  Each item must contain the marker "Sentiment:" at the start of a line,
  followed by the gold label. The text up to and including the last marker
  is sent to `generate_text` for one new token; whatever follows the marker
  in the generated text is the prediction.

  Returns:
    (predictions, labels): stripped predictions ('' when the marker is
    missing from the generated text) and stripped, case-folded gold labels.

  Raises:
    ValueError: if an item of `data` does not contain the marker.
  """
  predictions = []
  labels = []
  separator = '\nSentiment:'
  for sequence in data:
    # Split on the LAST marker so prompts whose demonstrations also contain
    # the marker are handled instead of failing on tuple unpacking.
    prompt, found, label = sequence.rpartition(separator)
    if not found:
      raise ValueError(f"prompt has no {separator!r} marker: {sequence!r}")
    generated = generate_text(prompt+separator,1,model,tokenizer)
    _, found, answer = generated.rpartition(separator)
    if found:
      predictions.append(answer.strip())
    else:
      # The marker did not survive generation (e.g. the prompt was
      # truncated): show what came back and record an empty prediction.
      print(generated.split(separator))
      predictions.append('')
    labels.append(label.strip().casefold())
  return predictions, labels

In [None]:
#IMDB
preds, labels = classification(test_texts[:100], model, tokenizer)

In [None]:
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

In [None]:
#SST2
preds, labels = classification(sst_texts, model, tokenizer)
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

In [None]:
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

#### One-shot classification

In [None]:
def prepend_example(text, example):
    """Insert `text` into the `{0}` slot of the demonstration template `example`."""
    return example.format(text)

# One-shot demonstration (correctly labelled); the prompt to classify is
# substituted for {0}.
example = """Text: A waste of time. The plot is very boring and the actors are very bad.
Classify the text into negative or positive sentiment.
sentiment: negative

{0}"""

In [None]:
preds, labels = classification([prepend_example(text,example) for text in sst_texts], model, tokenizer)

In [None]:
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

In [None]:
#incorrect example
incorrect_example = """Text: A waste of time. The plot is very boring and the actors are very bad.
Classify the text into negative or positive sentiment.
sentiment: positive

{0}"""

In [None]:
preds, labels = classification([prepend_example(text,incorrect_example) for text in sst_texts], model, tokenizer)
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

# Before fine-tuning

In [None]:
model_name = "gpt2"
model = transformers.GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_name)

In [None]:
sequence = """Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Classify the text into negative or positive sentiment.
Sentiment:"""
max_new_tokens = 1
print(generate_text(sequence, max_new_tokens, model, tokenizer))

In [None]:
sequence = """Text: A waste of time.
Classify the text into negative or positive sentiment.
Sentiment:"""
print(generate_text(sequence, max_new_tokens, model, tokenizer))

In [None]:
sequence = """Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Classify the text into negative or positive sentiment.
Sentiment:"""
print(generate_text(sequence, max_new_tokens, model, tokenizer))

### IMDB

In [None]:
#IMBD
preds, labels = classification(test_texts[:100], model, tokenizer)

In [None]:
#IMBD
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

Before fine-tuning, GPT-2 is not able to perform sentiment analysis with zero-shot prompting.

Let's see what happens if we give it one example:

In [None]:
# NOTE: identical re-definition of the helper from the fine-tuned one-shot
# section; only the `example` template below differs (instruction line first).
def prepend_example(text, example):
    """Insert `text` into the `{0}` slot of the demonstration template `example`."""
    return example.format(text)

# One-shot demonstration with the instruction placed before the example text;
# the prompt to classify is substituted for {0}.
example = """Classify the text into negative or positive sentiment.
Text: A waste of time. The plot is very boring and the actors are very bad.
sentiment: negative

{0}"""

In [None]:
print(generate_text(prepend_example(test_texts[0],example), 1, model, tokenizer))

In [None]:
preds, labels = classification([prepend_example(text,example) for text in test_texts[:100]], model, tokenizer)

In [None]:
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

In [None]:
#incorrect example
incorrect_example = """Text: A waste of time. The plot is very boring and the actors are very bad.
Classify the text into negative or positive sentiment.
sentiment: positive

{0}"""

In [None]:
#incorrect example
preds, labels = classification([prepend_example(text,incorrect_example) for text in test_texts[:100]], model, tokenizer)
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

### SST-2

In [None]:
preds, labels = classification(sst_texts, model, tokenizer)

In [None]:
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

In [None]:
#one-shot
# prepend_example requires the demonstration template as second argument.
print(generate_text(prepend_example(sst_texts[0], example), 1, model, tokenizer))

In [None]:
preds, labels = classification([prepend_example(text,example) for text in sst_texts], model, tokenizer)

In [None]:
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

In [None]:
#incorrect example
preds, labels = classification([prepend_example(text,incorrect_example) for text in sst_texts], model, tokenizer)
accuracy = np.mean([1 if pred==label else 0 for pred,label in zip(preds,labels)])
print(f"accuracy: {accuracy}")

On IMDB, giving the model one example increases the accuracy a bit, but the model is still far from being able to perform the task. On SST-2 the accuracy increases much more; this is expected, since shorter sentences are easier to classify. Interestingly, the accuracy increases even more if the model is provided with an incorrect example.

# Logit Lens

In [None]:
#fine-tuned
# model_path = output_dir
# gpt2 = load_model(model_path)
# gpt2_tokenizer = load_tokenizer(model_path)

In [None]:
#before fine-tuning
model_name = "gpt2"
gpt2 = transformers.GPT2LMHeadModel.from_pretrained(model_name)
gpt2_tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_name)


In [None]:
print(gpt2.base_model.h)

For each layer:
1. Normalize the output using the final layernorm
2. Compute the word distribution using the word embeddings
3. Find the most likely token

In [None]:
def compute_logits(
        prompt,
        model,
        tokenizer,
        ):
    """Logit lens: project the output of every transformer block onto the vocabulary.

    For each block in `model.base_model.h`, the block output is passed through
    the final layer norm (`model.base_model.ln_f`) and multiplied by the
    transposed token-embedding matrix (`model.base_model.wte`), giving one
    score per vocabulary entry and prompt position.

    Args:
        prompt: text to analyse.
        model: model exposing `base_model.h`, `base_model.ln_f` and
            `base_model.wte` (a GPT2LMHeadModel in this notebook).
        tokenizer: tokenizer whose `encode(prompt, return_tensors="pt")`
            returns the ids of a single sequence with a leading batch axis.

    Returns:
        per_layer_logits: tensor indexed (layer, token, vocab).
        per_layer_best_token: tensor indexed (layer, token) holding the
            arg-max vocabulary id of each position.
        Neither tensor tracks gradients.
    """
    final_layernorm = model.base_model.ln_f
    word_embeddings = model.base_model.wte
    tokenized_prompt = tokenizer.encode(prompt, return_tensors="pt")

    n_layers_model = len(model.base_model.h)  # 12 for GPT2
    outputs = [None] * n_layers_model

    def save_output_layer_hook(module, inputs, output, layer_index):
        # Element 0 of the block's return value is kept as its hidden states.
        outputs[layer_index] = output[0].detach()

    # One forward hook per block, each bound to its own slot in `outputs`.
    hooks = [
        block.register_forward_hook(partial(save_output_layer_hook, layer_index=i))
        for i, block in enumerate(model.base_model.h)
    ]

    # Always remove the hooks, even if the forward pass fails, so they do not
    # stay attached to the model for later calls.
    try:
        with torch.no_grad():
            model(tokenized_prompt)
    finally:
        for hook in hooks:
            hook.remove()

    per_layer_logits = []
    per_layer_best_token = []

    # The projection is analysis only: without no_grad the layer norm and
    # embedding parameters would make autograd record a graph over the whole
    # (layer, token, vocab) result.
    with torch.no_grad():
        for layer_output in outputs:
            normalized_output = final_layernorm(layer_output)  # (batch, tokens, hidden)

            # Dot product with every token embedding (an un-normalised score,
            # not a cosine similarity); [0] drops the batch axis.
            word_distribution = torch.matmul(normalized_output, word_embeddings.weight.T)[0]

            per_layer_logits.append(word_distribution)
            per_layer_best_token.append(torch.argmax(word_distribution, dim=-1))

    per_layer_logits = torch.stack(per_layer_logits)
    per_layer_best_token = torch.stack(per_layer_best_token)

    return per_layer_logits, per_layer_best_token


In [None]:
def get_logits_at_preds(logits, preds):
    """Pick, for every layer, the score of the final-layer token at each position.

    Args:
        logits: scores or probabilities indexed (layer, token, vocab).
        preds: 1-D vocabulary ids, one per token position (the final layer's
            best tokens).

    Returns:
        NumPy array indexed (layer, token): entry [l, j] is
        `logits[l, j, preds[j]]`.
    """
    columns = []
    for position in range(preds.shape[-1]):
        token_id = preds[position]
        columns.append(logits[:, position, token_id])
    return np.stack(columns, axis=-1)

def plot_logit_lens(
    layer_logits,
    layer_preds,
    layer_probs,
    tokenizer,
    input_ids,
    start_ix=0,
    layer_names=None,
    probability=False,
):
    """Draw the logit-lens heatmap for one prompt.

    Rows are layers (last layer on top), columns are prompt positions. Each
    cell is annotated with that layer's best token and coloured by the score
    that layer assigns to the token the final layer predicts at that position.

    Args:
        layer_logits: float tensor indexed (layer, token, vocab).
        layer_preds: int tensor indexed (layer, token), best token per layer.
        layer_probs: float tensor indexed (layer, token, vocab), softmax of
            `layer_logits`.
        tokenizer: tokenizer used to decode token ids into text.
        input_ids: int tensor of shape (1, tokens) holding the prompt ids.
        start_ix: index of the first prompt token used for the axis labels.
        layer_names: optional row labels in layer order; defaults to
            "Layer 0", "Layer 1", ...
        probability: colour by `layer_probs` (fixed 0..1 scale) when True,
            otherwise by `layer_logits` (5th..95th percentile scale).

    Returns:
        None; the figure is drawn on the current matplotlib state.
    """
    # Append an end-of-text id so the top axis ("true next token") has a label
    # for the last position. Use the tokenizer's own id when it defines one;
    # 50256 is the value this notebook used for GPT-2.
    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    if eos_token_id is None:
        eos_token_id = 50256
    eos_column = torch.tensor([[eos_token_id]], dtype=input_ids.dtype, device=input_ids.device)
    input_ids = torch.cat([input_ids, eos_column], dim=1)

    end_ix = start_ix + layer_logits.shape[1]

    final_preds = layer_preds[-1]  # best token of the final layer, one per position

    numeric_input = layer_probs if probability else layer_logits

    # Score each layer gives to the final layer's token; array indexed (layer, token).
    to_show = get_logits_at_preds(numeric_input, final_preds)

    # Decode the predictions passed in as `layer_preds`, not a notebook-level
    # variable, so the annotations always belong to the data being plotted.
    aligned_texts = np.array(
        [[tokenizer.decode(x) for x in layer] for layer in layer_preds]
    )  # (layer, token)

    # Flip both arrays so the last layer is the top row of the heatmap.
    to_show = to_show[::-1]
    aligned_texts = aligned_texts[::-1]

    plt.figure(figsize=(1.5 * to_show.shape[1], 0.375 * to_show.shape[0]))

    plot_kwargs = {"annot": aligned_texts, "fmt": ""}

    if probability:
        plot_kwargs.update({"cmap": "Blues_r",
                            "vmin": 0,
                            "vmax": 1})
    else:
        # Clip the colour scale to the 5th..95th percentile of the shown values.
        vmin = np.percentile(to_show.reshape(-1), 5)
        vmax = np.percentile(to_show.reshape(-1), 95)

        plot_kwargs.update(
            {
                "cmap": "Blues",
                "vmin": vmin,
                "vmax": vmax,
            }
        )

    sns.heatmap(to_show, **plot_kwargs)

    ax = plt.gca()
    input_tokens_str = np.array([tokenizer.decode(x) for x in input_ids[0]])

    if layer_names is None:
        layer_names = ["Layer {}".format(n) for n in range(to_show.shape[0])]
    ylabels = layer_names[::-1]  # same top-to-bottom order as the flipped rows
    ax.set_yticklabels(ylabels, rotation=0)

    ax_top = ax.twiny()  # second x axis sharing the y axis

    # Centre one tick per column on the twin axis; otherwise its ticks start at 0.
    padw = 0.5 / to_show.shape[1]
    ax_top.set_xticks(np.linspace(padw, 1 - padw, to_show.shape[1]))

    ax_inputs = ax
    ax_targets = ax_top

    # Bottom axis: the input token at each position.
    ax_inputs.set_xticklabels(input_tokens_str[start_ix:end_ix], rotation=0)

    # Top axis: the actual next token, starred where the final layer
    # (row 0 after the flip) predicted it.
    starred = [
        "* " + true if pred == true else " " + true
        for pred, true in zip(
            aligned_texts[0], input_tokens_str[start_ix + 1 : end_ix + 1]
        )
    ]
    ax_targets.set_xticklabels(starred, rotation=0)

In [None]:
def logit_lens(prompt,model,tokenizer):
  """Compute the per-layer logits for `prompt` and plot them as probabilities."""
  layer_logits, layer_best_tokens = compute_logits(prompt, model, tokenizer)
  plot_logit_lens(
    layer_logits=layer_logits.detach(),
    layer_preds=layer_best_tokens.detach(),
    layer_probs=layer_logits.softmax(dim=-1).detach(),
    tokenizer=tokenizer,
    input_ids=tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
  )

## Before fine-tuning

In [None]:
model_name = "gpt2"
gpt2 = transformers.GPT2LMHeadModel.from_pretrained(model_name)
gpt2_tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_name)


In [None]:
zero_shot = sst_texts[42]
# Build the one-shot prompt from the zero-shot one; `prompt` is only assigned
# in later cells, so referencing it here fails on a fresh kernel.
one_shot = prepend_example(zero_shot, example)

In [None]:
prompt = zero_shot
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
prompt = one_shot
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
prompt = """Classify the text into negative or positive sentiment.
Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: positive"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
prompt = """Classify the text into negative or positive sentiment.
Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

Let's see if something changes if we just invert the order of the instruction and the text in the prompt:

In [None]:
prompt = """Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Classify the text into negative or positive sentiment.
Sentiment: positive"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
prompt = """Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Classify the text into negative or positive sentiment.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

Logit lens lets us see that the model already at some layers produces the correct answer for the sentiment analysis task! Still, it's not able to fully understand the task without examples.

In [None]:
#one-shot:
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: positive"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
#one-shot:
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

We can see that one example is enough for the model to understand the task and sometimes it gives us the correct answer. The model keeps guessing the correct sentiment in some intermediate layers.

In [None]:
# 2 examples
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# 2 examples, inverted order
prompt = """Classify the text into negative or positive sentiment.
Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: A waste of time.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

The model has surely understood what it's being asked from the examples, but keeps giving the wrong answer. But logit lens lets us see that the model is more uncertain than when provided with only one example, and shifts between the correct and the incorrect label in the last layers.

In [None]:
# 3 examples
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: Soooo boooring
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

Three examples still do not seem to be enough for the model to output the correct answer. Logit lens shows that the model is still uncertain about it; let's see whether a fourth example helps.

In [None]:
#4 examples
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: Soooo boooring
Sentiment: negative

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

Now the model is more sure of the correct answer.

## Incorrect examples

In [None]:
#one incorrect example
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

With an incorrect example the model is able to predict the correct label, contrary to what happens with a correct example. This is evidence that the model may be simply guessing.

In [None]:
# 2 incorrect examples
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

The model performs essentially as it did with 2 correct examples.

In [None]:
# 3 incorrect examples
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: Soooo boooring
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
#4 incorrect examples
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: Soooo boooring
Sentiment: positive

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

Incorrect examples have no effect on the model.

# Fine-tuned gpt2

In [None]:
model_path = output_dir
gpt2 = load_model(model_path)
gpt2_tokenizer = load_tokenizer(model_path)

In [None]:
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
prompt = """Classify the text into negative or positive sentiment.
Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

### With incorrect examples

In [None]:
#one incorrect example
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens for the current `prompt`, coloured by probability.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    layer_logits=per_layer_logits.detach(),
    layer_preds=per_layer_best_token.detach(),
    layer_probs=per_layer_logits.softmax(dim=-1).detach(),
    tokenizer=gpt2_tokenizer,
    input_ids=gpt2_tokenizer.encode(prompt, return_tensors = "pt"),
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# Control for the previous prompt: same demonstration and query, but the
# demonstration now carries its correct label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the prompt with one correct example,
# to be compared with the one-incorrect-example plot above.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

When the text is preceded by an incorrect example, logit lens lets us see that the model is more uncertain about its prediction, even if it remains correct.

In [None]:
# 2 incorrect examples: both demonstrations carry the opposite of their true
# sentiment; the query review keeps its true label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the prompt with two incorrect examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# Control for the previous prompt: the same two demonstrations, now with their
# correct labels.
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the prompt with two correct examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# 3 incorrect examples: every demonstration carries the opposite of its true
# sentiment; the query review keeps its true label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: Soooo boooring
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the prompt with three incorrect examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# Control for the previous prompt: the same three demonstrations, now with
# their correct labels.
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: Soooo boooring
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the prompt with three correct examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# 3 incorrect examples (a different demonstration set from the 3-example prompt above).
# NOTE(review): this cell was originally labelled "4 incorrect examples", but the
# prompt contains only three demonstrations before the query — add a fourth if
# four were intended. The prompt also starts with "Instruction: ", unlike the
# prompts earlier in this section, so the plots are not directly comparable.
prompt = """Instruction: Classify the text into negative or positive sentiment.
Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: negative

Text: A waste of time.
Sentiment: positive

Text: Soooo boooring
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the "Instruction:"-prefixed prompt with
# incorrectly labelled demonstrations defined in the previous cell.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# 3 correct examples: control for the previous "Instruction:"-prefixed prompt,
# with the same demonstrations carrying their correct labels.
# NOTE(review): this cell was originally labelled "4 correct examples", but the
# prompt contains only three demonstrations before the query — add a fourth if
# four were intended.
prompt = """Instruction: Classify the text into negative or positive sentiment.
Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: positive

Text: A waste of time.
Sentiment: negative

Text: Soooo boooring
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the "Instruction:"-prefixed prompt with
# correctly labelled demonstrations defined in the previous cell.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

When the text is preceded by incorrect examples, logit lens lets us see that the model is more uncertain about its prediction, even if it remains correct.

In [None]:
# 5 incorrect examples: all five demonstrations carry the opposite of their
# true sentiment; the query review keeps its true label ("negative").
# NOTE(review): "Higly recommeded!" is misspelled inside the prompt. It is left
# untouched because changing it changes the tokens the model sees — confirm
# whether the misspelling is intentional.
prompt = """Instruction: Classify the text into negative or positive sentiment.
Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: negative

Text: A waste of time.
Sentiment: positive

Text: Soooo boooring
Sentiment: positive

Text: Higly recommeded!
Sentiment: negative

Text: I could rewatch this film 1000 times and still enjoy it!
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the fine-tuned model for the prompt with five incorrect examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, gpt2, gpt2_tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    gpt2_tokenizer,
    gpt2_tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

No matter how many incorrect examples we give to the fine-tuned model, it still robustly predicts the correct label.

# gpt2-large

In [None]:
# Load the pretrained (not fine-tuned) gpt2-large checkpoint and its tokenizer.
# `model` / `tokenizer` are rebound by the later gpt2-medium and distilgpt2
# sections, so the cells below depend on being run in notebook order.
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2-large')
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2-large")

In [None]:
# Bare expression: the notebook displays the repr of the loaded model as the cell output.
model

In [None]:
# Accuracy of the currently loaded `model` without any prepended example (IMDB
# and SST2), followed by one-shot accuracy on IMDB. Each result is printed with
# its own label so the three numbers in this cell's output can be told apart
# (originally all three were printed as an identical "accuracy: ..." line).
# `classification`, `prepend_example` and `example` come from earlier cells.

# classification -- IMDB (first 100 test texts)
preds, labels = classification(test_texts[:100], model, tokenizer)
accuracy = np.mean([1 if pred == label else 0 for pred, label in zip(preds, labels)])
print(f"IMDB accuracy: {accuracy}")

# classification -- SST2 (validation split)
preds, labels = classification(sst_texts, model, tokenizer)
accuracy = np.mean([1 if pred == label else 0 for pred, label in zip(preds, labels)])
print(f"SST2 accuracy: {accuracy}")

# one-shot -- IMDB: `example` is prepended to each of the first 100 test texts
preds, labels = classification([prepend_example(text, example) for text in test_texts[:100]], model, tokenizer)
accuracy = np.mean([1 if pred == label else 0 for pred, label in zip(preds, labels)])
print(f"IMDB one-shot accuracy: {accuracy}")


In [None]:
# one-shot -- SST2: `example` (defined in an earlier cell) is prepended to every
# SST2 validation text before classification. The printed line is labelled so
# it is not confused with the other accuracy figures in this section.
preds, labels = classification([prepend_example(text, example) for text in sst_texts], model, tokenizer)
accuracy = np.mean([1 if pred == label else 0 for pred, label in zip(preds, labels)])
print(f"SST2 one-shot accuracy: {accuracy}")

In [None]:
# One-shot accuracy when the prepended demonstration is `incorrect_example`
# (defined in an earlier cell). Both results are printed with distinct labels;
# originally both were printed as an identical "accuracy: ..." line.

# incorrect example -- IMDB (first 100 test texts)
preds, labels = classification([prepend_example(text, incorrect_example) for text in test_texts[:100]], model, tokenizer)
accuracy = np.mean([1 if pred == label else 0 for pred, label in zip(preds, labels)])
print(f"IMDB accuracy (incorrect example): {accuracy}")

# incorrect example -- SST2 (validation split)
preds, labels = classification([prepend_example(text, incorrect_example) for text in sst_texts], model, tokenizer)
accuracy = np.mean([1 if pred == label else 0 for pred, label in zip(preds, labels)])
print(f"SST2 accuracy (incorrect example): {accuracy}")

In [None]:
# 4 correct examples: four demonstrations with their correct labels, followed
# by the query review and its true label ("negative").
# NOTE(review): the first demonstration here ("A waste of time. The plot is very
# boring and the actors are very bad.") is longer than the "A waste of time."
# used in the gpt2-medium / distilgpt2 prompts — confirm this is intended.
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time. The plot is very boring and the actors are very bad.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: Soooo boooring
Sentiment: negative

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the currently loaded `model` (gpt2-large when the notebook is
# run in order) for the prompt with four correct examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, model, tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    tokenizer,
    tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# 4 incorrect examples: the same four demonstrations as the previous prompt,
# each with its label flipped; the query review keeps its true label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time. The plot is very boring and the actors are very bad.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: Soooo boooring
Sentiment: positive

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the currently loaded `model` (gpt2-large when the notebook is
# run in order) for the prompt with four incorrect examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, model, tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    tokenizer,
    tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

We are starting to see the effect of *overthinking*: the model predicts the correct label at intermediate layers, but then, at the last layer, the prediction shifts to the incorrect one!


# gpt2-medium

In [None]:
# gpt2-medium: rebind `model` / `tokenizer` to the pretrained gpt2-medium
# checkpoint, replacing the gpt2-large objects loaded in the previous section.
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2-medium')
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2-medium")

In [None]:
# 4 correct examples: four demonstrations with their correct labels, followed
# by the query review and its true label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: Soooo boooring
Sentiment: negative

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the currently loaded `model` (gpt2-medium when the notebook is
# run in order) for the prompt with four correct examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, model, tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    tokenizer,
    tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# 4 incorrect examples: the same four demonstrations as the previous prompt,
# each with its label flipped; the query review keeps its true label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: Soooo boooring
Sentiment: positive

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the currently loaded `model` (gpt2-medium when the notebook is
# run in order) for the prompt with four incorrect examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, model, tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    tokenizer,
    tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

Incorrect examples make the model predict the wrong label, but there are intermediate layers where the model would have guessed correctly.


# Distil gpt2

In [None]:
# distilgpt2: rebind `model` / `tokenizer` to the pretrained distilgpt2
# checkpoint, replacing the gpt2-medium objects loaded in the previous section.
model = transformers.AutoModelForCausalLM.from_pretrained('distilgpt2')
tokenizer = transformers.AutoTokenizer.from_pretrained("distilgpt2")

In [None]:
# 4 correct examples: four demonstrations with their correct labels, followed
# by the query review and its true label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: negative

Text: I saw this movie with my friends and we all loved it.
Sentiment: positive

Text: Soooo boooring
Sentiment: negative

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: positive

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the currently loaded `model` (distilgpt2 when the notebook is
# run in order) for the prompt with four correct examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, model, tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    tokenizer,
    tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

In [None]:
# 4 incorrect examples: the same four demonstrations as the previous prompt,
# each with its label flipped; the query review keeps its true label ("negative").
prompt = """Classify the text into negative or positive sentiment.
Text: A waste of time.
Sentiment: positive

Text: I saw this movie with my friends and we all loved it.
Sentiment: negative

Text: Soooo boooring
Sentiment: positive

Text: The best movie I have ever seen. The plot is very interesting and the actors are very good.
Sentiment: negative

Text: The movie is not bad, but it is not good either. The plot is very boring and the actors are very bad.
Sentiment: negative"""

In [None]:
# Logit lens on the currently loaded `model` (distilgpt2 when the notebook is
# run in order) for the prompt with four incorrect examples.
per_layer_logits, per_layer_best_token = compute_logits(prompt, model, tokenizer)
plot_logit_lens(
    per_layer_logits.detach(),
    per_layer_best_token.detach(),
    per_layer_logits.softmax(dim=-1).detach(),
    tokenizer,
    tokenizer.encode(prompt, return_tensors = "pt"), #input_ids
    start_ix=0,
    layer_names=None,
    probability=True,
)

Surprisingly, the incorrect examples make the model more confident of the right answer.