In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [2]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collec

In [3]:
# TriviaQA

from datasets import load_dataset
import random

class QABenchmark:
    def __init__(self):
        self.dataset = []

    def sample(self, k: int):
        return random.sample(self.dataset, min(k, len(self.dataset)))

    def first_k(self, k: int):
        return self.dataset[:k]


class TriviaQA(QABenchmark):
    def __init__(self, split='validation'):
        super().__init__()
        loaded_dataset = load_dataset('trivia_qa', 'rc', split=split)
        self.dataset = [(example['question'], list(set([example['answer']['value']] + example['answer']['aliases'])))
                        for example in loaded_dataset]


class Lama(QABenchmark):
    def __init__(self, split: str = 'train'):
        super().__init__()
        loaded_dataset = load_dataset('lama', split=split)
        self.dataset = [(example['masked_sentence'][:-7], example['obj_label']) for example in loaded_dataset
                        if example['masked_sentence'][-7:] == '[MASK].']


def get_optional_in_context_demonstrations_for_triviaqa(size: int = 200):
  trivia_qa_train_set = TriviaQA(split='train')
  return trivia_qa_train_set.first_k(k=size)


def get_triviaqa_validation_set(size: int = 100):
  trivia_qa_train_set = TriviaQA(split='validation')
  return trivia_qa_train_set.sample(k=size)


In [4]:
# GPT2

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def print_output(output: str):
    print("Output:\n" + 100 * '-')
    print(output)


def process_generation(text: str):
    if not text:
        return text
    while text and text[0] in ['\n', ':', ' ', ',', ';']:
        text = text[1:]
    return text


def load_gpt2(model_name: str = 'gpt2-medium'):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
    return model, tokenizer


model, tokenizer = load_gpt2()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


def sampling(input_text: str, max_length=50, temperature=0.7):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    input_ids_len = input_ids.shape[1]
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=input_ids_len + max_length,
        top_k=0,
        temperature=temperature,
    )
    return process_generation(tokenizer.decode(sample_output[0][input_ids_len:], skip_special_tokens=True))


def beam_search(input_text: str, max_length=20):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    input_ids_len = input_ids.shape[1]
    beam_output = model.generate(
        input_ids,
        max_length=input_ids_len + max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        # output_scores=True,
    )
    return process_generation(tokenizer.decode(beam_output[0][input_ids_len:], skip_special_tokens=True))

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# Evaluation

import pandas as pd

def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))


def check_answer_truthfulness(generated_answer, gold_answers):
    if isinstance(gold_answers, str):
        gold_answers = [gold_answers]
    normalized_generation = normalize_text(generated_answer)
    return any([normalize_text(answer) in normalized_generation for answer in gold_answers])

In [6]:
optional_in_context_demonstrations = get_optional_in_context_demonstrations_for_triviaqa(size=500)
validation_set = get_triviaqa_validation_set(size=200)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading and preparing dataset trivia_qa/rc to /root/.cache/huggingface/datasets/trivia_qa/rc/1.2.0/ee76d8a9403e71177e2a3fa7e414d1ee28a79a0970d9176f62f268798aa64b31...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.88G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/138384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17944 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17210 [00:00<?, ? examples/s]

Dataset trivia_qa downloaded and prepared to /root/.cache/huggingface/datasets/trivia_qa/rc/1.2.0/ee76d8a9403e71177e2a3fa7e414d1ee28a79a0970d9176f62f268798aa64b31. Subsequent calls will reuse this data.




In [26]:
# section 1 - fill in your code here
import pandas as pd

def create_qna_prompt(size: int = 0, inds: list[int] = []) -> str:
  if not size:
    qna = [optional_in_context_demonstrations[i] for i in inds]
  else:
    qna = random.sample(optional_in_context_demonstrations, size)
  prompt = ''
  for (question, answers) in qna:
    prompt += f"Question: {question}\n"
    prompt += f"Answer: {answers[0]}\n"
  return prompt

def add_question_to_prompt(prompt, question):
  return prompt + f"Question: {question}\nAnswer: "

def run_experiment_1(decoding_func) -> pd.DataFrame:
  res_dict = {"PromptSize": [], "Accuracy": []}
  for prompt_size in range(3,9):
    prompt = create_qna_prompt(prompt_size)
    accume_res = 0
    for i, (question, answers) in enumerate(validation_set):
      query_prompt = add_question_to_prompt(prompt, question)
      model_answer = decoding_func(query_prompt)
      accume_res += check_answer_truthfulness(model_answer, answers)
    res_dict["PromptSize"].append(prompt_size)
    res_dict["Accuracy"].append(accume_res/200)
    print(f"Promptsize: {prompt_size} Accuracy: {accume_res/200}")
  return pd.DataFrame(res_dict)

In [None]:
beam_search_df = run_experiment_1(beam_search)

In [42]:
beam_search_df

Unnamed: 0,PromptSize,Accuracy
0,3,0.095
1,4,0.07
2,5,0.07
3,6,0.115
4,7,0.095
5,8,0.1


In [None]:
sampling_df = run_experiment_1(sampling)

In [43]:
sampling_df

Unnamed: 0,PromptSize,Accuracy
0,3,0.065
1,4,0.055
2,5,0.04
3,6,0.1
4,7,0.07
5,8,0.07


In [29]:
from transformers import AutoTokenizer, AutoModel

def cls_pooling(model_output, attention_mask):
    return model_output[0][:,0]

bert_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-cls-token')
bert_model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-cls-token')


def encode_question(question: str):
  encoded_input = bert_tokenizer([question], padding=True, truncation=True, return_tensors='pt')

  with torch.no_grad():
      model_output = bert_model(**encoded_input)

  # Perform pooling. In this case, max pooling.
  sentence_embeddings = cls_pooling(model_output, encoded_input['attention_mask'])

  return sentence_embeddings

Downloading (…)okenizer_config.json:   0%|          | 0.00/395 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/623 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [30]:
# section 2 - fill in your code here
train_mat = torch.cat([encode_question(optional_in_context_demonstrations[i][0]) for i in range(len(optional_in_context_demonstrations))])
validate_mat = torch.cat([encode_question(validation_set[i][0]) for i in range(len(validation_set))])
sim_mat = torch.matmul(train_mat, validate_mat.T)
top_8 = torch.topk(sim_mat, 8, dim=0, largest=False)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [31]:
def run_experiment_2(decoding_func) -> pd.DataFrame:
  res_dict = {"Accuracy": []}
  accume_res = 0
  all_inds = top_8[1].T
  for i, (question, answers) in enumerate(validation_set):
    prompt = create_qna_prompt(inds = all_inds[i])
    model_answer = decoding_func(add_question_to_prompt(prompt, question))
    accume_res += check_answer_truthfulness(model_answer, answers)
  res_dict["Accuracy"].append(accume_res/200)
  print(f"Accuracy: {accume_res/200}")
  return pd.DataFrame(res_dict)

In [None]:
ex_2_beam_search_df = run_experiment_2(beam_search)

In [46]:
ex_2_beam_search_df

Unnamed: 0,Accuracy
0,0.1


In [None]:
ex_2_sample_df = run_experiment_2(sampling)

In [45]:
ex_2_sample_df

Unnamed: 0,Accuracy
0,0.065


In [34]:
lama_validation_set = Lama().sample(200)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading metadata: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]



Downloading and preparing dataset lama/trex to /root/.cache/huggingface/datasets/lama/trex/1.1.0/430016dd70224564ad385a96e0e4a3f88aeb5beaf4e34a8cf65b390fbc83aed7...


Downloading data:   0%|          | 0.00/74.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1304391 [00:00<?, ? examples/s]

Dataset lama downloaded and prepared to /root/.cache/huggingface/datasets/lama/trex/1.1.0/430016dd70224564ad385a96e0e4a3f88aeb5beaf4e34a8cf65b390fbc83aed7. Subsequent calls will reuse this data.


In [35]:
# section 3 - fill in your code here
def run_experiment_3(decoding_func, validation) -> pd.DataFrame:
  res_dict = {"Accuracy": []}
  accume_res = 0
  for i, (question, answers) in enumerate(validation):
    model_answer = decoding_func(question)
    accume_res += check_answer_truthfulness(model_answer, answers)
  res_dict["Accuracy"].append(accume_res/200)
  print(f"Accuracy: {accume_res/200}")
  return pd.DataFrame(res_dict)


In [None]:
zero_shot_beam_search_df = run_experiment_3(beam_search, validation_set)

In [47]:
zero_shot_beam_search_df

Unnamed: 0,Accuracy
0,0.065


In [None]:
zero_shot_sampling_df = run_experiment_3(sampling, validation_set)

In [48]:
zero_shot_sampling_df

Unnamed: 0,Accuracy
0,0.1


In [None]:
zero_shot_beam_search_lama_df = run_experiment_3(beam_search, lama_validation_set)

In [49]:
zero_shot_beam_search_lama_df

Unnamed: 0,Accuracy
0,0.29


In [None]:
zero_shot_sampling_lama_df = run_experiment_3(sampling, lama_validation_set)

In [50]:
zero_shot_sampling_lama_df

Unnamed: 0,Accuracy
0,0.255
