In [2]:
from datasets import load_dataset

from transformers import GPT2LMHeadModel, GPT2Tokenizer

import torch

In [3]:
huggingface_dataset_name = "squad"
dataset = load_dataset(huggingface_dataset_name)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
example_indices = [40, 200]

dash_line = '-'.join('' for x in range(100))

for i, index in enumerate(example_indices):
    print(dash_line)
    print(f"Example {i + 1}")

    print(dash_line)
    print(f"context: \n{dataset['train'][index]['context']}")
    print(f"question: \n{dataset['train'][index]['question']}")
    print(f"answers: \n{dataset['train'][index]['answers']['text'][0]}")


---------------------------------------------------------------------------------------------------
Example 1
---------------------------------------------------------------------------------------------------
context: 
Notre Dame is known for its competitive admissions, with the incoming class enrolling in fall 2015 admitting 3,577 from a pool of 18,156 (19.7%). The academic profile of the enrolled class continues to rate among the top 10 to 15 in the nation for national research universities. The university practices a non-restrictive early action policy that allows admitted students to consider admission to Notre Dame as well as any other colleges to which they were accepted. 1,400 of the 3,577 (39.1%) were admitted under the early action plan. Admitted students came from 1,311 high schools and the average student traveled more than 750 miles to Notre Dame, making it arguably the most representative university in the United States. While all entering students begin in the College of

In [5]:
model_name = "gpt2-medium"
original_model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### 零样本微调

In [12]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

for i, index in enumerate(example_indices):
    context = dataset['train'][index]['context']
    question = dataset['train'][index]['question']
    answer = dataset['train'][index]['answers']['text'][0]

    prompt = f"""
Given the following context and an certain answer,
please generate a certain question corresponding to the context and the answer:
Context: {context}
Answer: {answer}
Question:
"""

    input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
    pad_token_id = tokenizer.eos_token_id

    outputs = tokenizer.decode(original_model.generate(input_ids, attention_mask=attention_mask, max_length=512, num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=pad_token_id)[0], skip_special_tokens=True)

    print(dash_line)
    print(f"Example {i+1}")
    print(f"context: \n{context}")
    print(f"question: \n{question}")
    print(f"answer: \n{answer}")
    print(f"Generated question: \n{outputs}")

---------------------------------------------------------------------------------------------------
Example 1
context: 
Notre Dame is known for its competitive admissions, with the incoming class enrolling in fall 2015 admitting 3,577 from a pool of 18,156 (19.7%). The academic profile of the enrolled class continues to rate among the top 10 to 15 in the nation for national research universities. The university practices a non-restrictive early action policy that allows admitted students to consider admission to Notre Dame as well as any other colleges to which they were accepted. 1,400 of the 3,577 (39.1%) were admitted under the early action plan. Admitted students came from 1,311 high schools and the average student traveled more than 750 miles to Notre Dame, making it arguably the most representative university in the United States. While all entering students begin in the College of the First Year of Studies, 25% have indicated they plan to study in the liberal arts or social scie

### 少样本微调

In [15]:
def make_prompt(example_indices_full, example_index_to_questions):
    prompt = """
Given the following context and an certain answer,
please generate a question corresponding to the context and the answer:
"""

    for index in example_indices_full:
        context = dataset['train'][index]['context']
        question = dataset['train'][index]['question']
        answer = dataset['train'][index]['answers']['text'][0]
        prompt += f"""
Context: {context}.
Answer: {answer}.
Question: {question}.
"""

    context = dataset['train'][example_index_to_questions]['context']
    question = dataset['train'][example_index_to_questions]['question']
    answer = dataset['train'][example_index_to_questions]['answers']['text'][0]
    prompt += f"""
Context: {context}.
Answer: {answer}.
Question:
"""

    return prompt

example_indices_full = [10, 20]
example_index_to_questions = 30
few_shot_prompt = make_prompt(example_indices_full, example_index_to_questions)
print(few_shot_prompt)


Given the following context and an certain answer,
please generate a question corresponding to the context and the answer:

Context: The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching..
Answer: Rome.
Question: Where is the headquarters of the Congregation of the Holy Cross?.

Context: All of Notre Dame's undergraduate students are a part of one

In [16]:
context = dataset['train'][example_index_to_questions]['context']
question = dataset['train'][example_index_to_questions]['question']
answer = dataset['train'][example_index_to_questions]['answers']['text'][0]

input_ids = tokenizer.encode(few_shot_prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
pad_token_id = tokenizer.eos_token_id
outputs = tokenizer.decode(original_model.generate(input_ids, attention_mask=attention_mask, max_length=512, num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=pad_token_id)[0], skip_special_tokens=True)

print(dash_line)
print(f"context: \n{context}")
print(f"question: \n{question}")
print(f"answer: \n{answer}")
print(f"Generated question: \n{outputs}")

---------------------------------------------------------------------------------------------------
context: 
The Joan B. Kroc Institute for International Peace Studies at the University of Notre Dame is dedicated to research, education and outreach on the causes of violent conflict and the conditions for sustainable peace. It offers PhD, Master's, and undergraduate degrees in peace studies. It was founded in 1986 through the donations of Joan B. Kroc, the widow of McDonald's owner Ray Kroc. The institute was inspired by the vision of the Rev. Theodore M. Hesburgh CSC, President Emeritus of the University of Notre Dame. The institute has contributed to international policy discussions about peace building practices.
question: 
What is the title of Notre Dame's Theodore Hesburgh?
answer: 
President Emeritus of the University of Notre Dame
Generated question: 

Given the following context and an certain answer,
please generate a question corresponding to the context and the answer:

Cont