In [41]:
import json
# Load the paragraph records. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding keeps the read independent of
# the platform's default locale.
with open('GPT-3_paragraphs.json', encoding='utf-8') as f:
    paragraphs = json.load(f)

In [55]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Question-generation (QG) checkpoint. It is bound to dedicated names because
# other cells in this notebook rebind the generic `tokenizer` / `model` globals
# to question-answering models; get_question must keep using the seq2seq model
# no matter which cell ran last.
QG_MODEL_PATH = "voidful/bart-eqg-question-generator"
qg_tokenizer = AutoTokenizer.from_pretrained(QG_MODEL_PATH)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_MODEL_PATH)

# Keep the generic global names bound as well, exactly as this cell always did.
tokenizer = qg_tokenizer
model = qg_model


def get_question(context, max_length=1024):
    """Generate a question for ``context`` with the QG seq2seq model.

    Args:
        context: Passage text to generate a question from.
        max_length: Forwarded to ``generate`` as the maximum length of the
            generated sequence.

    Returns:
        The decoded text of the first generated sequence. ``decode`` is called
        with its default settings, so special-token markers (e.g. ``</s>``) are
        not removed here; callers are expected to clean them.
    """
    # truncation=True clips over-long passages to the tokenizer's configured
    # maximum length instead of passing an arbitrarily long input to the model.
    features = qg_tokenizer([context], return_tensors='pt', truncation=True)

    output = qg_model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
    )

    return qg_tokenizer.decode(output[0])

# Sample passage used to try out the question generator.
context = "In the same 1936 paper in which he introduced the universal computing\nmachine, Alan Turing also provided an answer to this question\nby introducing (and proving) that there are in fact problems that cannot be\ncomputed by a universal computing machine.\nThe problem that\nhe proved undecidable, using proof techniques almost identical to those\ndeveloped for similar problems in the 1880s, is now known as { the\nhalting problem}."

# Remove the special-token markers explicitly. str.strip('</s>') treats its
# argument as a *set of characters* ('<', '/', 's', '>'), so it would also eat a
# genuine leading or trailing "s" from the generated question.
question = get_question(context).replace('</s>', '').replace('<s>', '').strip()
question

'what was the problem that Turing proved undecidable?'

In [38]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

# Extractive question answering demo with the "ainize/klue-bert-base-mrc"
# checkpoint, run on the notebook-level `context` and `question`.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names, replacing
# whatever an earlier cell bound to them.
tokenizer = AutoTokenizer.from_pretrained("ainize/klue-bert-base-mrc")
model = AutoModelForQuestionAnswering.from_pretrained("ainize/klue-bert-base-mrc")

# Encode the (context, question) pair, truncated/padded to 512 tokens, then wrap
# every field in a batch dimension of size 1.
encodings = tokenizer(context, question, max_length=512, truncation=True,
                      padding="max_length", return_token_type_ids=False)
encodings = {key: torch.tensor([val]) for key, val in encodings.items()}

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]

# Inference only: disable autograd so no computation graph is recorded for the
# forward pass.
with torch.no_grad():
    pred = model(input_ids, attention_mask=attention_mask)

start_logits, end_logits = pred.start_logits, pred.end_logits

# Highest-scoring start and end token positions (each a one-element tensor).
token_start_index, token_end_index = start_logits.argmax(dim=-1), end_logits.argmax(dim=-1)

# Inclusive token span; the slice is empty when the predicted end precedes the start.
pred_ids = input_ids[0][token_start_index: token_end_index + 1]

prediction = tokenizer.decode(pred_ids)
prediction

'Alan Turing'

In [47]:
_AINIZE_MODEL_PATH = "ainize/klue-bert-base-mrc"
_ainize_cache = {}


def _load_ainize():
    """Return the ainize (tokenizer, model) pair, loading it only on first use."""
    if "model" not in _ainize_cache:
        _ainize_cache["tokenizer"] = AutoTokenizer.from_pretrained(_AINIZE_MODEL_PATH)
        _ainize_cache["model"] = AutoModelForQuestionAnswering.from_pretrained(_AINIZE_MODEL_PATH)
    return _ainize_cache["tokenizer"], _ainize_cache["model"]


def ainze(context, question):
    """Answer ``question`` from ``context`` with the ainize MRC checkpoint.

    Args:
        context: Passage text in which to look for the answer.
        question: Question text, encoded together with ``context``.

    Returns:
        The decoded text of the token span running from the highest-scoring
        start position to the highest-scoring end position (inclusive). The
        span is taken from the encoded (context, question) pair; when the
        predicted end precedes the start, the span is empty.
    """
    # Reuse the loaded checkpoint: calling from_pretrained on every invocation
    # repeats the whole load once per paragraph when used inside a loop.
    qa_tokenizer, qa_model = _load_ainize()

    encodings = qa_tokenizer(context, question, max_length=512, truncation=True,
                             padding="max_length", return_token_type_ids=False)
    # Add a batch dimension of size 1 to every encoded field.
    encodings = {key: torch.tensor([val]) for key, val in encodings.items()}

    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        pred = qa_model(input_ids, attention_mask=attention_mask)

    # Plain ints make the slice bounds explicit (batch size is 1, so each argmax
    # holds exactly one element).
    token_start_index = int(pred.start_logits.argmax(dim=-1))
    token_end_index = int(pred.end_logits.argmax(dim=-1))

    pred_ids = input_ids[0][token_start_index: token_end_index + 1]

    return qa_tokenizer.decode(pred_ids)

In [48]:
# Load Transformers library
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Load the fine-tuned MRC model from the HuggingFace Model Hub.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names, replacing
# whatever an earlier cell bound to them.
HUGGINGFACE_MODEL_PATH = "bespin-global/klue-bert-base-mrc"
tokenizer = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL_PATH)
model = AutoModelForQuestionAnswering.from_pretrained(HUGGINGFACE_MODEL_PATH)

# Encoding: the (context, question) pair is truncated/padded to 512 tokens and
# every field is wrapped in a batch dimension of size 1.
encodings = tokenizer(context, question,
                      max_length=512,
                      truncation=True,
                      padding="max_length",
                      return_token_type_ids=False
                      )
encodings = {key: torch.tensor([val]) for key, val in encodings.items()}
input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]

# Predict. Inference only, so autograd is disabled and no computation graph is
# recorded for the forward pass.
with torch.no_grad():
    pred = model(input_ids, attention_mask=attention_mask)

start_logits, end_logits = pred.start_logits, pred.end_logits
# Highest-scoring start and end token positions (each a one-element tensor).
token_start_index, token_end_index = start_logits.argmax(dim=-1), end_logits.argmax(dim=-1)
# Inclusive token span; the slice is empty when the predicted end precedes the start.
pred_ids = input_ids[0][token_start_index: token_end_index + 1]

# Decoding
prediction = tokenizer.decode(pred_ids)
prediction

'1936 paper in which he introduced the universal computing machine, Alan Turing also provided an answer to this question by introducing ( and proving ) that there are in fact problems that cannot be computed by a universal computing machine. The problem that he proved undecidable, using proof techniques almost identical to those developed for similar problems in the 1880s'

In [49]:
_BESPIN_MODEL_PATH = "bespin-global/klue-bert-base-mrc"
_bespin_cache = {}


def _load_bespin():
    """Return the bespin (tokenizer, model) pair, loading it only on first use."""
    if "model" not in _bespin_cache:
        _bespin_cache["tokenizer"] = AutoTokenizer.from_pretrained(_BESPIN_MODEL_PATH)
        _bespin_cache["model"] = AutoModelForQuestionAnswering.from_pretrained(_BESPIN_MODEL_PATH)
    return _bespin_cache["tokenizer"], _bespin_cache["model"]


def bespin(context, question):
    """Answer ``question`` from ``context`` with the bespin-global MRC checkpoint.

    Args:
        context: Passage text in which to look for the answer.
        question: Question text, encoded together with ``context``.

    Returns:
        The decoded text of the token span running from the highest-scoring
        start position to the highest-scoring end position (inclusive). The
        span is taken from the encoded (context, question) pair; when the
        predicted end precedes the start, the span is empty.
    """
    # Reuse the loaded checkpoint: calling from_pretrained on every invocation
    # repeats the whole load once per paragraph when used inside a loop.
    qa_tokenizer, qa_model = _load_bespin()

    # Encoding
    encodings = qa_tokenizer(context, question,
                             max_length=512,
                             truncation=True,
                             padding="max_length",
                             return_token_type_ids=False
                             )
    # Add a batch dimension of size 1 to every encoded field.
    encodings = {key: torch.tensor([val]) for key, val in encodings.items()}
    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    # Predict (inference only: no autograd graph is needed).
    with torch.no_grad():
        pred = qa_model(input_ids, attention_mask=attention_mask)

    # Plain ints make the slice bounds explicit (batch size is 1, so each argmax
    # holds exactly one element).
    token_start_index = int(pred.start_logits.argmax(dim=-1))
    token_end_index = int(pred.end_logits.argmax(dim=-1))
    pred_ids = input_ids[0][token_start_index: token_end_index + 1]

    # Decoding
    return qa_tokenizer.decode(pred_ids)

In [58]:
# Build one record per paragraph for each QA model: a generated question plus
# the answer that model extracts from the same passage.
bespin_data = []
ainze_data = []
for row in paragraphs:
    context = row['positive_ctxs']['text']
    title = row['positive_ctxs']['title']

    # Remove the special-token markers explicitly. str.strip('</s>') treats its
    # argument as a *set of characters* ('<', '/', 's', '>'), so it would also
    # eat a genuine leading or trailing "s" from the generated question.
    question = get_question(context).replace('</s>', '').replace('<s>', '').strip()

    bespin_answer = bespin(context, question)
    ainze_answer = ainze(context, question)

    # NOTE(review): the key 'quesiton' is misspelled. It is kept byte-identical
    # because consumers of the output may read this exact key; confirm before
    # renaming it to 'question'.
    # Each record gets its own nested 'positive_ctxs' dict so the two lists
    # never share mutable state.
    ainze_data.append({
        'positive_ctxs': {'text': context, 'title': title},
        'quesiton': question,
        'answer': ainze_answer,
    })
    bespin_data.append({
        'positive_ctxs': {'text': context, 'title': title},
        'quesiton': question,
        'answer': bespin_answer,
    })

In [60]:
# with open('bespin.json', 'w', encoding='utf-8') as f:
#     json.dump(bespin_data, f, ensure_ascii=False, indent=4) 

In [61]:
# with open('ainze.json', 'w', encoding='utf-8') as f:
#     json.dump(ainze_data, f, ensure_ascii=False, indent=4) 