In [78]:
from importlib import reload
import data.loader as loader_module
import evaluation.predict as predict_module
import evaluation.eval as eval_module
import data.squad_loader as squad_loader_module
import json


# Re-execute the project modules (same order as before) so that edits made on
# disk are picked up without restarting the kernel; the `from ... import`
# lines that follow then rebind the names to the refreshed definitions.
for _module in (
    predict_module,
    eval_module,
    loader_module,
    squad_loader_module,
):
    reload(_module)

from evaluation.predict import ModelPredictor
from evaluation.eval import evaluate_generation, evaluate_classification, get_confusion_matrix
from data.loader import Dataset, get_dataset
from data.squad_loader import get_squad_dataset

## Load dataset

In [79]:
# Draw a small SQuAD sample that mixes answerable and unanswerable questions.
# NOTE(review): the leading positional 10 is presumably the sample count
# (len(train_set) prints 10 below) — confirm against get_squad_dataset.
question_types = ("answerable", "unanswerable_ner", "unanswerable_base")
type_fractions = (0.34, 0.33, 0.33)
train_set, test_set = get_squad_dataset(
    10,
    split_percentage=1.0,
    percent_types=type_fractions,
    types=question_types,
)

# Quick look at the sample: its size, the last example, the last two records.
print(len(train_set))
print(train_set[-1])
train_set.data[-2:]


10
{'context': 'The Districts of Germany (Kreise) are administrative districts, and every state except the city-states of Berlin, Hamburg, and Bremen consists of "rural districts" (Landkreise), District-free Towns/Cities (Kreisfreie Städte, in Baden-Württemberg also called "urban districts", or Stadtkreise), cities that are districts in their own right, or local associations of a special kind (Kommunalverbände besonderer Art), see below. The state Free Hanseatic City of Bremen consists of two urban districts, while Berlin and Hamburg are states and urban districts at the same time.', 'plausible_answers': [{'text': 'urban districts', 'answer_start': 259}], 'question': 'What are towns and cities with districts called?', 'id': '5a516e40ce860b001aa3fdaa', 'answers': [], 'is_impossible': True}


[{'context': 'Both Roger Williams and John Clarke, his compatriot and coworker for religious freedom, are variously credited as founding the earliest Baptist church in North America. In 1639, Williams established a Baptist church in Providence, Rhode Island, and Clarke began a Baptist church in Newport, Rhode Island. According to a Baptist historian who has researched the matter extensively, "There is much debate over the centuries as to whether the Providence or Newport church deserved the place of \'first\' Baptist congregation in America. Exact records for both congregations are lacking."',
  'question': 'When was the first American Baptist church established?',
  'id': '5727dcc53acd2414000dee4a',
  'answers': [{'text': '1639', 'answer_start': 172}],
  'is_impossible': False},
 {'context': 'The Districts of Germany (Kreise) are administrative districts, and every state except the city-states of Berlin, Hamburg, and Bremen consists of "rural districts" (Landkreise), District-free Tow

In [105]:
# Print at most the first ten raw records of the training sample.
preview_count = min(len(train_set), 10)
for record_index in range(preview_count):
    print(train_set.data[record_index])


{'context': 'New Haven Harbor is home to the Port of New Haven, a deep-water seaport with three berths capable of hosting vessels and barges as well as the facilities required to handle break bulk cargo. The port has the capacity to load 200 trucks a day from the ground or via loading docks. Rail transportation access is available, with a private switch engine for yard movements and private siding for loading and unloading. Approximately 400,000 square feet (40,000 m2) of inside storage and 50 acres (200,000 m2) of outside storage are available at the site. Five shore cranes with a 250-ton capacity and 26 forklifts, each with a 26-ton capacity, are also available.', 'question': 'How many trucks does the Port of New Haven have the daily capacity to load?', 'id': '5727e04eff5b5019007d974b', 'answers': [{'text': '200', 'answer_start': 225}], 'is_impossible': False}
{'context': 'Although official German air doctrine did target civilian morale, it did not espouse the attacking of civilians 

## Predict

In [75]:
# Wrap the completion model under test in the project's ModelPredictor.
model_name = "gpt-3.5-turbo-instruct"
model = ModelPredictor(model_name)

In [101]:
def extract_unknown(text):
    """Post-process a raw completion for the generation evaluation.

    Args:
        text: Raw completion string produced by the model.

    Returns:
        "" when the completion contains the phrase "no answer" (matched
        case-insensitively, anywhere in the text), because the evaluation
        process wants an empty string for predictions of unanswerable
        questions; otherwise the completion unchanged.
    """
    declined_to_answer = "no answer" in text.lower()
    return "" if declined_to_answer else text


In [102]:
# Free-form QA prompt. The model is told to reply "No Answer" when the context
# is insufficient; extract_unknown then maps such replies to "".
prompt_template = """Given the following context:
CONTEXT: {context}
Answer the following question, if there's not enough information to answer the question, write "No Answer".
QUESTION: {question}
ANSWER: """

# The result is unpacked as (post-processed predictions, raw completions).
# NOTE(review): presumably num_samples is the number of completions per example
# and max_tokens caps the completion length — confirm in ModelPredictor.generate.
results_generate, original_preds = model.generate(
    dataset=train_set,
    prompt_template=prompt_template,
    extract_function=extract_unknown, 
    num_samples=1,
    max_tokens=50,
)
results_generate, original_preds

(['200 trucks',
  '\nLarge-scale raids were conducted, in which the use of mines and incendiaries for tactical expediency came close to indiscriminate bombing.',
  'Ibn Khafaja',
  '16 billion paper cups per year',
  " The Chalcedoni Terminal 1's.",
  '95%',
  '950',
  '',
  '1639',
  ' District-free Towns/Cities (Kreisfreie Städte).'],
 ['200 trucks',
  '\nLarge-scale raids were conducted, in which the use of mines and incendiaries for tactical expediency came close to indiscriminate bombing.',
  'Ibn Khafaja',
  '16 billion paper cups per year',
  " The Chalcedoni Terminal 1's.",
  '95%',
  '950',
  'No Answer',
  '1639',
  ' District-free Towns/Cities (Kreisfreie Städte).'])

In [77]:
def extract_yes_or_no(text):
    """Normalise a free-form completion to "Yes", "No", or None.

    The completion is lower-cased and split into alphabetic words, and the
    answer is detected by whole-word match. Plain substring matching would
    mistake words that merely contain the letters ("know", "not", "none",
    "eyes") for an answer.

    Args:
        text: Raw model completion. None or an empty string means there is
            nothing to classify.

    Returns:
        "Yes" if the word "yes" occurs anywhere in the completion, otherwise
        "No" if the word "no" occurs, otherwise None.
    """
    if not text:
        return None

    # Replace every non-letter with a space so punctuation and newlines act as
    # word separators ("YES/NO" -> ["yes", "no"], "Yes," -> ["yes"]).
    words = "".join(ch if ch.isalpha() else " " for ch in text.lower()).split()

    # "yes" keeps priority over "no", as in the original implementation.
    if "yes" in words:
        return "Yes"
    if "no" in words:
        return "No"
    return None

# Answerability prompt: asks only whether the question can be answered from the
# context; the reply is normalised to "Yes"/"No"/None by extract_yes_or_no.
classify_with_generation_template = """Given the following context:
CONTEXT: {context}
And the following question
QUESTION: {question}
Can you answer the question? (YES/NO):
"""

# The result is unpacked as (normalised labels, raw completions).
# NOTE(review): max_tokens=5 presumably keeps the reply to a short YES/NO —
# confirm in ModelPredictor.generate.
results_classify, original_preds_class = model.generate(
    dataset=train_set,
    prompt_template=classify_with_generation_template,
    extract_function=extract_yes_or_no, 
    num_samples=1,
    max_tokens=5,
)
results_classify, original_preds_class

(['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes'],
 [' YES',
  '\n\nYES',
  '\nYes, the poet',
  ' YES\n\nI can answer',
  '\n\nYES',
  '\nYes',
  ' YES ',
  '\n\nNo',
  '\nNo',
  ' YES'])

## Eval

In [103]:
# Show the dataset's "answers" column next to the generated answers and the
# Yes/No answerability labels for a manual comparison.
list(train_set.get_column("answers")), results_generate, results_classify

([[{'text': '200', 'answer_start': 225}],
  [{'text': 'large-scale raids', 'answer_start': 359}],
  [{'text': 'Ibn Khafaja', 'answer_start': 424}],
  None,
  None,
  [],
  None,
  [],
  [{'text': '1639', 'answer_start': 172}],
  []],
 ['200 trucks',
  '\nLarge-scale raids were conducted, in which the use of mines and incendiaries for tactical expediency came close to indiscriminate bombing.',
  'Ibn Khafaja',
  '16 billion paper cups per year',
  " The Chalcedoni Terminal 1's.",
  '95%',
  '950',
  '',
  '1639',
  ' District-free Towns/Cities (Kreisfreie Städte).'],
 ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes'])

In [104]:
# Reload the SQuAD scoring helper so local edits are picked up, then re-import it.
import evaluation.squad_eval as squad_eval_module
reload(squad_eval_module)
from evaluation.squad_eval import get_raw_scores_mod

# Key each post-processed prediction by its question id (assumes the "id"
# column and results_generate are in the same order).
preds = dict(zip(train_set.get_column("id"), results_generate))
# NOTE(review): both results are displayed as dicts keyed by question id; the
# first is presumably the exact-match score — confirm in get_raw_scores_mod.
raws_scores, f1_scores = get_raw_scores_mod(train_set, preds)

raws_scores, f1_scores

({'5727e04eff5b5019007d974b': 0,
  '57303ce9a23a5019007fcfdc': 0,
  '570d1e05b3d812140066d43b': 1,
  '56e7691537bdd419002c3f6f': 0,
  '5728c4de4b864d1900164da2': 0,
  '5ad1864c645df0001a2d1e8e': 0,
  '573393184776f41900660daa': 0,
  '5a4e9663755ab9001a10f51c': 1,
  '5727dcc53acd2414000dee4a': 1,
  '5a516e40ce860b001aa3fdaa': 0},
 {'5727e04eff5b5019007d974b': 0.6666666666666666,
  '57303ce9a23a5019007fcfdc': 0.1904761904761905,
  '570d1e05b3d812140066d43b': 1.0,
  '56e7691537bdd419002c3f6f': 0,
  '5728c4de4b864d1900164da2': 0,
  '5ad1864c645df0001a2d1e8e': 0,
  '573393184776f41900660daa': 0,
  '5a4e9663755ab9001a10f51c': 1,
  '5727dcc53acd2414000dee4a': 1.0,
  '5a516e40ce860b001aa3fdaa': 0})

In [223]:
# Compare the generated answers with the dataset's base answers.
# NOTE(review): the saved output of this cell shows numeric answers that do not
# match the SQuAD predictions produced earlier in the notebook, so it appears to
# come from a different dataset / kernel state — re-run from a fresh kernel
# before trusting it.
print("model output= ", results_generate)
print("ground truth= ", list(train_set.get_base_answer()))
correct, accuracy = evaluate_generation(train_set, results_generate)
correct, accuracy


model output=  ['100', '12', '8', '32', '60,000', '90', '60', '38', '37,000', '482']
ground truth=  [25.0, 36.0, 17.0, 64.0, 4800.0, 110.0, 70.0, 6.0, 46000.0, 750.0]


(array([False, False, False, False, False, False, False, False, False,
        False]),
 0.0)

In [224]:
# Translate type labels into the "Yes"/"No" vocabulary used by the
# classification predictions.
# NOTE(review): the keys are presumably the type labels that Dataset.get_type
# works with — confirm against the loader.
to_choice_mapping = {
    "not enough information": "No",
    "enough information": "Yes",
}
# Predicted labels next to the mapped dataset types.
results_classify, list(train_set.get_type(to_choice_mapping))

(['Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No'],
 ['Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No'])

In [225]:
# Score the Yes/No predictions against the dataset types, using the same
# label-to-choice mapping.
# NOTE(review): `correct` and `accuracy` are also assigned by the
# evaluate_generation cell, so the values depend on which cell ran last.
correct, accuracy = evaluate_classification(
    train_set,
    results_classify, # type: ignore
    to_choice_mapping=to_choice_mapping,
)
correct, accuracy

(array([ True, False, False, False,  True,  True,  True,  True, False,
         True]),
 0.6)

In [19]:
# Toy call of get_confusion_matrix on hand-written labels, with "unknown"
# passed as the unknown_value marker.
get_confusion_matrix(
    predict=["unknown", "10", "11", "9", "unknown"],
    original=["unknown", "9", "11", "unknown", "unknown"],
    unknown_value = "unknown"
)

(array([[1., 0.],
        [0., 2.],
        [1., 1.]]),
 0)