In [12]:
from importlib import reload
import data.loader as loader_module
import evaluation.predict as predict_module
import evaluation.eval as eval_module

reload(predict_module)
reload(eval_module)
reload(loader_module)

from evaluation.predict import ModelPredictor
from evaluation.eval import evaluate_generation, evaluate_classification, get_confusion_matrix
from data.loader import Dataset, get_dataset

## Load dataset

In [93]:
train_set, test_set = get_dataset(
    10,
    split_percentage=1.0,
    percent_files=(0.34, 0.33, 0.33),
    file_names=("base.json", "replace.json", "which.json"),
)
print(len(train_set))
print(train_set[-1])
train_set.data[-1]


10
Rose had some kilograms of rice. She cooked 9/10 kilograms in the morning and 1/4 of the remaining in the evening. How many grams of rice did she have left?


{'question': 'Rose had some kilograms of rice. She cooked 9/10 kilograms in the morning and 1/4 of the remaining in the evening. How many grams of rice did she have left?',
 'type': 'not enough information',
 'base answer': 750.0,
 'file_origin': 'replace.json'}

In [102]:
for i in range(min(len(train_set), 10)):
    print(train_set.data[i])


{'question': 'A lumberjack is chopping down trees so he can sell firewood. Each tree he chops produces 4 logs each, and each log is then chopped into 5 pieces of firewood. If the lumberjack has chopped 500 pieces of firewood, how many trees did he chop down?', 'type': 'enough information', 'base answer': 25.0, 'file_origin': 'base.json'}
{'question': 'May can knit some scarves using one yarn. She bought 2 red yarns, 6 blue yarns, and 4 yellow yarns. How many scarves will she be able to make in total?', 'type': 'not enough information', 'base answer': 36.0, 'file_origin': 'replace.json'}
{'question': 'Bill and Ted went into the forest to gather some wild mushrooms. Bill gathered 12 red mushrooms and 6 brown mushrooms.  Ted gathered 14 green mushrooms and 6 blue mushrooms.  If half of the blue mushrooms, two-thirds of the red mushrooms, and all of the brown mushrooms have white spots, how many white-spotted mushrooms did they gather?', 'type': 'enough information', 'base answer': 17.0, '

## Predict

In [218]:
model_name = "gpt-3.5-turbo-instruct"
model = ModelPredictor(model_name)


In [215]:
import re
def extract_numbers_from_text(text):
    try:
        return re.findall(
                r"(?<!\S)(?=.)[\$]?(-?(0|([1-9](\d*|\d{0,2}(,\d{3})*))))?(\.\d*[1-9])?(?!\S)", text
            )[-1][0]
    except IndexError:
        return None

In [219]:
prompt_template = """What is the answer (just the number nothing else) for the following question:
QUESTION: {question}
ANSWER: """

results_generate, original = model.generate(
    dataset=train_set,
    prompt_template=prompt_template,
    extract_function=extract_numbers_from_text, 
    num_samples=1,
    num_tokens=5,
)
results_generate, original

(['100', '12', '8', '32', '60,000', '90', '60', '38', '37,000', '482'],
 ['100 trees\n',
  '12',
  '8',
  '32',
  '60,000',
  '90',
  '60',
  '38',
  '$37,000',
  '482.5 grams'])

In [222]:
def extract_yes_or_no(text):
    if "yes" in text.lower():
        return "Yes"
    elif "no" in text.lower():
        return "No"
    else:
        return None

classify_with_generation_template = """Is there enough information to answer this question? 
QUESTION: {question}
-- Only answer with yes or no --
ANSWER: """

results_classify, original = model.generate(
    dataset=train_set,
    prompt_template=classify_with_generation_template,
    extract_function=extract_yes_or_no, 
    num_samples=1,
    num_tokens=5,
)
results_classify, original

(['Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No'],
 [' Yes', 'Yes', 'No ', 'No', ' No', 'No', 'Yes', 'No', 'Yes', ' No '])

In [208]:
# This only works with certain older models!
model_name = "davinci-002" 
model = ModelPredictor(model_name)

classify_template = """Is there enough information to answer this question? 
{question}
ANSWER: {choice}"""
choices = ["No", "Yes"]

results_classify = model.classify(
    dataset=train_set,
    prompt_template=classify_template,
    labels=choices,
    return_probs=False,
)
results_classify

['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes']

## Eval

In [223]:
print("model output= ", results_generate)
print("ground truth= ", list(train_set.get_base_answer()))
correct, accuracy = evaluate_generation(train_set, results_generate)
correct, accuracy


model output=  ['100', '12', '8', '32', '60,000', '90', '60', '38', '37,000', '482']
ground truth=  [25.0, 36.0, 17.0, 64.0, 4800.0, 110.0, 70.0, 6.0, 46000.0, 750.0]


(array([False, False, False, False, False, False, False, False, False,
        False]),
 0.0)

In [224]:
to_choice_mapping = {
    "not enough information": "No",
    "enough information": "Yes",
}
results_classify, list(train_set.get_type(to_choice_mapping))

(['Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No'],
 ['Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No'])

In [225]:
correct, accuracy = evaluate_classification(
    train_set,
    results_classify, # type: ignore
    to_choice_mapping=to_choice_mapping,
)
correct, accuracy

(array([ True, False, False, False,  True,  True,  True,  True, False,
         True]),
 0.6)

In [16]:
get_confusion_matrix(
    predict=["unknown", "10", "11", "9", "unknown"],
    original=["unknown", "9", "11", "unknown", "unknown"],
    unknown_value = "unknown"
)

array([[1., 0.],
       [0., 2.],
       [1., 1.]])