In [92]:
!pip install gradio



### This code is used to randomly extract a subset of samples from the train.jsonl file for small-scale testing.

In [111]:
import json
import random

def sample_questions_and_answers(filepath, sample_size=3, seed=None):
    """Draw a random sample of aligned question/answer pairs from a JSONL file.

    Args:
        filepath: Path to a JSON Lines file. Every non-blank line must be a
            JSON object with 'question' and 'answer' keys.
        sample_size: Number of pairs to draw. If the file holds fewer pairs,
            a warning is printed and every pair is returned.
        seed: Optional seed for a private random generator, so the same
            sample can be drawn again. ``None`` (the default) uses the shared
            state of the ``random`` module.

    Returns:
        A ``(question_sample, answer_sample)`` tuple of two equally long
        lists; index ``i`` in both lists refers to the same record.

    Raises:
        ValueError: If ``sample_size`` is negative (raised by ``random.sample``).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the 'question' or 'answer' key.
    """
    # Read all questions and answers into two parallel lists
    questions = []
    answers = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) are not records;
                # json.loads('') would raise on them.
                continue
            qa_pair = json.loads(line)
            questions.append(qa_pair['question'])
            answers.append(qa_pair['answer'])

    # If the file contains fewer questions than requested, warn and return them all
    if len(questions) < sample_size:
        print("Warning: The file contains fewer questions than the requested sample size.")
        sample_size = len(questions)

    # A private generator keeps a seeded draw from disturbing global random state
    rng = random.Random(seed) if seed is not None else random
    sampled_indices = rng.sample(range(len(questions)), sample_size)

    # Index both lists with the same indices so the pairs stay aligned
    question_sample = [questions[i] for i in sampled_indices]
    answer_sample = [answers[i] for i in sampled_indices]

    return question_sample, answer_sample

# Draw a small random sample from the training file and show it
filepath = 'train.jsonl'  # Replace with your file path
question_sample, answer_sample = sample_questions_and_answers(filepath)

# All questions first, then a divider, then the answers in the same order
for sampled_question in question_sample:
    print("Question:", sampled_question)
print("----")
for sampled_answer in answer_sample:
    print("Answer:", sampled_answer)


Question: Ryan is looking for people to crowdfund his new business idea.  If the average person funds $10 to a project they're interested in, how many people will Ryan have to recruit to fund a $1,000 business if he has $200 already?
Question: John buys 1.5 ounces of silver and twice as much gold.  The silver costs $20 per ounce.  The gold is 50 times more expensive per ounce.  How much does he spend on everything?
Question: It’s spring! The gardener at Parc Municipal orders flowers. He takes 250 tulips, 375 carnations and 320 roses. Each flower is sold for 2€. Calculate the total expenses.
----
Answer: First, we need to determine how much money Ryan needs to hit his goal which we find by subtracting his available cash from his target, performing 1000-200=<<1000-200=800>>800 dollars needed.
Since the average person contributes $10 when crowdfunding, this means he needs to find 800/10=<<800/10=80>>80 people to fund his business well enough for him to hit his goal.
#### 80
Answer: He buy

### Generate responses for the extracted samples

In [112]:
import os
from getpass import getpass

# Never paste the key into the notebook: it would be saved with the file.
# Keep a key that is already configured in the environment; otherwise ask
# for it interactively (the input is not echoed or stored in the notebook).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [113]:
! pip install openai



In [114]:
import openai
from datetime import datetime
from openai import OpenAI

# Module-level client created without an explicit key. Note that
# generate_gpt_answers and standardize_answers below build their own client
# objects, so this instance is not the one they use.
client = OpenAI()

def generate_gpt_answers(questions, client=None, model="gpt-3.5-turbo", temperature=0.5):
    """Ask a chat model to answer each question and collect the replies.

    Args:
        questions: Iterable of question strings.
        client: Object exposing ``chat.completions.create(...)``. When
            ``None`` (default) a new ``OpenAI()`` client is created, which
            takes its key from the ``OPENAI_API_KEY`` environment variable
            instead of a key hard-coded in this function.
        model: Chat model name; can be replaced with a newer model version.
        temperature: Sampling temperature; adjust to change the creativity
            of the answers.

    Returns:
        List with one reply text per question, in input order.
    """
    if client is None:
        # No api_key argument: an empty hard-coded key would override the
        # key configured through the environment.
        client = OpenAI()

    response_sample = []
    for question in questions:
        messages = [
            {"role": "system", "content": 'You are a tutor answering math questions.'},
            {"role": "user", "content": question},
        ]
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        response_sample.append(completion.choices[0].message.content)

    return response_sample

# Query the model once per sampled question and keep the replies in order
response_sample = generate_gpt_answers(question_sample)

# Show each reply followed by a divider line
for gpt_reply in response_sample:
    print("GPT Response:", gpt_reply)
    print("----")


GPT Response: Ryan needs to raise $1,000 for his business idea and he already has $200. Therefore, he still needs to raise $1,000 - $200 = $800 more.

If the average person funds $10, then Ryan will need $800 / $10 = 80 people to fund his business.
----
GPT Response: John buys 1.5 ounces of silver at $20 per ounce, so he spends 1.5 * $20 = $30 on silver.
He buys twice as much gold as silver, so he buys 1.5 * 2 = 3 ounces of gold.
Gold is 50 times more expensive per ounce than silver, so it costs 50 * $20 = $1000 per ounce.
So he spends 3 * $1000 = $3000 on gold.
Therefore, John spends $30 + $3000 = $3030 on everything.
----
GPT Response: To find the total expenses, we first need to calculate the total number of flowers ordered:

Total number of flowers = 250 tulips + 375 carnations + 320 roses
Total number of flowers = 945 flowers

Next, we calculate the total cost of all the flowers:

Total cost = Total number of flowers * Cost per flower
Total cost = 945 flowers * 2€/flower
Total cos

In [118]:
def _extract_boxed_content(text):
    """Return the text inside the first ``\\boxed{...}`` of ``text``, or None if there is none."""
    marker = '\\boxed{'
    start = text.find(marker)
    if start == -1:
        return None
    start += len(marker)
    depth = 1  # track nesting so content such as \frac{1}{2} is not cut at the first '}'
    for pos in range(start, len(text)):
        char = text[pos]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return text[start:pos]
    # No closing brace: keep everything after the marker instead of silently
    # dropping the last character (what slicing with find() == -1 would do).
    return text[start:]


def standardize_answers(response_sample, client=None, model="gpt-3.5-turbo-1106"):
    """Reduce each free-form model reply to a ``\\boxed{...}`` final answer.

    Each reply is sent to a chat model with an instruction to extract the
    final result; the box content is then pulled out of the model's output.

    Args:
        response_sample: Iterable of reply texts to standardize.
        client: Object exposing ``chat.completions.create(...)``. When
            ``None`` (default) a new ``OpenAI()`` client is created, which
            takes its key from the ``OPENAI_API_KEY`` environment variable
            instead of a key hard-coded in this function.
        model: Chat model name; can be replaced with a newer model version.

    Returns:
        List of strings of the form ``\\boxed{<content>}``, one per input,
        in input order. The content is empty when no answer was recognized.
    """
    import re  # local import: `re` is not among this notebook's top-level imports

    if client is None:
        client = OpenAI()

    standardize_sample = []

    # Construct a standardization request for each answer
    for response_text in response_sample:
        messages = [
            {"role": "system", "content": 'Extract and format the answer in a standardized form, enclosed within \\boxed{}. OUTPUT ONLY THE RESULT NUMBER, DO NOT ADD ADDITIONAL CONTENT.'},
            {"role": "user", "content": response_text},
        ]
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0  # Use temperature 0 to get the most consistent output
        )
        reply = completion.choices[0].message.content or ""

        boxed_content = _extract_boxed_content(reply)
        if boxed_content is None:
            # The prompt also asks for "ONLY THE RESULT NUMBER", so the model
            # may answer with a bare number and no box; accept that rather
            # than discarding a usable answer.
            bare_reply = reply.strip()
            if re.fullmatch(r'-?\d[\d,]*(?:\.\d+)?', bare_reply):
                boxed_content = bare_reply
            else:
                # Unrecognized format: emit an empty box so the sample is
                # flagged for human review downstream.
                boxed_content = ''

        standardize_sample.append(f"\\boxed{{{boxed_content}}}")

    return standardize_sample

# Reduce every model reply to its boxed final answer
standardized_sample = standardize_answers(response_sample)

# One line per standardized answer, in the same order as the questions
for boxed_answer in standardized_sample:
    print("Standardized Answer:", boxed_answer)


Standardized Answer: \boxed{}
Standardized Answer: \boxed{3030}
Standardized Answer: \boxed{1890}


In [119]:
def _answers_match(expected, predicted):
    """Return True when two final answers agree as text or as numbers."""
    def _clean(text):
        # Thousands separators and a dollar sign do not change the value
        return text.strip().replace(',', '').replace('$', '').strip()

    left, right = _clean(expected), _clean(predicted)
    if left == right:
        return True
    try:
        # Numeric comparison treats e.g. '80' and '80.0' as the same answer
        return float(left) == float(right)
    except ValueError:
        return False


def create_eval_json_file(question_sample, answer_sample, response_sample, standardized_sample, filepath="eval_result.json"):
    """Write one JSON object per sample (JSON Lines) with an automatic correctness flag.

    Args:
        question_sample: Question texts.
        answer_sample: Reference answers; the final answer is the text after
            the last '###'/'####' marker.
        response_sample: Raw model replies, stored as 'pred_answer'.
        standardized_sample: Model answers in ``\\boxed{...}`` form.
        filepath: Output path. The file is overwritten.

    The four sequences are combined with ``zip``, so extra items in longer
    sequences are ignored. Each line holds the keys 'question', 'answer',
    'pred_answer', 'standardized_answer' and 'is_correct'.
    """
    marker = '\\boxed{'
    with open(filepath, "w", encoding='utf-8') as f:
        for question, answer, response, standardized_answer in zip(question_sample, answer_sample, response_sample, standardized_sample):

            # Final answer = text after the last '###'; lstrip removes the
            # leftover '#' and space when the marker is actually '####'.
            final_answer = answer.split('###')[-1].strip().lstrip('# ')

            # Extract the number or text from standardized_answer
            if marker in standardized_answer:
                start = standardized_answer.find(marker) + len(marker)
                # Use the LAST closing brace so nested braces inside the box
                # (e.g. \frac{1}{2}) are kept intact.
                end = standardized_answer.rfind('}')
                if end < start:
                    end = len(standardized_answer)  # no closing brace at all
                standardized_answer_content = standardized_answer[start:end]
            else:
                standardized_answer_content = ''  # Empty string if the format is incorrect

            # Compare leniently: '1,000' vs '1000' or '80.0' vs '80' are the same answer
            is_correct = _answers_match(final_answer, standardized_answer_content)

            # Construct JSON object
            data = {
                "question": question,
                "answer": answer,
                "pred_answer": response,
                "standardized_answer": standardized_answer,
                "is_correct": is_correct  # Result of the answer comparison
            }
            # Write one record per line
            f.write(json.dumps(data) + "\n")

# Create JSONL file
create_eval_json_file(question_sample, answer_sample, response_sample, standardized_sample)


### Evaluate the correctness of the answers generated by GPT

In [125]:
# -*- coding: utf-8 -*-
# Human Evaluation for GSM8K
import json
import gradio as gr

def read_samples(filepath="eval_result.json"):
    """Load the samples that still need a human verdict.

    A sample is selected when its standardized answer contains no
    ``\\boxed`` or when its automatic ``is_correct`` flag is false.

    Args:
        filepath: JSON Lines file with one evaluation record per line.

    Returns:
        List of rows ``[no, question, reference final answer,
        standardized answer, prediction, 'F']`` where ``no`` counts from 1
        and 'F' is the default human evaluation.
    """
    content = []
    cnt = 0
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines; json.loads('') would raise
            sample_dict = json.loads(line)
            # Include samples that do not contain '\\boxed' or have 'is_correct' as false
            if ('\\boxed' not in sample_dict['standardized_answer']) or (not sample_dict['is_correct']):
                cnt += 1
                content.append([cnt,
                                sample_dict['question'],
                                sample_dict['answer'].split('####')[-1].strip(),  # reference final answer
                                sample_dict['standardized_answer'],  # Show standardized answer
                                sample_dict['pred_answer'],
                                'F'])  # Default value for human evaluation
    return content

def get_human_eval(df, filepath="eval_result.json"):
    """Combine automatic and human verdicts into an overall accuracy summary.

    Args:
        df: pandas DataFrame with a 'Human Evaluation' column; a row counts
            as correct when that cell is 'T' (surrounding whitespace and
            letter case are ignored).
        filepath: JSON Lines file with one evaluation record per line.

    Returns:
        A two-line summary string with the number of human-approved samples
        and the total accuracy over all records in the file.
    """
    # Model evaluation: records that are boxed and automatically marked correct
    with open(filepath, "r", encoding="utf-8") as f:
        # Ignore blank lines so they neither crash json.loads nor inflate the total
        records = [json.loads(line) for line in f if line.strip()]

    model_true_cnt = sum(
        1 for sample_dict in records
        if '\\boxed' in sample_dict['standardized_answer'] and sample_dict['is_correct']
    )

    # Human evaluation: rows the reviewer marked as true
    human_true_cnt = 0
    for _, row in df.iterrows():
        if str(row['Human Evaluation']).strip().upper() == 'T':
            human_true_cnt += 1

    total_entries = len(records)
    total_correct = model_true_cnt + human_true_cnt
    # An empty results file would otherwise raise ZeroDivisionError
    total_accuracy = total_correct / total_entries if total_entries else 0.0
    return (f"Update {human_true_cnt} samples with human evaluation, \n"
            f"Total Accuracy: {total_correct}/{total_entries} = {total_accuracy:.2%}")

# Rows needing review and the column titles of the editable table
review_rows = read_samples()
column_headers = ['No.', 'Question', 'Answer', 'Standardized Answer', 'Prediction', 'Human Evaluation']

# Layout: an editable table on top, a result box and a button underneath.
# Clicking the button passes the (possibly edited) table to get_human_eval
# and shows the returned summary in the text box.
with gr.Blocks() as demo:
    with gr.Column():
        with gr.Row():
            table = gr.DataFrame(
                label='Table',
                value=review_rows,
                headers=column_headers,
                interactive=True,
                wrap=True,
            )
        with gr.Row():
            output = gr.Textbox(label='Human Evaluation')
            submit = gr.Button("Search")

        submit.click(fn=get_human_eval, inputs=table, outputs=output)

demo.launch()




Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://f333a63cf1978ab608.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


