In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import json
import random
from http import HTTPStatus
import dashscope
from dashscope.api_entities.dashscope_response import Role

In [2]:
class AQuADataset(Dataset):
    def __init__(self, data_path, max_records=10):
        with open(data_path, 'r') as file:
            data = [json.loads(line) for line in file]
        data = data[:max_records]
        df = pd.DataFrame(data)
        df = df.head(max_records)
        
        def combine(row):
            choices = [f"{chr(65 + i)}: {option}" for i, option in enumerate(row['options'])]
            return f"Question: {row['question']} Choices: {', '.join(choices)}"
        
        self.inputs = df.apply(combine, axis=1).tolist()
        self.labels = [row['correct'] for row in data]
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

# 修改数据路径
aqua_data_path = 'train.json'

# 实例化AQuA数据集
aqua_dataset = AQuADataset(aqua_data_path)

# 创建DataLoader
aqua_dataloader = DataLoader(aqua_dataset, batch_size=10, shuffle=False)

In [3]:
for batch in aqua_dataloader:
    inputs, labels = batch
    print(f"Inputs: {inputs}")
    print(f"Labels: {labels}")
    break

# 以下代码保持不变
dashscope.api_key = "sk-ea4a0e21cc10489f920bbbf3863d1475"

Inputs: ("Question: Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other? Choices: A: A)21, B: B)21.5, C: C)22, D: D)22.5, E: E)23", 'Question: In the coordinate plane, points (x, 1) and (5, y) are on line k. If line k passes through the origin and has slope 1/5, then what are the values of x and y respectively? Choices: A: A)4 and 1, B: B)1 and 5, C: C)5 and 1, D: D)3 and 5, E: E)5 and 3', 'Question: For all numbers p and q, the operation @ is defined by p@q = p^2 - pq. If xy ≠ 0, then which of the following can be equal to zero?\nI. x@y\nII. (xy)@y\nIII. x@(x + y) Choices: A: A)II, B: B)I and II, C: C)I and III, D: D)II and III, E: E)All of the above', 'Question: Carl is facing very difficult financial times and can only pay the interest on a $10,000 loan he has taken. The bank charges him a quarterly compound rate

In [4]:
def call_with_messages(content: str):
    messages = [{'role': 'user', 'content': content}]
    response = dashscope.Generation.call(
        'qwen-long',    
        messages=messages,    
        result_format='message',   
        max_tokens=300
    )
    if response.status_code == HTTPStatus.OK:
        return response.output.choices[0]['message']['content']
    else:
        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
    ))

def inference(input):
    outputs = call_with_messages(input)
    # print("outputs:", outputs)
    return outputs

trigger_sentences = [
    "Let's think step by step.",
    "We should think about this step by step.",
    "First,",
    "Before we dive into the answer,",
    "Proof followed by the answer.",
    "Let's think step by step in a realistic way.",
    "Let's think step by step using common sense and knowledge.",
    "Let's think like a detective step by step.",
    "Let's think about this logically.",
    "Let's think step by step. First,",
    "Let's think",
    "Let's solve this problem by splitting it into steps.",
    "The answer is after the proof.",
    "Let's be realistic and think step by step."
]


In [5]:
epochs = 1
num_paths = 5  # Number of reasoning paths to generate
accuracies = []


In [6]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [7]:
# for epoch in range(epochs):
#     correct_count = 0
#     total_count = 0
    
#     for batch in aqua_dataloader:  # 修改为aqua_dataloader
#         questions, true_labels = batch
#         print(true_labels)
#         predicted_answers = []
        
#         for question in questions:
#             path_answers = []
#             for _ in range(num_paths):
#                 trigger_sentence = random.choice(trigger_sentences)
#                 X0 = f"Q: {question}. A: {trigger_sentence}"
#                 # print(X0)
#                 Z = inference(X0)
#                 # print(Z)
#                 second_prompt = f"{X0} {Z} Therefore, the answer (alphabet) is"
#                 # print(second_prompt)
#                 final_answer = inference(second_prompt)
#                 # print(final_answer)
#                 predicted_answer = next((char for char in reversed(final_answer) if char.isupper()), None)
#                 path_answers.append(predicted_answer)
            
#             # Use self-consistency to select the most common answer
#             print(path_answers)
#             final_predicted_answer = most_common(path_answers)
#             print(f"Final Predicted Answer: {final_predicted_answer}")
            
#             predicted_answers.append(final_predicted_answer)
        
#         batch_correct_count = sum(1 for predicted, true in zip(predicted_answers, true_labels) if predicted == true)
#         batch_total_count = len(true_labels)
#         correct_count += batch_correct_count
#         total_count += batch_total_count
        
#         print(f"Predicted Answers: {predicted_answers}")
#         print(f"True Labels: {true_labels}")
#         break
    
#     accuracy = correct_count / total_count
#     accuracies.append(accuracy)
#     print(f"Epoch {epoch + 1}/{epochs} Accuracy: {accuracy * 100}%")

# average_accuracy = sum(accuracies) / epochs
# print(f"Average Accuracy over {epochs} epochs: {average_accuracy * 100}%")

In [8]:
for epoch in range(epochs):
    correct_count = 0
    total_count = 0
    
    for batch in aqua_dataloader:  # 修改为aqua_dataloader
        inputs, labels = batch

        for question, true_label in zip(inputs, labels):
            print(true_label)
            path_answers = []
            for _ in range(num_paths):
                trigger_sentence = random.choice(trigger_sentences)
                X0 = f"Q: {question} A: {trigger_sentence}"
                Z = inference(X0)
                final_prompt = f"{X0} {Z} Therefore, among A through E, the answer is(only one alphabet, no need to write the whole word, e.g., 'A' or 'B' or 'C' or 'D' or 'E')"
                final_answer = inference(final_prompt)
                prediction = None
                for char in final_answer:
                    if char.isupper():
                        prediction = char
                        break
                path_answers.append(prediction)
            print(path_answers)
            # Use self-consistency to select the most common answer
            final_prediction = most_common(path_answers)
            if final_prediction == true_label:
                correct_count += 1
            total_count += 1
        break

    accuracy = correct_count / total_count if total_count > 0 else 0
    accuracies.append(accuracy)
    print(f"Epoch {epoch + 1}/{epochs} - Accuracy: {accuracy * 100:.2f}%")

print("Accuracies over epochs:", accuracies)

E
['D', 'D', 'D', 'C', 'D']
C
['C', 'C', 'C', 'C', 'C']
B
['D', 'B', 'A', 'C', 'B']
A
['C', 'C', 'C', 'A', 'C']
E
['D', 'E', 'E', 'E', 'B']
A
['C', 'C', 'A', 'A', 'C']
A
['A', 'A', 'A', 'B', 'B']
E
['E', 'C', 'E', 'C', 'C']
B
['E', 'B', 'A', 'C', 'C']
C
['C', 'C', 'C', 'C', 'C']
Epoch 1/1 - Accuracy: 50.00%
Accuracies over epochs: [0.5]
