# process data

In [None]:
from datasets import load_dataset
data = load_dataset('econ logic qa data')

In [None]:
data['train'][0]

In [None]:
# Instruction prompt; the five "{}" slots receive the question followed by options A-D.
prompt_template = """Given a specific process, event, or scenario, along with a set of unordered options, arrange them into a logically ordered sequence. 
The sequence should follow a natural progression where each step builds upon the previous one.
Question: {}
Options: 
A: {}
B: {}
C: {}
D: {}
Answer:"""

# Walk the train split once, collecting the filled-in prompt and the reference answer per example.
query = []
answer = []
for example in data['train']:
    option_texts = [example[label] for label in ('A', 'B', 'C', 'D')]
    query.append(prompt_template.format(example['Question'], *option_texts))
    answer.append(example['Answer'])


In [None]:
import pandas as pd
# Logic-QA rows in the five-column layout (query / text / answer / query_code / program);
# 'query_code' and 'program' are filled with the placeholder string 'No'.
df = pd.DataFrame({'query': query, 'text': data['train']['Question'],'answer': answer, 'query_code': 'No', 'program': 'No'})
# The "merge data" section concatenates a frame called df_logic, which is not
# assigned anywhere in this notebook as shown; bind it to this frame so that
# step can run. NOTE(review): confirm this is the frame that was intended.
df_logic = df

In [None]:
df

In [None]:
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi

# Name of the target dataset repository on the Hugging Face Hub
dataset_repo = "XXXX/XXXo1"

# Wrap the pandas DataFrame as a Hugging Face Dataset and push it to the Hub
hf_dataset = Dataset.from_pandas(df)
hf_dataset.push_to_hub(dataset_repo)

print(f"Dataset uploaded to: https://huggingface.co/datasets/{dataset_repo}")

# filter-step1 

In [None]:
import os
from tenacity import retry, stop_after_attempt, wait_fixed
import requests
import re
import json
import traceback
# API credentials and endpoints read by the cells below.
# setdefault leaves any value that is already present in the environment
# untouched, so real keys exported outside the notebook are not overwritten
# by the placeholders written here.
os.environ.setdefault("OPENAI_API_SECRET_KEY", "your openai api key here")

os.environ.setdefault("DEEPSEEK_API_SECRET_KEY", "your deepseek api key here")
os.environ.setdefault('OPENAI_URL', "https://api.openai.com/v1/chat/completions")
os.environ.setdefault('DEEPSEEK_URL', "https://api.deepseek.com/v1/chat/completions")

os.environ.setdefault("HF_TOKEN", "your hugging face token here")

class GPT:
    """Small client that sends a single user message to a chat-completions URL.

    The request is a JSON POST carrying a Bearer token; the reply is expected
    to be JSON with the text at ``choices[0].message.content``.
    """

    def __init__(self, model_name, api_url, api_key):
        """Store the model name, endpoint URL and API key, and print the model name."""
        self.model_name = model_name
        self.api_url = api_url
        self.api_key = api_key
        print(f"Using model: {self.model_name}")

    def call(self, content, additional_args=None):
        """Send ``content`` as one user message and return the reply text.

        Args:
            content: Text placed in the single ``user`` message.
            additional_args: Optional mapping merged into the request payload
                after ``model`` and ``messages`` (so it can override them).
                ``None`` means no extra fields.

        Returns:
            The ``content`` string of the first choice in the response.

        Raises:
            ValueError: If the decoded response contains an ``error`` key.
        """
        # A None default avoids sharing one mutable dict across every call.
        extra_fields = {} if additional_args is None else additional_args
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }
        payload = {
            "model": self.model_name,
            "messages": [{'role': 'user', 'content': content}],
            **extra_fields,
        }
        response = requests.post(self.api_url, headers=headers, json=payload)
        response_data = response.json()

        if 'error' in response_data:
            raise ValueError(f"API Error: {response_data}")

        return response_data['choices'][0]['message']['content']

    @retry(wait=wait_fixed(3), stop=stop_after_attempt(3))
    def retry_call(self, content, additional_args=None):
        """Call :meth:`call`, retrying on any exception.

        Up to three attempts are made with a fixed three-second wait between
        them. When ``additional_args`` is omitted, ``{"max_tokens": 8192}`` is
        sent, matching the previous default.
        """
        if additional_args is None:
            # Fresh dict per call instead of a shared mutable default.
            additional_args = {"max_tokens": 8192}
        return self.call(content, additional_args)

In [None]:
gpt_35 = GPT(model_name='gpt-4o-mini', api_url=os.environ['OPENAI_URL'], api_key=os.environ["OPENAI_API_SECRET_KEY"])

In [None]:
gpt_35.retry_call('hi')

In [None]:
verify_prompt = """<Model Response>  
{}  
</Model Response>  

<Reference Answer>  
{}
</Reference Answer> 

You are provided with a model-generated response (<Model Response>) and a reference answer (<Reference Answer>). Compare the model response with the reference answer and determine its correctness. Please be mercy when judging. Your task is to simply output "True" if the response is correct, and "False" otherwise."""

In [None]:
def verify_gpt(conclusion,answer):
    """Ask the judge model whether a model response matches the reference answer.

    Args:
        conclusion: The model-generated response to be judged.
        answer: The reference answer to compare against.

    Returns:
        True if the first standalone "true"/"false" word in the judge's reply
        (case-insensitive) is "true"; False otherwise, including when the
        reply contains neither word.
    """
    query = verify_prompt.format(conclusion, answer)
    response = gpt_35.retry_call(query)
    # Use the first whole-word verdict rather than a substring test, so a reply
    # such as "False, this is not true" is not counted as a pass.
    verdict = re.search(r'\b(true|false)\b', response.lower())
    return verdict is not None and verdict.group(1) == 'true'

In [None]:
import pandas as pd
file_name = 'your jsonl dataset path' # For example o1-step-1-result/samples_o1_step1_2025-02-18T05-29-03.441995.jsonl

In [None]:
# Read the JSONL results file: one JSON object per line. The encoding is given
# explicitly so the platform default is not used, and blank lines (for example
# a trailing newline at the end of the file) are skipped instead of crashing
# json.loads.
with open(file_name, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Reference answer for every sample
gold = [item['target'] for item in data]
# First response of every sample, keeping only the text before the first '<|eot_id|>' marker
predict = [item['resps'][0][0].split('<|eot_id|>')[0] for item in data]
# Prompt that was sent for every sample
inputs = [item['doc']['query'] for item in data]

In [None]:
from tqdm import tqdm
# Judge every (prediction, reference) pair in order; tqdm shows progress.
verify = [verify_gpt(prediction, reference) for prediction, reference in tqdm(zip(predict, gold))]

In [None]:
df = pd.DataFrame({
    "inputs": inputs,
    "gold": gold,
    "predict": predict,
    "verify": verify
})

In [None]:
df.to_csv('verified_o1_step1.csv')

In [None]:
df_correct = df[df.verify == True].reset_index(drop=True)
df_wrong = df[df.verify == False].reset_index(drop=True)

# merge data

In [None]:
from datasets import load_dataset
import pandas as pd
# Every source dataset is reduced to the same five columns so the frames can
# be concatenated. Sharing one list also fixes the 'qucry' typo that made the
# bizbench selection fail with a KeyError.
source_columns = ['query', 'answer', 'query_code', 'program', 'text']

df_finqa = pd.DataFrame(load_dataset('FINQA test data')['train'])[source_columns]
df_dm = pd.DataFrame(load_dataset('DocMath-Eval data')['train'])[source_columns]
df_tat = pd.DataFrame(load_dataset('FLARE TATQA test data')['train'])[source_columns]

df_docfinqa = pd.DataFrame(load_dataset('DocFinQA data')['train'])[source_columns]
df_bizqa = pd.DataFrame(load_dataset('bizbench QA data')['train'])[source_columns]

In [None]:
df_docfinqa = df_docfinqa[df_docfinqa.program=='No'].reset_index(drop=True)
df_bizqa = df_bizqa[df_bizqa.program=='No'].reset_index(drop=True)

In [None]:
df_combined = pd.concat([df_finqa, df_dm, df_tat, df_logic, df_docfinqa[:1000],df_bizqa[:1000]], ignore_index=True)
df_filtered = df_combined[df_combined['answer'].astype(str).str.len() <= 10].reset_index(drop=True)

In [None]:
df_filtered.to_csv('o1.csv')

In [None]:
import pandas as pd
df = pd.read_csv('verified_o1_step1.csv')

In [None]:
# Map each prompt to its verification verdict; for a repeated prompt the last row wins.
query2result = dict(zip(df['inputs'], df['verify']))

In [None]:
# Look up the verdict for every query; queries with no recorded verdict count as False.
verify = [query2result.get(prompt, False) for prompt in df_filtered['query']]

df_filtered['verify'] = verify

In [None]:
df_filtered = df_filtered[df_filtered.verify==False].reset_index(drop=True)
df_filtered = df_filtered[df_filtered['answer'].astype(str).str.len() <= 10].reset_index(drop=True)
df_filtered = df_filtered.drop(['query_code', 'program'],axis=1)

In [None]:
df_filtered.to_csv('filtered_o1_training.csv', index=False)

# merge with CONVFINQA

In [None]:
import pandas as pd
df = df_filtered #pd.read_csv('filtered_o1_training.csv')

In [None]:
from datasets import load_dataset
conv = pd.DataFrame(load_dataset('XXXX/convfinqa')['train'])
conv.columns = ['id', 'query','answer', 'turn', 'dialogue_id']

# Marker that separates the leading part of a query from its conversation part
separator = '\nConversations: \n'

# For every dialogue keep only the row with the highest turn number
last_turn_rows = conv.groupby('dialogue_id')['turn'].idxmax()
df_max_turn = conv.loc[last_turn_rows].reset_index(drop=True)
# The text after the marker is stored separately in 'conver'
df_max_turn['conver'] = df_max_turn['query'].apply(lambda x: x.split(separator)[1])

# Append the conversation rows, fill the columns missing on either side with
# 'No', and drop the ConvFinQA bookkeeping columns
df = pd.concat([df, df_max_turn]).reset_index(drop=True).fillna('No')
df = df.drop(['id', 'turn', 'dialogue_id'], axis=1)
# Keep only the text before the marker as the query
df['query'] = df['query'].apply(lambda x: x.split(separator)[0])
df = df.drop(['verify'], axis=1)

In [None]:
# Save the frame produced by the ConvFinQA merge (`df`). The previous code
# re-saved the older df_combined from the "merge data" section, which has no
# 'conver' column and still contains the rows removed by the verify filter,
# although later cells read 'conver' from the CSV chain that starts here.
df_combined = df.fillna('No')
df_combined.to_csv('conv_and_filtered.csv', index=False)

In [None]:
import pandas as pd


# Extract the character window 500-700 of a text
def extract_substring(text):
    """Return characters 500 up to (not including) 700 of ``text``.

    Texts shorter than 700 characters yield whatever follows position 500,
    which is the empty string when the text has 500 characters or fewer.
    The previous version returned ``text[500:]`` in both branches, so the
    700-character cap was never applied.

    NOTE(review): every text of at most 500 characters maps to "" and will
    therefore look identical to any other short text when this value is used
    as a duplicate key; confirm that is acceptable.
    """
    return text[500:700]

# Temporary duplicate key derived from each query
df_combined["substring"] = df_combined["query"].apply(extract_substring)

# Rows sharing a key are duplicates: keep only the last occurrence, then drop the key column
is_earlier_duplicate = df_combined.duplicated(subset=["substring"], keep="last")
df_filtered = df_combined[~is_earlier_duplicate].drop(columns=["substring"])

In [None]:
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
df_filtered.to_csv('conv_and_filtered_remove_duplicated.csv', index=False)

In [None]:
# print(df_filtered['query'][0]) #['query'][0][500:]

# filter data

In [None]:
import pandas as pd
df_filtered = pd.read_csv('conv_and_filtered_remove_duplicated.csv')

In [None]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

def n_gram_similarity(str1, str2, n=2):
    """Return the overlap ratio of the character n-gram sets of two strings.

    The ratio is |shared n-grams| / |all distinct n-grams|. It is 0.0 when
    either string is shorter than ``n`` or when no n-grams exist at all.
    """
    def char_ngrams(text):
        # Every distinct length-n slice of the text
        return {text[start:start + n] for start in range(len(text) - n + 1)}

    if min(len(str1), len(str2)) < n:
        return 0.0

    grams_a = char_ngrams(str1)
    grams_b = char_ngrams(str2)
    combined = grams_a | grams_b
    if not combined:
        return 0.0
    return len(grams_a & grams_b) / len(combined)

def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated word sets of two strings.

    The result is 0.0 when neither string contains any word.
    """
    words_a, words_b = set(str1.split()), set(str2.split())
    all_words = words_a | words_b
    if not all_words:
        return 0.0
    return len(words_a & words_b) / len(all_words)

def deduplicate_queries(df_filtered, query_column="query", similarity_threshold=0.8, similarity_func=n_gram_similarity):
    """Among similar queries, keep only the one that appears last.

    Rows are visited from the last to the first. Each surviving row removes
    every earlier row whose query scores at least ``similarity_threshold``
    against it.

    Args:
        df_filtered: DataFrame holding the queries. It is not modified.
        query_column: Name of the column with the query text.
        similarity_threshold: Minimum score at which two queries count as
            duplicates.
        similarity_func: Callable ``(current, earlier) -> float``.

    Returns:
        A new DataFrame with the surviving rows in their original order,
        with the original index labels, columns and dtypes.

    Raises:
        ValueError: If ``query_column`` is not a column of ``df_filtered``.

    Notes:
        Rows are ordered by position. For a frame whose index is already in
        ascending order (such as one freshly read from CSV) this selects the
        same rows as ordering by index label. Compared with the previous
        implementation, an empty frame no longer raises, an existing column
        named "index" is no longer overwritten and dropped, and the queries
        are read once instead of building a Series per compared pair.
    """
    if query_column not in df_filtered.columns:
        raise ValueError(f"Column '{query_column}' not found in DataFrame")

    queries = df_filtered[query_column].tolist()
    removed_positions = set()
    kept_positions = []

    for pos in tqdm(range(len(queries) - 1, -1, -1)):
        if pos in removed_positions:
            continue

        current_query = queries[pos]

        # Only earlier rows need checking: every later row has already been
        # either kept or removed, and neither outcome can change any more.
        for earlier in range(pos - 1, -1, -1):
            if earlier in removed_positions:
                continue
            if similarity_func(current_query, queries[earlier]) >= similarity_threshold:
                removed_positions.add(earlier)  # drop the similar, earlier query

        kept_positions.append(pos)

    return df_filtered.iloc[sorted(kept_positions)].copy()

# Example usage: character-bigram de-duplication with a 0.7 threshold.
# (n_gram_similarity already defaults to n=2, so it is passed directly.)
# A word-level alternative would pass similarity_func=jaccard_similarity.
df = pd.DataFrame(df_filtered)
df_filtered_ngram = deduplicate_queries(
    df,
    query_column="query",
    similarity_threshold=0.7,
    similarity_func=n_gram_similarity,
)

In [None]:
# flag = 1 marks queries that do not contain the literal text "Question";
# flag = 0 marks queries that do.
flag = [0 if "Question" in q else 1 for q in df_filtered['query']]
df_filtered['flag'] = flag

# Take independent copies of the two subsets: later cells assign new columns
# on them, which on a plain slice of df_filtered triggers pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
df_filtered0 = df_filtered[df_filtered['flag']==0].copy()

df_filtered1 = df_filtered[df_filtered['flag']==1].copy()

In [None]:
import openai
import os
# Read the key from the same environment variable the filter step sets up,
# falling back to the placeholder, so the key is configured in one place and
# does not have to be pasted into this cell as well.
api_key = os.environ.get("OPENAI_API_SECRET_KEY", "your openai api key here")

client_openai = openai.OpenAI(api_key=api_key)
def merge_questions_with_gpt4o(questions: str) -> str:
    """Ask GPT-4o to merge a series of decomposed questions into one question.

    The request goes through the module-level ``client_openai`` client. The
    merged question is printed and then returned.

    Parameters:
        questions (str): A string containing a series of decomposed questions.

    Returns:
        str: The content of the first choice in the model's reply.
    """
    system_prompt = '''i will give you a series of questions, these questions are a list of decomposed questions for the original question, 
    please help me merge all these questions into the original question. Do not mentioned any of the answers in the output. The fist pirorty is to make sure the answer to your questions is the only and exact the answer for the last question. Only return the merged original question.'''

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": questions},
    ]
    response = client_openai.chat.completions.create(model="gpt-4o", messages=conversation)

    merged_question = response.choices[0].message.content
    print(merged_question)
    return merged_question

In [None]:
df_filtered1['text'] = df_filtered1['conver'].apply(lambda x: merge_questions_with_gpt4o(x))

In [None]:
df_filtered1["query"] = df_filtered1["query"] + "\nQuestion: " + df_filtered1["text"] + "\nAnswer:"

In [None]:
df_filtered1 = df_filtered1[['query', 'answer', 'text', 'conver','flag']]

In [None]:
df_filtered0['text'] = df_filtered0['query'].apply(lambda x: x.split('Question')[1].split('Answer')[0].replace(':','').replace(' ','').replace('\n',''))

In [None]:
df_filtered00 = df_filtered0.drop_duplicates(subset=['text', 'answer'], keep='last')

In [None]:
df_filtered = pd.concat([df_filtered00,df_filtered1])
df_filtered = df_filtered.drop('flag',axis=1)
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
maps = {}
for i in df_filtered00['query']:
    maps[i.replace('\t', '')] = i.replace('\t', ' ')

In [None]:
df_filtered['query'] = df_filtered['query'].apply(lambda x:x.replace('\t', ' '))

In [None]:
# Rename by column name rather than by position, so a different column order
# or an extra column cannot silently attach the wrong label to the data.
# 'conver' keeps its name.
df_filtered = df_filtered.rename(columns={
    'query': 'Open-ended Verifiable Question',
    'answer': 'Ground-True Answer',
    'text': 'question',
})

In [None]:
df_filtered['question'] = df_filtered['Open-ended Verifiable Question'].apply(lambda x: x.split('Question')[1].split('Answer')[0].replace(':',''))

In [None]:
df_filtered.to_csv('conv_and_filtered_remove_duplicated.csv', index=False)

In [None]:
import pandas as pd

# Approximate each query's token count by splitting on whitespace
token_counts = df_filtered["Open-ended Verifiable Question"].apply(lambda x: len(str(x).split()))
df_filtered["token_count"] = token_counts

# Keep rows with at most 10000 tokens, then drop the helper column from the result
within_limit = df_filtered["token_count"] <= 10000
df_filtered1 = df_filtered[within_limit].reset_index(drop=True)
df_filtered1 = df_filtered1.drop(columns=["token_count"])

In [None]:
df_filtered1.to_csv('conv_and_filtered_remove_duplicated_reomve_long.csv', index=False)

In [None]:
df_filtered1

# upload repo

In [None]:
# Load the previously published reasoning dataset
dataset = load_dataset("XXXX/Fino1_Reasoning_FinQA")

# Convert the wanted split to a pandas DataFrame
# (adjust the split name, e.g. 'train', 'test' or 'validation', as needed)
df_fino1_reasoning = dataset['train'].to_pandas()

# covered = 1 when the exact question text already appears in that dataset, else 0
already_published = df_filtered1['Open-ended Verifiable Question'].isin(
    df_fino1_reasoning['Open-ended Verifiable Question']
)
df_filtered1['covered'] = already_published.astype(int)

In [None]:
import pandas as pd

# Start with flag = 0 on every row
df_filtered1["flag"] = 0

# Among the rows with covered == 0, randomly pick up to 1000. Capping the
# sample size at the number of available rows avoids the ValueError that
# sample() raises when asked for more rows than exist.
uncovered = df_filtered1[df_filtered1["covered"] == 0]
sample_size = min(1000, len(uncovered))
random_indices = uncovered.sample(n=sample_size, random_state=42).index

# Mark the sampled rows with flag = 1
df_filtered1.loc[random_indices, "flag"] = 1

# New frame containing only the rows with flag = 1
df_filtered1_random = df_filtered1[df_filtered1["flag"] == 1]

In [None]:
df_filtered1_1 = df_filtered1[df_filtered1.covered==0]

In [None]:
df_filtered1_2 = df_filtered1_1[df_filtered1_1.flag==0]

In [None]:
from huggingface_hub import HfApi

from datasets import Dataset, DatasetDict

# Target dataset repository (replace with your own namespace)
repo_name = 'XXXX/reasoning_data'
dataset_repo_id = repo_name
parquet_filename = "train.parquet"

# One API client is used for both creating the repository and uploading to it
api = HfApi()

# Create the dataset repository; exist_ok=True is passed so an existing repo is accepted
api.create_repo(repo_id=dataset_repo_id, repo_type="dataset", exist_ok=True)
print(f"数据集仓库 {dataset_repo_id} 创建成功！")

# Write the selected rows to a local Parquet file
df_filtered1_2.to_parquet(parquet_filename, engine="pyarrow")

# Upload the Parquet file under train/ in the repository
api.upload_file(
    path_or_fileobj=parquet_filename,
    path_in_repo="train/train.parquet",
    repo_id=dataset_repo_id,
    repo_type="dataset",
)
print("Parquet 文件上传成功！")


In [None]:
import json

# Input files to merge, and the output file
first_path = "raw_reasoning_data_v2_3_CoT_search_472_1.json"
second_path = "raw_reasoning_data_v2_3_CoT_search_481_2.json"

# Read the first file
with open(first_path, "r", encoding="utf-8") as f1:
    data1 = json.load(f1)

# Read the second file
with open(second_path, "r", encoding="utf-8") as f2:
    data2 = json.load(f2)

# Concatenate the two loaded sequences, first file first
merged_data = data1 + data2

# Save the result as a new, indented JSON file without ASCII-escaping
with open("merged_reasoning_data.json", "w", encoding="utf-8") as fout:
    json.dump(merged_data, fout, indent=2, ensure_ascii=False)

print("✅ 合并完成，保存为 merged_reasoning_data.json")

In [None]:
from datasets import load_dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder, upload_folder
import os

# Step 1: 加载原始数据集
dataset = load_dataset("xxxxxx/XXXX_reasoning")

# Step 2: add the columns "query" and "answer", copied from
# "Open-ended Verifiable Question" and "Ground-True Answer"
def add_answer_column(example):
    """Copy the question and ground-truth fields into 'query' and 'answer'.

    The passed-in example is updated in place and also returned.
    """
    copies = (
        ("query", "Open-ended Verifiable Question"),
        ("answer", "Ground-True Answer"),
    )
    for target_key, source_key in copies:
        example[target_key] = example[source_key]
    return example

dataset = dataset.map(add_answer_column)

# Step 3: write every split to its own Parquet file in a local folder
save_dir = "reasoning_path_v2_all_parquet"
os.makedirs(save_dir, exist_ok=True)

for split, split_data in dataset.items():
    split_data.to_parquet(os.path.join(save_dir, f"{split}.parquet"))

# Step 4: upload the folder to the Hugging Face Hub (private dataset repo)
repo_id = "XXXXXX/reasoning_path_v2"

# Create the private repo (skipped if it already exists)
api = HfApi()
api.create_repo(repo_id=repo_id, private=True, repo_type="dataset", exist_ok=True)

# Upload the whole folder
upload_folder(repo_id=repo_id, folder_path=save_dir, repo_type="dataset")

print(f"✅ 数据集已成功上传为 Parquet 格式到：https://huggingface.co/datasets/{repo_id}")

# process data

In [None]:
import pandas as pd
df = pd.read_json('formated_filtered_samples_data_filter_2025-03-13T18-15-17.353081.json', lines=True)

In [None]:
from datasets import load_dataset
df_5000 = pd.DataFrame(load_dataset('XXXXXX/Fino1_Reasoning_FinQA')['train'])

In [None]:
df_all = pd.concat([df_5000,df[['Open-ended Verifiable Question', 'Ground-True Answer', 'Complex_CoT', 'Response']]],axis=0)

In [None]:
df_all = df_all.reset_index(drop=True)

In [None]:
df_all.to_json("combined_reasoning.json", orient="records", lines=True)