# CosmosQA

In [2]:
# 转换为 ChatML 格式
import os
import shutil
import json
import pandas as pd
input_dir = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/cosmosqa/data"
output_dir = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/processed/cosmosqa"

# Start from an empty output directory so files from earlier runs never linger.
# NOTE: this removes everything under output_dir, including files written there
# by other cells, so those cells have to be re-run afterwards.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)


# Convert every CSV split into a list of ChatML-style conversations.
for dataset_name in ["train.csv", "valid.csv"]:
    input_path = os.path.join(input_dir, dataset_name)
    output_path = os.path.join(output_dir, dataset_name.replace('.csv', '.json'))

    # Load the split and drop rows with any missing field. dropna() returns a
    # new frame, so re-running the cell never depends on a mutated object.
    data = pd.read_csv(input_path).dropna()

    data_out_list = []
    for _, row in data.iterrows():
        input_text = f"This is a reading comprehension task. Given the context: {row['context']} Question: {row['question']} " \
                     f"Which of the following is the correct answer related to the context? 0: {row['answer0']} 1: {row['answer1']} " \
                     f"2: {row['answer2']} 3: {row['answer3']}"
        output_text = str(row['label'])  # keep the numeric label, as text
        data_out_list.append({
            "messages": [
                {
                    "role": "user",
                    "content": input_text,
                },
                {
                    "role": "assistant",
                    "content": output_text,
                },
            ]
        })

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of relying on the locale's default
    # encoding (which may be unable to encode them).
    with open(output_path, 'w', encoding='utf-8') as fo:
        json.dump(data_out_list, fo, ensure_ascii=False, indent=4)

    print(f"{dataset_name} processed and saved.")

train.csv processed and saved.
valid.csv processed and saved.


In [6]:
import pandas as pd
import json

# 读取JSONL文件
def read_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Stream line by line instead of materialising the whole file first.
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline) are not records.
                continue
            records.append(json.loads(line))
    return records

# 转换为DataFrame
def create_dataframe(data):
    """Wrap the given records in a pandas DataFrame and return it."""
    return pd.DataFrame(data)

# 格式化query
def format_query(row):
    """Render one record as a single-turn prompt string.

    The record is indexed by the keys 'context', 'question' and
    'answer0'..'answer3'. The returned text starts with the "<用户>" marker
    and ends with the "<AI>" marker.
    """
    context, question = row['context'], row['question']
    options = [row[f'answer{i}'] for i in range(4)]
    prompt = (
        f"<用户>This is a reading comprehension task. Given the context: {context} Question: {question} "
        f"Which of the following is the correct answer related to the context? Please Only answer the question with a single number. 0: {options[0]} 1: {options[1]} "
        f"2: {options[2]} 3: {options[3]}<AI>"
    )
    return prompt

# Load the raw test split (one JSON record per line).
file_path = '/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/cosmosqa/data/test.jsonl'  # change to the path of your JSONL file
data = read_jsonl(file_path)

# Build the list of prompt strings (format_query returns plain text, not a dict).
query_list = [format_query(row) for row in data]

# Save the prompts as a JSON array of strings.
json_file_path = '/home/mnt/wyx/src/Finetune-MiniCPM/datasets/processed/cosmosqa/test.json'  # path of the output JSON file
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(query_list, json_file, ensure_ascii=False, indent=4)
print("JSON file has been created successfully.")

JSON file has been created successfully.


# Trivia QA

In [19]:
import os

# Directory holding the TriviaQA QA files.
directory_path = '/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa'

# List every entry in that directory.
files = os.listdir(directory_path)

# Keep only the names that contain neither 'without' nor 'verified'.
filtered_files = [file for file in files if 'without' not in file and 'verified' not in file]

# Show the selected file names.
print(filtered_files)

['web-train.json', 'wikipedia-dev.json', 'web-dev.json', 'wikipedia-train.json']


In [20]:
import json
import os
from tqdm import tqdm  # 引入tqdm

# 定义读取和处理每个文件的函数
def process_web_json_file(file_path, evidence_path,
                          output_path='/home/mnt/wyx/src/Finetune-MiniCPM/datasets/processed/triviaqa-rc'):
    """Convert a TriviaQA web QA file into ChatML-style conversations.

    For every item under the top-level "Data" key, the text of each file
    referenced by its "SearchResults" is appended to a prompt as
    "Reading Material <index>: <text>". The result is written to
    `<output_path>/web-train.json` when the QA file name contains 'train',
    otherwise to `<output_path>/web-dev.json`.

    Args:
        file_path: Path of the QA JSON file.
        evidence_path: Directory that the "Filename" values are relative to.
        output_path: Directory for the converted file; created if missing.
            Defaults to the location the notebook has always written to.

    Raises:
        KeyError: If an entry lacks "Answer"/"Value", "Question", or a search
            result lacks "Filename".
        FileNotFoundError: If a referenced evidence file does not exist.
    """
    # Read the QA file up front so its handle is closed before the (long)
    # evidence loop starts, and so the evidence handle cannot shadow it.
    with open(file_path, 'r', encoding='utf-8') as qa_file:
        data = json.load(qa_file)

    data_out_list = []

    # tqdm shows progress over the QA entries.
    for entry in tqdm(data.get("Data", []), desc="Processing entries"):
        input_text = "This is a reading comprehension task. Given the context:\n"
        output_text = entry["Answer"]["Value"]
        question = entry["Question"]

        for index, result in enumerate(entry.get("SearchResults", [])):
            e_path = os.path.join(evidence_path, result["Filename"])
            with open(e_path, 'r', encoding='utf-8') as evidence_file:
                content = evidence_file.read()
            input_text += f"Reading Material {index}: {content}\n"

        data_out_list.append({
            "messages": [
                {
                    "role": "user",
                    "content": input_text,
                    "question": question,
                },
                {
                    "role": "assistant",
                    "content": output_text,
                },
            ]
        })

    # Decide the split from the file name only; a parent directory that happens
    # to contain 'train' must not turn a dev file into 'web-train'.
    dataset_name = 'web-train' if 'train' in os.path.basename(file_path) else 'web-dev'
    os.makedirs(output_path, exist_ok=True)
    output_file_path = os.path.join(output_path, f'{dataset_name}.json')

    # Save the converted conversations as JSON.
    with open(output_file_path, 'w', encoding='utf-8') as fo:
        json.dump(data_out_list, fo, ensure_ascii=False, indent=4)

    print(f"{dataset_name} processed and saved.")



In [9]:
import json
import os
from tqdm import tqdm  # 引入tqdm
import sys
def adjust_filename_lower(filename):
    """Return a case-adjusted variant of `filename` for a retry lookup.

    - If the name contains an underscore, the first character after the LAST
      underscore is lower-cased; everything else is left untouched.
    - If the name ends with an underscore there is nothing to adjust and the
      name is returned unchanged (previously this case returned None).
    - If there is no underscore, the first character is upper-cased and the
      rest is lower-cased. An empty string is returned as is.
    """
    head, sep, tail = filename.rpartition('_')
    if not sep:
        # No underscore: capitalise the first character, lower-case the rest.
        return filename[0].upper() + filename[1:].lower() if filename else filename
    if not tail:
        # Trailing underscore: no character to change.
        return filename
    # Only the first character after the last underscore changes.
    return f"{head}_{tail[0].lower()}{tail[1:]}"

def adjust_filename_upper(filename):
    """Return a case-adjusted variant of `filename` for a retry lookup.

    - If the name contains an underscore, the first character after the LAST
      underscore is UPPER-cased; everything else is left untouched.
    - If the name ends with an underscore there is nothing to adjust and the
      name is returned unchanged (previously this case returned None).
    - If there is no underscore, the first character is upper-cased and the
      rest is lower-cased. An empty string is returned as is.
    """
    head, sep, tail = filename.rpartition('_')
    if not sep:
        # No underscore: capitalise the first character, lower-case the rest.
        return filename[0].upper() + filename[1:].lower() if filename else filename
    if not tail:
        # Trailing underscore: no character to change.
        return filename
    # Only the first character after the last underscore changes.
    return f"{head}_{tail[0].upper()}{tail[1:]}"

# 定义读取和处理每个文件的函数
def process_wikipedia_json_file(file_path, evidence_path,
                                output_path='/home/mnt/wyx/src/Finetune-MiniCPM/datasets/processed/triviaqa-rc'):
    """Convert a TriviaQA Wikipedia QA file into ChatML-style conversations.

    For every item under the top-level "Data" key, the text of each file
    referenced by its "EntityPages" is appended to a prompt as
    "Reading Material <index>: <text>". Evidence files are looked up by the
    recorded name first and then by the variants produced by
    `adjust_filename_lower` and `adjust_filename_upper`. The result is written
    to `<output_path>/wikipedia-train.json` when the QA file name contains
    'train', otherwise to `<output_path>/wikipedia-dev.json`.

    Args:
        file_path: Path of the QA JSON file.
        evidence_path: Directory that the "Filename" values are relative to.
        output_path: Directory for the converted file; created if missing.
            Defaults to the location the notebook has always written to.

    Raises:
        KeyError: If an entry lacks "Answer"/"Value", "Question", or a page
            lacks "Filename".
        FileNotFoundError: If no name variant of an evidence file exists.
    """
    # Read the QA file up front so its handle is closed before the (long)
    # evidence loop starts.
    with open(file_path, 'r', encoding='utf-8') as qa_file:
        data = json.load(qa_file)

    data_out_list = []

    # tqdm shows progress over the QA entries.
    for entry in tqdm(data.get("Data", []), desc="Processing entries"):
        input_text = "This is a reading comprehension task. Given the context:\n"
        output_text = entry["Answer"]["Value"]
        question = entry["Question"]

        for index, result in enumerate(entry.get("EntityPages", [])):
            content = _read_wikipedia_evidence(evidence_path, result["Filename"])
            input_text += f"Reading Material {index}: {content}\n"

        data_out_list.append({
            "messages": [
                {
                    "role": "user",
                    "content": input_text,
                    "question": question,
                },
                {
                    "role": "assistant",
                    "content": output_text,
                },
            ]
        })

    # Decide the split from the file name only; a parent directory that happens
    # to contain 'train' must not turn a dev file into 'wikipedia-train'.
    dataset_name = 'wikipedia-train' if 'train' in os.path.basename(file_path) else 'wikipedia-dev'
    os.makedirs(output_path, exist_ok=True)
    output_file_path = os.path.join(output_path, f'{dataset_name}.json')

    # Save the converted conversations as JSON.
    with open(output_file_path, 'w', encoding='utf-8') as fo:
        json.dump(data_out_list, fo, ensure_ascii=False, indent=4)

    print(f"{dataset_name} processed and saved.")


def _read_wikipedia_evidence(evidence_path, filename):
    """Return the text of one evidence file, retrying with case-adjusted names."""
    e_path = os.path.join(evidence_path, filename)
    # Name variants are produced lazily so the adjusters only run after a miss.
    candidate_makers = (
        lambda: filename,
        lambda: adjust_filename_lower(filename),
        lambda: adjust_filename_upper(filename),
    )
    candidate = filename
    for make_candidate in candidate_makers:
        adjusted = make_candidate()
        if adjusted is None:
            # An adjuster that produced nothing gives us no path to try.
            continue
        candidate = adjusted
        try:
            with open(os.path.join(evidence_path, candidate), 'r', encoding='utf-8') as evidence_file:
                return evidence_file.read()
        except FileNotFoundError:
            continue
    # Every variant failed: stop the run rather than emit a prompt that is
    # silently missing reading material.
    raise FileNotFoundError(f"Unable to find file after adjustment: {candidate} in {e_path}")



In [22]:
# Process the selected files. The list is overridden here so that only the
# Wikipedia training file is converted; `directory_path` must already be
# defined by another cell.
filtered_files = ['wikipedia-train.json']
for filename in filtered_files:
    file_path = os.path.join(directory_path, filename)
    print(file_path)
    # if 'web' in file_path:
    #     evidence_path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/evidence/web"
    #     process_web_json_file(file_path,evidence_path)
    # NOTE: the substring test runs against the full path, not just the file name.
    if 'wikipedia' in file_path:
        evidence_path = '/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/evidence/wikipedia'
        process_wikipedia_json_file(file_path,evidence_path)
    

/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa/wikipedia-train.json


Processing entries: 100%|██████████| 61888/61888 [06:40<00:00, 154.41it/s]


wikipedia-train processed and saved.


In [23]:
# filtered_files = ['web-train.json', 'web-dev.json']
filtered_files = ['web-train.json']
# Process the selected files: only the web training file is converted here.
# `directory_path` must already be defined by another cell.
for filename in filtered_files:
    file_path = os.path.join(directory_path, filename)
    print(file_path)
    # NOTE: the substring test runs against the full path, not just the file name.
    if 'web' in file_path:
        evidence_path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/evidence/web"
        process_web_json_file(file_path,evidence_path)
    # if 'wikipedia' in file_path:
    #     evidence_path = '/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/evidence/wikipedia'
    #     process_wikipedia_json_file(file_path,evidence_path)
    

/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa/web-train.json


Processing entries: 100%|██████████| 76496/76496 [18:35<00:00, 68.57it/s]  


web-train processed and saved.


In [14]:
import os

# Directory holding the TriviaQA QA files.
directory_path = '/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa'

# List every entry in that directory.
files = os.listdir(directory_path)

# Keep only the names that contain both 'dev' and 'verified'.
filtered_files = [file for file in files if 'dev' in file and  'verified' in file]

# Show the selected file names.
print(filtered_files)

['verified-web-dev.json', 'verified-wikipedia-dev.json']


In [29]:
from utils import dataset_utils
from utils import utils
import json
from tqdm import tqdm
import sys
def process(file_path, processed_path, evidence_path):
    """Build evaluation prompts from a verified TriviaQA QA file.

    NOTE: a later cell defines another function that is also named `process`;
    whichever definition was executed last is the one a call resolves to.

    The behaviour depends on the file's top-level "Domain":

    * 'Wikipedia': one record per question, labelled with the question id,
      whose prompt concatenates every EntityPages/SearchResults document
      (read from `<evidence_path>/wikipedia`).
    * 'Web': one record per (question, document) pair, labelled
      "<question id>--<filename>". SearchResults are read from
      `<evidence_path>/web`; EntityPages are looked up under
      `<evidence_path>/web` first and then `<evidence_path>/wikipedia`.
      Documents that cannot be found are skipped.

    Every lookup tries the recorded file name and then the variants produced by
    `adjust_filename_lower` / `adjust_filename_upper` inside the same
    sub-directory. The records are written to
    `<processed_path>/verified-wikipedia-dev.json` or
    `<processed_path>/verified-web-dev.json`.

    Args:
        file_path: Path of the QA JSON file.
        processed_path: Output directory; created if missing.
        evidence_path: Root directory containing the 'web' and 'wikipedia'
            evidence sub-directories.

    Raises:
        FileNotFoundError: In the 'Wikipedia' domain, if a document cannot be
            found under any name variant.
    """
    with open(file_path, 'r', encoding='utf-8') as qa_file:
        data = json.load(qa_file)

    domain = data['Domain']
    data_out_list = []

    for entry in tqdm(data.get("Data", []), desc="Processing entries"):
        question_id = entry["QuestionId"]
        question = f"This is a reading comprehension task. After reading the given content, answer me the question: \n{entry['Question']} \n"

        if domain == 'Wikipedia':
            input_content = ''
            pages = entry.get('EntityPages', []) + entry.get('SearchResults', [])
            for index, page in enumerate(pages):
                filename = page.get("Filename")
                content = _read_evidence(evidence_path, ('wikipedia',), filename)
                if content is None:
                    raise FileNotFoundError(
                        f"Evidence file not found for question {question_id}: {filename}")
                input_content += f"Reading Material {index}: {content}\n"
            data_out_list.append({
                "question_label": question_id,
                "question": question + '\n' + input_content,
            })

        elif domain == 'Web':
            # Same handling for both page lists; only the directories differ.
            # The 'wikipedia' fallback for EntityPages assumes those documents
            # ship in the Wikipedia evidence directory -- TODO confirm against
            # the dataset layout.
            page_groups = (
                (entry.get('SearchResults', []), ('web',)),
                (entry.get('EntityPages', []), ('web', 'wikipedia')),
            )
            for pages, subdirs in page_groups:
                for index, page in enumerate(pages):
                    filename = page.get("Filename")
                    content = _read_evidence(evidence_path, subdirs, filename)
                    if content is None:
                        # Best effort: a missing document yields no record.
                        continue
                    content = f"Reading Material {index}: {content}\n"
                    data_out_list.append({
                        "question_label": '{}--{}'.format(question_id, filename),
                        "question": question + '\n' + content,
                    })

    dataset_name = 'verified-wikipedia-dev' if domain == 'Wikipedia' else 'verified-web-dev'
    os.makedirs(processed_path, exist_ok=True)
    output_file_path = os.path.join(processed_path, f'{dataset_name}.json')

    # Save the prompts as JSON.
    with open(output_file_path, 'w', encoding='utf-8') as fo:
        json.dump(data_out_list, fo, ensure_ascii=False, indent=4)

    print(f"{dataset_name} processed and saved.")


def _read_evidence(evidence_path, subdirs, filename):
    """Return the text of an evidence file, or None if no candidate path exists."""
    if not filename:
        return None
    adjusters = (lambda name: name, adjust_filename_lower, adjust_filename_upper)
    for subdir in subdirs:
        for adjust in adjusters:
            candidate = adjust(filename)
            if candidate is None:
                continue
            # Retries stay inside the same sub-directory as the first attempt;
            # dropping it would make every retry miss.
            candidate_path = os.path.join(evidence_path, subdir, candidate)
            try:
                with open(candidate_path, 'r', encoding='utf-8') as evidence_file:
                    return evidence_file.read()
            except FileNotFoundError:
                continue
    return None


In [30]:
# Directory of the QA files, output directory, and root of the evidence files.
path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa"
p_path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/processed/triviaqa-rc"
e_path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/evidence"
# `filtered_files` is not set in this cell; it must already hold the QA file
# names to convert (assigned by another cell).
for p in (filtered_files):
    file_path = os.path.join(path,p)
    print(file_path)
    process(file_path, p_path,e_path)

/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa/verified-web-dev.json


Processing entries:   0%|          | 0/407 [00:00<?, ?it/s]

Processing entries: 100%|██████████| 407/407 [00:00<00:00, 17718.98it/s]


verified-web-dev processed and saved.
/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa/verified-wikipedia-dev.json


Processing entries: 100%|██████████| 318/318 [00:00<00:00, 6428.49it/s]


verified-wikipedia-dev processed and saved.


In [35]:
from utils import dataset_utils
from utils import utils
import json
from tqdm import tqdm
import sys
def process(file_path, processed_path, evidence_path):
    """Write the ground-truth answers of a verified TriviaQA QA file as a lookup table.

    NOTE: this reuses the name `process`, which an earlier cell also defines;
    running this cell replaces that earlier definition in the kernel.

    The output is a JSON object mapping a label to the entry's
    `Answer["Value"]`:

    * 'Wikipedia' domain: the label is the question id.
    * 'Web' domain: one label "<question id>--<filename>" per document listed
      in "SearchResults" and then "EntityPages".

    It is saved as `<processed_path>/verified-wikipedia-result.json` or
    `<processed_path>/verified-web-result.json`.

    Args:
        file_path: Path of the QA JSON file.
        processed_path: Output directory; created if missing.
        evidence_path: Unused; kept so the call signature matches the
            prompt-building `process`.
    """
    with open(file_path, 'r', encoding='utf-8') as qa_file:
        data = json.load(qa_file)

    domain = data['Domain']
    answers_by_label = {}

    for entry in tqdm(data.get("Data", []), desc="Processing entries"):
        question_id = entry["QuestionId"]
        answer = entry['Answer']['Value']

        if domain == 'Wikipedia':
            answers_by_label[question_id] = answer

        elif domain == 'Web':
            # SearchResults first, then EntityPages, so keys keep their
            # original insertion order.
            pages = entry.get('SearchResults', []) + entry.get('EntityPages', [])
            for page in pages:
                question_label = '{}--{}'.format(question_id, page.get("Filename"))
                answers_by_label[question_label] = answer

    dataset_name = 'verified-wikipedia-result' if domain == 'Wikipedia' else 'verified-web-result'
    os.makedirs(processed_path, exist_ok=True)
    output_file_path = os.path.join(processed_path, f'{dataset_name}.json')

    # Save the lookup table as JSON.
    with open(output_file_path, 'w', encoding='utf-8') as fo:
        json.dump(answers_by_label, fo, ensure_ascii=False, indent=4)

    print(f"{dataset_name} processed and saved.")


In [37]:
# Directory of the QA files, output directory, and root of the evidence files.
path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa"
p_path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/processed/triviaqa-rc"
e_path = "/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/evidence"
# `filtered_files` is not set in this cell; it must already hold the QA file
# names to convert (assigned by another cell). `process` resolves to whichever
# definition of that name was executed most recently in the kernel.
for p in (filtered_files):
    file_path = os.path.join(path,p)
    print(file_path)
    process(file_path, p_path,e_path)

/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa/verified-web-dev.json


Processing entries: 100%|██████████| 407/407 [00:00<00:00, 405964.74it/s]


verified-web-result processed and saved.
/home/mnt/wyx/src/Finetune-MiniCPM/datasets/raw/triviaqa-rc/qa/verified-wikipedia-dev.json


Processing entries: 100%|██████████| 318/318 [00:00<00:00, 853351.68it/s]

verified-wikipedia-result processed and saved.





{'tc_1348': {'Aliases': ['Baby Buggy', 'Baby buggy'], 'HumanAnswers': ['lightweight baby buggy with a collapsible support assembly'], 'MatchedWikiEntityName': 'Baby Buggy', 'NormalizedAliases': ['baby buggy'], 'NormalizedMatchedWikiEntityName': 'baby buggy', 'NormalizedValue': 'baby buggy', 'Type': 'WikipediaEntity', 'Value': 'Baby Buggy'}, 'tc_2090': {'Aliases': ['Iwerks, Ub', 'Ub Iwerks', 'Ub Iwerks Studio', 'Celebrity Productions', 'Ubbe Ert Iwwerks'], 'HumanAnswers': ['Ub Iwerks'], 'MatchedWikiEntityName': 'Ub Iwerks', 'NormalizedAliases': ['celebrity productions', 'iwerks ub', 'ub iwerks', 'ubbe ert iwwerks', 'ub iwerks studio'], 'NormalizedMatchedWikiEntityName': 'ub iwerks', 'NormalizedValue': 'ub iwerks', 'Type': 'WikipediaEntity', 'Value': 'Ub Iwerks'}, 'tc_2580': {'Aliases': ['ISO 3166-1:CH', 'Svissland', 'Etymology of Switzerland', 'Confederation Helvetia', 'Swizerland', 'Confederatio Helvetica', 'Environmental Integrity Group', 'Confoederatio Helvetica', 'Svizra', 'SWITZERL

In [6]:
import json
import os
from tqdm import tqdm  # 引入tqdm

# Directory that contains the converted TriviaQA JSON files to rewrite.
file_prefix = '/home/mnt/wyx/src/Finetune-MiniCPM/datasets/processed/triviaqa-rc'

# File names (relative to file_prefix) to process, in this order.
file_names = [
    'web-dev.json', 'wikipedia-dev.json',
    'web-train.json', 'wikipedia-train.json'
]

def modify_json_structure(file_path):
    """Rewrite a converted dataset so 'question' precedes 'content' in user messages.

    Reads a JSON list of entries, each with a 'messages' list. Every message
    whose role is 'user' is rebuilt with exactly the keys 'role', 'question',
    'content' (in that order); all other messages are copied unchanged. The
    result is written next to the input with '_new' inserted before the file
    extension (e.g. 'web-dev.json' -> 'web-dev_new.json').

    Args:
        file_path: Path of the JSON file to rewrite. The input is not modified.

    Raises:
        KeyError: If a user message lacks 'question' or 'content'.
    """
    # Read the original JSON file.
    with open(file_path, 'r', encoding='utf-8') as src:
        data = json.load(src)

    new_data_list = []

    for entry in tqdm(data, desc=f"Processing {os.path.basename(file_path)}"):
        new_entry = entry.copy()  # shallow copy; 'messages' is replaced below
        new_messages = []
        for message in entry['messages']:
            if message['role'] == 'user':
                # Dict insertion order decides the key order in the output.
                new_message = {
                    "role": "user",
                    "question": message['question'],
                    "content": message['content']
                }
            else:
                new_message = message.copy()
            new_messages.append(new_message)
        new_entry['messages'] = new_messages
        new_data_list.append(new_entry)

    # Insert '_new' before the extension only. A plain str.replace would also
    # rewrite '.json' inside directory names, and would return the input path
    # itself (overwriting the source) when the name has no '.json'.
    root, ext = os.path.splitext(file_path)
    new_file_path = f"{root}_new{ext}"

    # ensure_ascii=False keeps non-ASCII text readable, matching how the other
    # cells of this notebook write their JSON files.
    with open(new_file_path, 'w', encoding='utf-8') as dst:
        json.dump(new_data_list, dst, ensure_ascii=False, indent=4)

# Rewrite every listed file; modify_json_structure saves its result to a new
# file whose name is derived from file_path.
for file_name in file_names:
    file_path = os.path.join(file_prefix, file_name)
    modify_json_structure(file_path)
    print(f'Processed and saved new file for {file_name}')


Processing web-dev.json: 100%|██████████| 9951/9951 [00:00<00:00, 83822.73it/s]


Processed and saved new file for web-dev.json


Processing wikipedia-dev.json: 100%|██████████| 7993/7993 [00:00<00:00, 473604.93it/s]


Processed and saved new file for wikipedia-dev.json


Processing web-train.json: 100%|██████████| 76496/76496 [00:00<00:00, 246535.13it/s]


Processed and saved new file for web-train.json


Processing wikipedia-train.json: 100%|██████████| 61888/61888 [00:00<00:00, 218792.43it/s]


Processed and saved new file for wikipedia-train.json
