In [3]:
import os
import json
import chardet
import random
from tqdm import tqdm

def has_required_keys(json_obj, required_keys):
    """Return True when ``json_obj`` is a JSON object containing every required key.

    Args:
        json_obj: A value decoded from JSON. Only ``dict`` values (JSON
            objects) can satisfy the check.
        required_keys: Iterable of keys that must all be present.

    Returns:
        bool: True if ``json_obj`` is a dict and contains every key in
        ``required_keys``; False otherwise.
    """
    # A JSONL line may decode to a string, list, number or null. Using ``in``
    # on those would do substring/element matching or raise TypeError, so
    # anything that is not a JSON object is rejected outright.
    if not isinstance(json_obj, dict):
        return False
    return all(key in json_obj for key in required_keys)

def detect_encoding(file_path):
    """Return the name of a text encoding suitable for reading ``file_path``.

    The file is first validated as strict UTF-8. If every byte decodes, the
    result is ``'utf-8-sig'`` when the file starts with a byte-order mark and
    ``'utf-8'`` otherwise (this includes empty and pure-ASCII files). Only
    when UTF-8 decoding fails is the whole file handed to ``chardet``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name. On the chardet fallback path this is whatever
        ``chardet.detect`` reports, which may be None if it cannot decide.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Fast path: a strict decode is far cheaper than statistical detection
    # and cannot mislabel valid UTF-8 as a legacy single-byte code page.
    try:
        with open(file_path, 'r', encoding='utf-8', newline='') as text_file:
            has_bom = text_file.read(1) == '\ufeff'
            # Decode the remainder in chunks so large files are never held
            # in memory all at once; the decoded text itself is discarded.
            while text_file.read(1 << 20):
                pass
        return 'utf-8-sig' if has_bom else 'utf-8'
    except UnicodeDecodeError:
        pass

    # Fallback: not valid UTF-8, so let chardet guess from the raw bytes.
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

def merge_and_shuffle_jsonl_files(input_folder, output_file, required_keys, max_entries, seed=None):
    """Merge every ``.jsonl`` file under ``input_folder`` into one shuffled JSONL file.

    Each file is opened with the encoding reported by ``detect_encoding`` and
    parsed line by line. Blank lines are skipped; lines that are not valid
    JSON are printed and skipped. If reading a file fails part-way (for
    example on a decoding error) the error is printed, the records already
    parsed from that file are kept, and processing continues with the next
    file. All collected records are shuffled and written to ``output_file``
    as UTF-8, one JSON document per line.

    Args:
        input_folder: Directory that is walked recursively for ``.jsonl`` files.
        output_file: Path of the merged file. Its parent directory is created
            if it does not exist.
        required_keys: Accepted for interface compatibility; key filtering is
            currently disabled, so this value is not used.
        max_entries: Accepted for interface compatibility; truncation is
            currently disabled, so this value is not used.
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` generator is used.

    Raises:
        OSError: If the output directory or file cannot be created or written.
    """
    # Create the destination directory first so that a missing folder is
    # reported immediately instead of after the (potentially very long)
    # read phase has finished.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Collect every .jsonl path below input_folder. Sorting makes the read
    # order independent of the filesystem, which a seeded shuffle relies on.
    jsonl_files = sorted(
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(input_folder)
        for file_name in file_names
        if file_name.endswith('.jsonl')
    )

    # All parsed records are held in memory until they are written out.
    data = []

    # tqdm shows per-file progress.
    for file_path in tqdm(jsonl_files, desc="Processing JSONL files"):
        try:
            encoding = detect_encoding(file_path)
            with open(file_path, 'r', encoding=encoding) as infile:
                for line in infile:
                    stripped = line.strip()
                    if not stripped:
                        # Blank separator lines are not malformed records.
                        continue
                    try:
                        # NOTE: filtering with has_required_keys(...) is
                        # intentionally disabled; every record is kept.
                        data.append(json.loads(stripped))
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON from file {file_path}: {stripped}")
        except Exception as e:
            # Best effort: report the failing file and move on to the next one.
            print(f"Error reading file {file_path}: {e}")

    # Shuffle the records in place.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(data)

    # NOTE: truncating to max_entries is intentionally disabled.

    # Write the merged records to the output file.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for entry in data:
            outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')

# Example usage
input_folder = "dataset/02. Ko-MMLU"  # folder to walk for .jsonl files
output_file = "dataset_merge/merged_output_Ko-MMLU.jsonl"  # file the merged result is saved to
required_keys = [  # keys to check for
    "instruction",
    "input",
    "output",
    "system",
    "text",
]
max_entries = 50_000  # maximum number of entries to save

# Function call (made in the next cell)


In [4]:
# Run the merge with the configuration defined in the previous cell.
merge_and_shuffle_jsonl_files(
    input_folder=input_folder,
    output_file=output_file,
    required_keys=required_keys,
    max_entries=max_entries,
)

Processing JSONL files: 100%|██████████| 15/15 [26:49<00:00, 107.28s/it]


In [3]:
# merge_jsonl_files(input_folder, output_file, required_keys)

Processing JSONL files:  70%|██████▉   | 46/66 [29:08<1:47:37, 322.89s/it]

Error reading file dataset/06. Ko-GSM8k/01. 숫자 연산 기계 독해/descriptive/01.경제/Training/경제.jsonl: 'charmap' codec can't decode byte 0x98 in position 110: character maps to <undefined>


Processing JSONL files:  76%|███████▌  | 50/66 [34:05<44:05, 165.37s/it]  

Error reading file dataset/02. Ko-MMLU/04. 뉴스기사 기계독해/descriptive/span_inference/Training/span_inference.jsonl: 'utf-8' codec can't decode byte 0xec in position 188: invalid continuation byte


Processing JSONL files:  79%|███████▉  | 52/66 [44:11<1:01:06, 261.89s/it]

Error reading file dataset/02. Ko-MMLU/04. 뉴스기사 기계독해/descriptive/text_entailment/Training/text_entailment.jsonl: 'utf-8' codec can't decode byte 0xb0 in position 4278: invalid start byte


Processing JSONL files:  82%|████████▏ | 54/66 [1:17:56<2:21:32, 707.74s/it]

Error reading file dataset/02. Ko-MMLU/04. 뉴스기사 기계독해/descriptive/span_extraction/Training/span_extraction.jsonl: 'charmap' codec can't decode byte 0x98 in position 43: character maps to <undefined>


Processing JSONL files: 100%|██████████| 66/66 [1:39:33<00:00, 90.51s/it]   


: 

In [14]:
import json
from tqdm import tqdm

def extract_first_n_items(input_file, output_file, n):
    """Copy the first ``n`` valid JSON records of a JSONL file into a new file.

    The input is read as UTF-8, line by line. Blank lines are skipped, and
    lines that are not valid JSON are printed and skipped; neither counts
    towards ``n``. Each accepted record is re-serialised with
    ``ensure_ascii=False`` and written on its own line. Reading stops once
    ``n`` records have been written or the input is exhausted, so the output
    may hold fewer than ``n`` records.

    Args:
        input_file: Path of the source JSONL file.
        output_file: Path of the file to create (overwritten if it exists).
        n: Maximum number of records to write.
    """
    count = 0
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile, \
            tqdm(total=n, desc=f"Extracting {n} items") as progress:
        for line in infile:
            if count >= n:
                break
            stripped = line.strip()
            if not stripped:
                # Blank separator lines are not malformed records.
                continue
            try:
                json_obj = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Error decoding JSON from line: {stripped}")
                continue
            json.dump(json_obj, outfile, ensure_ascii=False)
            outfile.write('\n')
            count += 1
            # Advance once per record written so the bar matches total=n
            # even when some input lines are skipped.
            progress.update(1)

# Input and output file paths for the extraction step.
# NOTE(review): this rebinds the notebook-level `output_file` that the merge
# configuration cell also assigns.
input_file = "dataset_merge/merged_output_50000.jsonl"
output_file = "dataset_merge/merged_output_30000.jsonl"
num_items_to_extract = 30_000

# Function call
extract_first_n_items(
    input_file=input_file,
    output_file=output_file,
    n=num_items_to_extract,
)


Extracting 30000 items: 100%|██████████| 30000/30000 [00:01<00:00, 25232.95it/s]


: 