In [10]:
import os
import json
import chardet
from tqdm import tqdm

def has_required_keys(json_obj, required_keys):
    return all(key in json_obj for key in required_keys)

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']

def calculate_average_length(texts):
    total_length = sum(len(text) for text in texts)
    return total_length / len(texts) if texts else 0

def process_jsonl_files(input_folder, required_keys):
    # 모든 파일 경로를 리스트에 저장
    jsonl_files = []
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            if file.endswith('.jsonl'):
                jsonl_files.append(os.path.join(root, file))
    
    # tqdm을 사용하여 진행 상태 표시
    for file_path in tqdm(jsonl_files, desc="Processing JSONL files"):
        try:
            encoding = detect_encoding(file_path)
            texts = []
            with open(file_path, 'r', encoding=encoding) as infile:
                for line in infile:
                    try:
                        json_obj = json.loads(line.strip())
                        if has_required_keys(json_obj, required_keys):
                            texts.append(json_obj["text"])
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON from file {file_path}: {line.strip()}")
            average_length = calculate_average_length(texts)
            print(f"Average length of 'text' in {file_path}: {average_length}")
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")

# 사용 예시
input_folder = 'dt'  # 순회할 폴더 경로를 입력하세요.
required_keys = ['text']  # 필요한 키를 입력하세요.

process_jsonl_files(input_folder, required_keys)


Processing JSONL files: 100%|██████████| 1/1 [00:07<00:00,  7.11s/it]

Average length of 'text' in dt/merged_output_test.jsonl: 119.89378994165045





In [6]:
from datasets import load_dataset

def calculate_average_length(texts):
    total_length = sum(len(text) for text in texts)
    return total_length / len(texts) if texts else 0

def get_average_text_length(dataset_name, split='train'):
    # 허깅페이스에서 데이터셋 불러오기
    dataset = load_dataset(dataset_name, split=split)

    # "text" 칼럼 추출
    inst = dataset['instruction']
    inp = dataset['input']
    out = dataset['output']

    # 평균 길이 계산
    average_inst = calculate_average_length(inst)
    average_inp = calculate_average_length(inst)
    average_out = calculate_average_length(inst)
    
    average = (average_inst + average_inp + average_out) // 3

    print(f"Average length of 'text' in {dataset_name} ({split} split): {average}")

# 사용 예시
dataset_name = 'Bingsu/ko_alpaca_data'  # 데이터셋 이름을 입력하세요 (예: 'ag_news', 'imdb', 등)
split = 'train'  # 사용할 데이터셋 분할 (예: 'train', 'test', 등)

get_average_text_length(dataset_name, split)


Average length of 'text' in Bingsu/ko_alpaca_data (train split): 29.0
