# Check Dataset

* Dataset은 PASCALVOC2012.txt 파일에 JPEGImages와 SegmentationClass 하나씩 한줄로 저장되어 있습니다.

In [17]:
from collections import Counter

# Read the list file and keep every line as one entry
with open('PASCALVOC2012.txt', 'r') as file:
    lines = file.readlines()

# Drop the trailing newline (and any surrounding whitespace) from each entry
lines = [line.strip() for line in lines]

# Distinct entries; the gap to the total is the number of redundant lines
unique_lines = set(lines)

# Tally every entry in a single pass. Calling lines.count() once per line
# rescans the whole list each time, which is quadratic in the dataset size.
line_counts = Counter(lines)
duplicates = [line for line, count in line_counts.items() if count > 1]

# Report the results
total_lines = len(lines)
unique_lines_count = len(unique_lines)
duplicate_count = total_lines - unique_lines_count

print(f"총 데이터 개수: {total_lines}")
print(f"중복 데이터 개수: {duplicate_count}")

if duplicates:
    print("중복되는 값이 있습니다:")
    # Each duplicated value is listed once, in order of first appearance
    for dup in duplicates:
        print(dup)
else:
    print("중복되는 값이 없습니다.")

총 데이터 개수: 12031
중복 데이터 개수: 0
중복되는 값이 없습니다.


# Split Dataset

* 해당 데이터를 스플릿하는 과정은 다음과 같습니다. (랜덤 샘플링)
1. 10번의 반복 실험 진행을 위해 iter01, iter02, iter03, ... , iter10 순으로 저장됩니다.
2. 각 iter 폴더에는 training, test, validation 폴더가 저장됩니다.
3. training 폴더에는 train.txt 파일과 label ratio별 폴더(1_4, 1_8, 1_16)가, test 폴더에는 test.txt, validation 폴더에는 valid.txt 가 저장됩니다.
4. 각 label ratio 폴더에는 labeled 폴더와 unlabeled 폴더가 있으며, labeled 폴더에는 레이블 정보를 담는 labeled.txt 파일이, unlabeled 폴더에는 레이블 정보가 없는 unlabeled.txt 파일이 저장됩니다.
5. labeled.txt 파일에는 train.txt 중 label ratio (1/4, 1/8, 1/16) 에 해당하는 개수의 데이터가, unlabeled.txt 파일에는 나머지 데이터가 저장됩니다.

In [27]:
import os
import random

# Basic configuration
iterations = 10
label_ratios = [1/4, 1/8, 1/16]
random_seed = 42

# Load the complete list of dataset entries
with open('PASCALVOC2012.txt', 'r') as file:
    lines = file.readlines()

# Total number of entries
total_data_count = len(lines)

# Split ratios (turned into integer counts below)
train_ratio = 0.6
valid_ratio = 0.2
test_ratio = 0.2

train_count = int(total_data_count * train_ratio)
valid_count = int(total_data_count * valid_ratio)
test_count = total_data_count - train_count - valid_count  # the remainder goes to test


def _write_entries(path, entries):
    """Write entries to path, one per line, appending a newline where it is missing."""
    with open(path, 'w') as out:
        out.writelines(entry if entry.endswith('\n') else entry + '\n' for entry in entries)


def _count_lines(path):
    """Return how many lines the file at path contains."""
    with open(path, 'r') as src:
        return len(src.readlines())


for i in range(1, iterations + 1):
    # Fix the seed for this iteration, then shuffle in place. The same list is
    # reused, so every iteration reshuffles the order left by the previous one.
    random.seed(random_seed)
    random.shuffle(lines)

    iter_dir = f'iter{i:02d}'

    # Slice the shuffled list into train / valid / test
    valid_end = train_count + valid_count
    train_data = lines[:train_count]
    valid_data = lines[train_count:valid_end]
    test_data = lines[valid_end:]
    for subset in (train_data, test_data, valid_data):
        print(f'{i} : {len(subset)}')

    train_path = os.path.join(iter_dir, 'training', 'train.txt')
    test_path = os.path.join(iter_dir, 'test', 'test.txt')
    valid_path = os.path.join(iter_dir, 'validation', 'valid.txt')
    split_files = (
        (train_path, train_data),
        (test_path, test_data),
        (valid_path, valid_data),
    )

    # Create the training / test / validation folders
    for split_path, _ in split_files:
        os.makedirs(os.path.dirname(split_path), exist_ok=True)

    # Write train.txt, test.txt and valid.txt
    for split_path, split_data in split_files:
        _write_entries(split_path, split_data)

    # Read the files back to confirm how many lines were actually written
    train_lines_written = _count_lines(train_path)
    test_lines_written = _count_lines(test_path)
    valid_lines_written = _count_lines(valid_path)

    print(f'Train file: {train_lines_written} lines')
    print(f'Test file: {test_lines_written} lines')
    print(f'Validation file: {valid_lines_written} lines')

    # Split the training data into labeled / unlabeled parts for each label ratio
    for ratio in label_ratios:
        ratio_name = f'1_{int(1/ratio)}'
        labeled_count = int(train_count * ratio)
        unlabeled_count = train_count - labeled_count

        ratio_dir = os.path.join(iter_dir, 'training', ratio_name)
        labeled_dir = os.path.join(ratio_dir, 'labeled')
        unlabeled_dir = os.path.join(ratio_dir, 'unlabeled')
        os.makedirs(labeled_dir, exist_ok=True)
        os.makedirs(unlabeled_dir, exist_ok=True)

        # The first labeled_count training entries are labeled, the rest unlabeled
        labeled_data = train_data[:labeled_count]
        unlabeled_data = train_data[labeled_count:]

        _write_entries(os.path.join(labeled_dir, 'labeled.txt'), labeled_data)
        _write_entries(os.path.join(unlabeled_dir, 'unlabeled.txt'), unlabeled_data)

    # Use the next seed for the next iteration
    random_seed += 1

1 : 7218
1 : 2407
1 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
2 : 7218
2 : 2407
2 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
3 : 7218
3 : 2407
3 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
4 : 7218
4 : 2407
4 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
5 : 7218
5 : 2407
5 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
6 : 7218
6 : 2407
6 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
7 : 7218
7 : 2407
7 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
8 : 7218
8 : 2407
8 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
9 : 7218
9 : 2407
9 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lines
10 : 7218
10 : 2407
10 : 2406
Train file: 7218 lines
Test file: 2407 lines
Validation file: 2406 lin

# Data Verification

* 해당 데이터의 검증 절차는 다음과 같습니다.
1. 각 iter에서 생성된 train.txt, test.txt, valid.txt가 다음 생성되는 train.txt, test.txt, valid.txt와 같은지의 여부를 판별합니다. (데이터 셔플되었는지 확인)
2. 각 iter에서 생성된 train.txt, test.txt, valid.txt의 데이터 개수가 같은지 확인합니다.
3. label ratio대로 구분한 labeled.txt, unlabeled.txt의 데이터 개수가 비율에 맞게 나뉘었는지 확인합니다.
4. label ratio대로 구분한 labeled.txt, unlabeled.txt가 같은지의 여부를 판별합니다. (데이터 셔플되었는지 확인)
5. train.txt, test.txt, valid.txt를 합친 모든 데이터의 개수가 기존 데이터 개수(12031개)와 일치하는지 확인합니다.
6. train.txt, test.txt, valid.txt를 합친 데이터에서 중복이 있는지 확인합니다.

In [28]:
import os

# Basic configuration
iterations = 10
label_ratios = [1/4, 1/8, 1/16]

# Folder that holds each split's list file (the file itself is named <split>.txt)
split_folders = {'train': 'training', 'test': 'test', 'valid': 'validation'}

# Print the number of entries for every iteration and every label ratio
for i in range(1, iterations + 1):
    iter_dir = f'iter{i:02d}'

    # Line counts of the train / test / validation files
    for split, folder in split_folders.items():
        file_path = os.path.join(iter_dir, folder, f'{split}.txt')
        with open(file_path, 'r') as f:
            line_count = sum(1 for _ in f)
        print(f'Iteration {i}, {split}: {line_count} lines')

    # Line counts of the labeled / unlabeled files for each label ratio
    for ratio in label_ratios:
        ratio_name = f'1_{int(1/ratio)}'
        ratio_dir = os.path.join(iter_dir, 'training', ratio_name)
        labeled_file = os.path.join(ratio_dir, 'labeled', 'labeled.txt')
        unlabeled_file = os.path.join(ratio_dir, 'unlabeled', 'unlabeled.txt')

        with open(labeled_file, 'r') as f:
            labeled_count = sum(1 for _ in f)

        with open(unlabeled_file, 'r') as f:
            unlabeled_count = sum(1 for _ in f)

        print(f'Iteration {i}, Ratio {ratio_name}, Labeled: {labeled_count} lines, Unlabeled: {unlabeled_count} lines')

Iteration 1, train: 7218 lines
Iteration 1, test: 2407 lines
Iteration 1, valid: 2406 lines
Iteration 1, Ratio 1_4, Labeled: 1804 lines, Unlabeled: 5414 lines
Iteration 1, Ratio 1_8, Labeled: 902 lines, Unlabeled: 6316 lines
Iteration 1, Ratio 1_16, Labeled: 451 lines, Unlabeled: 6767 lines
Iteration 2, train: 7218 lines
Iteration 2, test: 2407 lines
Iteration 2, valid: 2406 lines
Iteration 2, Ratio 1_4, Labeled: 1804 lines, Unlabeled: 5414 lines
Iteration 2, Ratio 1_8, Labeled: 902 lines, Unlabeled: 6316 lines
Iteration 2, Ratio 1_16, Labeled: 451 lines, Unlabeled: 6767 lines
Iteration 3, train: 7218 lines
Iteration 3, test: 2407 lines
Iteration 3, valid: 2406 lines
Iteration 3, Ratio 1_4, Labeled: 1804 lines, Unlabeled: 5414 lines
Iteration 3, Ratio 1_8, Labeled: 902 lines, Unlabeled: 6316 lines
Iteration 3, Ratio 1_16, Labeled: 451 lines, Unlabeled: 6767 lines
Iteration 4, train: 7218 lines
Iteration 4, test: 2407 lines
Iteration 4, valid: 2406 lines
Iteration 4, Ratio 1_4, Labeled:

In [30]:
import os
import filecmp

# Basic configuration
iterations = 10  # number of iterXX folders covered by the verification loop
total_data_count = 12031  # expected number of entries in train + test + valid combined
label_ratios = [1/4, 1/8, 1/16]  # labeled fractions; used to build the 1_4 / 1_8 / 1_16 folder names

# 1. Check that the train.txt, test.txt and valid.txt of two iterations differ
def check_file_difference(iter_dir1, iter_dir2, filenames):
    """Report whether each file differs between two iteration directories.

    For every entry of ``filenames`` the copies under ``iter_dir1`` and
    ``iter_dir2`` are compared by content. Identical files are reported as
    an error, differing files as the expected outcome. Nothing is returned;
    the result is printed.
    """
    for name in filenames:
        first = os.path.join(iter_dir1, name)
        second = os.path.join(iter_dir2, name)
        # shallow=False compares the file contents rather than os.stat signatures
        identical = filecmp.cmp(first, second, shallow=False)
        if not identical:
            print(f"{iter_dir1}/{name} and {iter_dir2}/{name} are different as expected.")
            continue
        print(f"Error: {iter_dir1}/{name} and {iter_dir2}/{name} are the same.")

# 2. Count the entries in the train.txt, test.txt and valid.txt of one iteration
def check_file_line_counts(iter_dir, filenames):
    """Return a dict mapping each filename to the number of lines it contains.

    Each entry of ``filenames`` is resolved relative to ``iter_dir``; the
    keys of the result are the names exactly as given.
    """
    counts = {}
    for name in filenames:
        with open(os.path.join(iter_dir, name), 'r') as handle:
            counts[name] = sum(1 for _ in handle)
    return counts

# 3. Check that the labeled.txt / unlabeled.txt of each label ratio differ between iterations
def check_labeled_unlabeled_difference(iter_dir1, iter_dir2, ratio_dirs):
    """Report whether the labeled and unlabeled lists differ between two iterations.

    ``ratio_dirs`` holds the label-ratio directories of the second iteration.
    The matching directory of the first iteration is derived by replacing
    ``iter_dir2`` with ``iter_dir1`` in each path. For every ratio the
    labeled file is compared first, then the unlabeled file; identical
    contents are printed as an error.
    """
    for ratio_dir in ratio_dirs:
        counterpart_dir = ratio_dir.replace(iter_dir2, iter_dir1)
        file_pairs = [
            (
                os.path.join(counterpart_dir, kind, f'{kind}.txt'),
                os.path.join(ratio_dir, kind, f'{kind}.txt'),
            )
            for kind in ('labeled', 'unlabeled')
        ]
        for first, second in file_pairs:
            # Content comparison (shallow=False), not an os.stat comparison
            if filecmp.cmp(first, second, shallow=False):
                print(f"Error: {first} and {second} are the same.")
            else:
                print(f"{first} and {second} are different as expected.")

# 4. Check that train.txt, test.txt and valid.txt together hold the expected number of entries
def check_total_data_count(iter_dir, filenames, expected_count=None):
    """Print whether the combined line count of the files matches the expected total.

    Args:
        iter_dir: Directory the entries of ``filenames`` are relative to.
        filenames: Relative paths of the files whose lines are summed.
        expected_count: Total number of lines expected. When omitted, the
            module-level ``total_data_count`` is used, so existing
            two-argument calls behave as before.
    """
    if expected_count is None:
        # Fall back to the notebook-wide setting
        expected_count = total_data_count

    total_count = 0
    for filename in filenames:
        file_path = os.path.join(iter_dir, filename)
        with open(file_path, 'r') as f:
            total_count += sum(1 for _ in f)

    if total_count != expected_count:
        print(f"Error: {iter_dir} total data count is {total_count}, expected {expected_count}.")
    else:
        print(f"{iter_dir} total data count matches the expected {expected_count}.")

# 5. Check that train.txt, test.txt and valid.txt contain no duplicated entries
def check_for_duplicates(iter_dir, filenames):
    """Print whether any entry is repeated within or across the given files.

    Each file is checked on its own first; a file that repeats a line is
    reported as an error. Then the number of distinct lines over all files
    is compared with the number of lines actually read, so the combined
    verdict reflects duplicates only and is not triggered by a total that
    merely differs from the expected dataset size.

    Args:
        iter_dir: Directory the entries of ``filenames`` are relative to.
        filenames: Relative paths of the files to inspect.
    """
    combined_data = set()
    total_lines = 0
    for filename in filenames:
        file_path = os.path.join(iter_dir, filename)
        with open(file_path, 'r') as f:
            lines = f.readlines()
        if len(set(lines)) != len(lines):
            print(f"Error: {iter_dir}/{filename} contains duplicates.")
        total_lines += len(lines)
        combined_data.update(lines)

    # Fewer distinct lines than lines read means at least one line is repeated
    if len(combined_data) != total_lines:
        print(f"Error: {iter_dir} combined data contains duplicates.")
    else:
        print(f"{iter_dir} combined data contains no duplicates.")

# Run the verification
train_test_valid_files = ['training/train.txt', 'test/test.txt', 'validation/valid.txt']

# Directories whose per-iteration checks (4 and 5) have already run. Consecutive
# pairs overlap, so without this every middle iteration would be checked twice.
verified_dirs = set()

for i in range(1, iterations):
    iter_dir1 = f'iter{i:02d}'
    iter_dir2 = f'iter{i+1:02d}'

    # 1. The split files of consecutive iterations must differ
    check_file_difference(iter_dir1, iter_dir2, train_test_valid_files)

    # 2. The split files of consecutive iterations must have the same line counts
    counts1 = check_file_line_counts(iter_dir1, train_test_valid_files)
    counts2 = check_file_line_counts(iter_dir2, train_test_valid_files)
    if counts1 != counts2:
        print(f"Error: Line counts in {iter_dir1} and {iter_dir2} are different.")
    else:
        print(f"Line counts in {iter_dir1} and {iter_dir2} are the same.")

    # 3. The labeled / unlabeled files of consecutive iterations must differ
    ratio_dirs2 = [os.path.join(iter_dir2, 'training', f'1_{int(1/ratio)}') for ratio in label_ratios]
    check_labeled_unlabeled_difference(iter_dir1, iter_dir2, ratio_dirs2)

    # Only directories that have not been through checks 4 and 5 yet
    pending_dirs = [d for d in (iter_dir1, iter_dir2) if d not in verified_dirs]

    # 4. Total number of entries
    for pending_dir in pending_dirs:
        check_total_data_count(pending_dir, train_test_valid_files)

    # 5. Duplicates
    for pending_dir in pending_dirs:
        check_for_duplicates(pending_dir, train_test_valid_files)

    verified_dirs.update(pending_dirs)


iter01/training/train.txt and iter02/training/train.txt are different as expected.
iter01/test/test.txt and iter02/test/test.txt are different as expected.
iter01/validation/valid.txt and iter02/validation/valid.txt are different as expected.
Line counts in iter01 and iter02 are the same.
iter01\training\1_4\labeled\labeled.txt and iter02\training\1_4\labeled\labeled.txt are different as expected.
iter01\training\1_4\unlabeled\unlabeled.txt and iter02\training\1_4\unlabeled\unlabeled.txt are different as expected.
iter01\training\1_8\labeled\labeled.txt and iter02\training\1_8\labeled\labeled.txt are different as expected.
iter01\training\1_8\unlabeled\unlabeled.txt and iter02\training\1_8\unlabeled\unlabeled.txt are different as expected.
iter01\training\1_16\labeled\labeled.txt and iter02\training\1_16\labeled\labeled.txt are different as expected.
iter01\training\1_16\unlabeled\unlabeled.txt and iter02\training\1_16\unlabeled\unlabeled.txt are different as expected.
iter01 total dat