In [1]:
import pandas as pd
import ast

# Load the raw SCIN label table.
label_df = pd.read_csv("./data/scin_labels.csv")

# Non-null cells of the dermatologist label column; each one is parsed as a
# Python literal and only list results are kept.
label_cells = label_df['dermatologist_skin_condition_on_label_name'].dropna()

unique_labels = set()
for cell in label_cells:
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        # Cell is not a valid Python literal: ignore it.
        continue
    if isinstance(parsed, list):
        unique_labels.update(parsed)

print(f"총 {len(unique_labels)}개의 원본 라벨:")
# for lab in sorted(unique_labels):
#     print(" ", lab)

# Alphabetically ordered view of the distinct label names.
unique_labels_sorted = sorted(unique_labels)

총 370개의 원본 라벨:


In [2]:
import os
import shutil

# Target classes (Korean names): contact dermatitis, urticaria, tinea,
# psoriasis, normal skin.
tgt_label = ["접촉피부염", "두드러기", "백선증", "건선", "정상피부"]

# The joined class names double as the dataset name and the output folder name.
FILE_LABEL = "_".join(tgt_label)
OUTPUT_DIR = "./data/" + FILE_LABEL

# Recreate the output directory from scratch: anything left over from a
# previous run is deleted.
output_dir_present = os.path.exists(OUTPUT_DIR)
if output_dir_present:
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

In [3]:
# Source SCIN condition names, listed under the target group they collapse into.
# Group order and label order are significant: they fix GROUP_MAP's key order.
_LABELS_BY_GROUP = {
    "접촉피부염": [
        "Allergic Contact Dermatitis",
        "Irritant Contact Dermatitis",
        "Contact dermatitis caused by Rhus diversiloba",
        "Contact dermatitis, NOS",
        "Photocontact dermatitis [berloque dermatitis]",
        "Berloque dermatitis",
    ],
    "두드러기": [
        "Urticaria",
        "Urticarial vasculitis",
        "Pruritic urticarial papules and plaques of pregnancy",
    ],
    "백선증": [
        "Tinea",
        "Tinea Versicolor",
    ],
    "건선": [
        "Psoriasis",
        "Psoriasiform dermatitis",
        "Inverse psoriasis",
    ],
}

# Flat lookup: original label name -> target group name.
GROUP_MAP = {
    source_label: group_name
    for group_name, source_labels in _LABELS_BY_GROUP.items()
    for source_label in source_labels
}

In [4]:
# atopic = [i for i in orig_atopic if i not in GROUP_MAP['접촉피부염']]


# GROUP_MAP["아토피"]= [i for i in atopic]
# for i in atopic:
#     GROUP_MAP[i] = "아토피"
# GROUP_MAP

# 원본
# GROUP_MAP = {}

# # (A) 아토피 계열
# for key in unique_labels_sorted:
#     if any(sub in key for sub in ["Atopic Dermatitis", "Eczema", "dermatitis"]):
#         # “Contact Dermatitis” 제외
#         if "Contact Dermatitis" not in key:
#             GROUP_MAP[key] = "아토피"

# # (B) 접촉성 피부염 계열
# for key in unique_labels_sorted:
#     if "Contact Dermatitis" in key or "Contact dermatitis" in key:
#         GROUP_MAP[key] = "접촉성 피부염"

# # (C) 건선 계열
# for key in unique_labels_sorted:
#     if "Psoriasis" in key or "psoria" in key.lower():
#         GROUP_MAP[key] = "건선"

# # (D) 태선화 계열
# for key in unique_labels_sorted:
#     if "Lichen planus" in key or "Lichen Simplex Chronicus" in key or "lichen" in key.lower():
#         GROUP_MAP[key] = "태선화"

# # (E) 기타: 위 네 그룹에 속하지 않는 라벨
# for key in unique_labels_sorted:
#     if key not in GROUP_MAP:
#         GROUP_MAP[key] = "기타"

# # (F) 정상피부: handled separately when parsed_labels is empty

In [5]:
from collections import Counter

# How many source labels were assigned to each target group.
group_counts = Counter(GROUP_MAP.values())
print("그룹별 원본 라벨 개수:", group_counts)

import pandas as pd

# Same tally as a two-column table (Group, Label Count).
label_to_group = pd.Series(GROUP_MAP)
df_counts = (
    label_to_group
    .value_counts()
    .rename_axis('Group')
    .reset_index(name='Label Count')
)
print(df_counts)

그룹별 원본 라벨 개수: Counter({'접촉피부염': 6, '두드러기': 3, '건선': 3, '백선증': 2})
   Group  Label Count
0  접촉피부염            6
1   두드러기            3
2     건선            3
3    백선증            2


In [6]:
import pandas as pd
import ast
from collections import defaultdict

# 1) File locations
INPUT_LABEL_PATH = "./data/scin_labels.csv"  # raw SCIN label CSV
INPUT_CASE_PATH = "./data/scin_cases.csv"  # SCIN case metadata CSV
OUTPUT_CSV_PATH = f"{OUTPUT_DIR}/{FILE_LABEL}.csv"  # where the multiclass labels are written

# 2) Load both tables; only the id, category and image-path columns of the
#    case table are kept.
case_columns = [
    "case_id",
    "related_category",
    "image_1_path",
    "image_2_path",
    "image_3_path",
]
labels_df = pd.read_csv(INPUT_LABEL_PATH)
cases_df = pd.read_csv(INPUT_CASE_PATH)[case_columns]

# 3) Parse the raw label-list column
def parse_labels(raw):
    """Parse one stringified label list into a Python list.

    Args:
        raw: Cell value to parse with ``ast.literal_eval``. Non-string values
            (for example NaN from an empty CSV cell) are tolerated.

    Returns:
        The parsed list, or ``[]`` when ``raw`` cannot be parsed or does not
        evaluate to a list.
    """
    try:
        labels = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval is documented to raise on malformed
        # input are treated as "no labels"; anything else (e.g. a
        # KeyboardInterrupt) is allowed to propagate.
        return []
    return labels if isinstance(labels, list) else []

raw_label_lists = labels_df["dermatologist_skin_condition_on_label_name"]
labels_df["parsed_labels"] = raw_label_lists.apply(parse_labels)

# 4) Collect the distinct label names that occur in any parsed list
unique_labels = {
    name
    for names in labels_df["parsed_labels"]
    for name in names
}

# 5) Helpers: parse the weighted-label dict and sum its weights per group
def parse_weighted(raw):
    """Parse one stringified ``{label: weight}`` mapping into a dict.

    Args:
        raw: Cell value to parse with ``ast.literal_eval``. Non-string values
            (for example NaN from an empty CSV cell) are tolerated.

    Returns:
        The parsed dict, or ``{}`` when ``raw`` cannot be parsed or evaluates
        to something other than a dict. Always returning a dict keeps callers
        that iterate ``.items()`` from failing on, e.g., a stringified list.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval is documented to raise on malformed
        # input are treated as "no weights"; anything else propagates.
        return {}
    return parsed if isinstance(parsed, dict) else {}

def aggregate_group_scores(label_dict):
    """Sum label weights per target group.

    Each key of ``label_dict`` is looked up in ``GROUP_MAP``; labels that are
    not in the map are accumulated under "기타". Returns a
    ``defaultdict(float)`` keyed by group name.
    """
    totals = defaultdict(float)
    for label, weight in label_dict.items():
        totals[GROUP_MAP.get(label, "기타")] += weight
    return totals

weighted_raw = labels_df["weighted_skin_condition_label"]
labels_df["parsed_dict"] = weighted_raw.apply(parse_weighted)
labels_df["group_scores"] = labels_df["parsed_dict"].apply(aggregate_group_scores)

# 6) Attach the case metadata to every label row (left join on case_id)
merged = pd.merge(labels_df, cases_df, on="case_id", how="left")

# 8) Score-based class decision
# The group with the highest summed weight wins; a row with no scores at all
# is labelled '기타'.

def pick_final_label(row):
    """Return the top-scoring group of ``row['group_scores']``.

    Ties go to the group that appears first in the mapping. An empty mapping
    yields '기타'.
    """
    group_scores = row['group_scores']
    if group_scores:
        return max(group_scores, key=group_scores.get)
    return '기타'

merged['final_label'] = merged.apply(pick_final_label, axis=1)

# 9) Reshape to long format: one row per (case, image slot), then tidy paths
image_columns = ['image_1_path', 'image_2_path', 'image_3_path']
df = merged[['case_id', *image_columns, 'final_label']]
long_df = df.melt(
    id_vars=['case_id', 'final_label'],
    value_vars=image_columns,
    var_name='image_type',
    value_name='image_path',
)
# Drop image slots that are empty for a case.
has_image = long_df['image_path'].notna()
final_df = long_df[has_image].reset_index(drop=True)
# Sequential, zero-padded file name derived from the row position.
final_df['image_number'] = [f"{position:05d}.png" for position in final_df.index]
# Keep only the file name part of the original path.
final_df['image_path'] = final_df['image_path'].apply(lambda path: path.rsplit('/', 1)[-1])

# 10) Save the result
final_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Saved {len(final_df)} samples with final_label to {OUTPUT_CSV_PATH}")

final_df['final_label'].value_counts()

Saved 10407 samples with final_label to ./data/접촉피부염_두드러기_백선증_건선_정상피부/접촉피부염_두드러기_백선증_건선_정상피부.csv


final_label
기타       8932
접촉피부염     656
두드러기      401
백선증       254
건선        164
Name: count, dtype: int64

In [7]:
# if os.path.exists('./data/images'):
#     shutil.rmtree('./data/images')
# os.makedirs("./data/images", exist_ok=True)

# for i in os.listdir("./data/images_backup"):
#     shutil.copy(f'./data/images_backup/{i}', f"./data/images/{i}")
# print("Finish Copy!")

# err_cnt = 0
# for i in os.listdir('./data/images'):
#     try:
#         os.rename(f"./data/images/{i}", os.path.join("./data/images", final_df[final_df['image_path'] == i]['image_number'].values[0]))
#     except:
#         err_cnt += 1
# print(f"Finish Rename, ERR COUNT: {err_cnt}")

In [8]:
import os
import pandas as pd
import shutil
import unicodedata

# Append the synthetic images under ./data/synthesis/<name>_<label>/ to the
# image pool and to the label CSV, then rewrite the CSV as (image_path, label).
final_df = pd.read_csv(f'{OUTPUT_DIR}/{FILE_LABEL}.csv')

# This cell overwrites the CSV it reads with a different schema. If it is run
# twice without regenerating the CSV, the input no longer has these columns and
# the result would silently contain NaN paths and "nan" labels, so fail loudly.
missing_columns = {'image_number', 'image_path', 'final_label'} - set(final_df.columns)
if missing_columns:
    raise RuntimeError(
        f"{OUTPUT_DIR}/{FILE_LABEL}.csv is missing columns {sorted(missing_columns)}; "
        "re-run the labelling cell that writes it before running this cell again."
    )

# First candidate index for synthetic images. The number of files alone is not
# a safe choice: when numbered images are missing from ./data/images the count
# is lower than the highest index already used by the CSV, and new files would
# overwrite real images / duplicate CSV names.
next_number = max(len(os.listdir("./data/images")), len(final_df))

# Rows for the synthetic images, accumulated and converted to a frame once.
new_rows = []

# Sorted listings make the numbering reproducible (os.listdir order is arbitrary).
for group_name in sorted(os.listdir("./data/synthesis/")):
    syn_dir = os.path.join("./data/synthesis/", group_name)
    if not os.path.isdir(syn_dir):
        # Stray files next to the per-label folders are not image groups.
        continue

    # The label is the part of the folder name after the last underscore.
    label = group_name.split("_")[-1]

    for file_name in sorted(os.listdir(syn_dir)):
        src = os.path.join(syn_dir, file_name)
        if not os.path.isfile(src):
            continue

        # Never reuse a name that already exists in the image pool.
        dst = os.path.join("./data/images", str(next_number).zfill(5) + ".png")
        while os.path.exists(dst):
            next_number += 1
            dst = os.path.join("./data/images", str(next_number).zfill(5) + ".png")

        shutil.copy(src, dst)
        new_rows.append({
            'image_number': str(next_number).zfill(5) + ".png",
            'final_label': label
        })

        next_number += 1

# Convert the accumulated rows and append them to the real-image rows.
new_df = pd.DataFrame(new_rows, columns=['image_number', 'final_label'])
final_df = pd.concat([final_df, new_df], ignore_index=True)

# Final schema: the sequential file name becomes image_path, the group becomes label.
df = final_df.drop(columns=["image_path"]).rename(
    columns={'image_number': 'image_path', 'final_label': 'label'}
)
df = df[['image_path', 'label']].copy()
# Folder-derived labels may be in decomposed Unicode form; NFC makes them
# compare equal to the composed literals used for the target labels.
df['label'] = df['label'].apply(lambda x: unicodedata.normalize("NFC", str(x)).strip())

df.to_csv(f"{OUTPUT_DIR}/{FILE_LABEL}.csv", index=False)

print("Image Length: ", len(os.listdir("./data/images")))
print("CSV Length: ", len(pd.read_csv(f"{OUTPUT_DIR}/{FILE_LABEL}.csv")))

Image Length:  15155
CSV Length:  11231


In [9]:
import random
random.seed(42)

# Reload the combined (real + synthetic) label table written by the previous step.
all_labels_df = pd.read_csv(f"{OUTPUT_DIR}/{FILE_LABEL}.csv")

# Start from an empty full_images directory.
OUTPUT_IMAGE_PATH = f'{OUTPUT_DIR}/full_images'
if os.path.exists(OUTPUT_IMAGE_PATH):
    shutil.rmtree(OUTPUT_IMAGE_PATH)
os.makedirs(OUTPUT_IMAGE_PATH, exist_ok=True)

# Keep only rows whose label is one of the target classes.
is_target_label = all_labels_df['label'].isin(tgt_label)
df = all_labels_df.loc[is_target_label]
df

Unnamed: 0,image_path,label
6,00006.png,두드러기
11,00011.png,두드러기
15,00015.png,백선증
18,00018.png,백선증
21,00021.png,두드러기
...,...,...
11082,15034.png,정상피부
11083,15035.png,정상피부
11084,15036.png,정상피부
11085,15037.png,정상피부


In [10]:
# Number of rows per label in the filtered frame `df`, most frequent first.
df['label'].value_counts()

label
접촉피부염    656
정상피부     583
두드러기     401
건선       261
백선증      254
Name: count, dtype: int64

In [None]:
import os
import shutil
from PIL import Image, UnidentifiedImageError

# Inputs from earlier cells: df (with an image_path column), OUTPUT_IMAGE_PATH,
# OUTPUT_DIR and FILE_LABEL.
#
# Every image is verified with Pillow and copied into OUTPUT_IMAGE_PATH. Images
# that are missing, unreadable or otherwise fail are collected first and removed
# from df in one step afterwards, so the frame is never mutated while its
# column is being iterated and the CSV is rewritten at most once.
unusable_images = []

for image_name in df['image_path'].tolist():
    src = os.path.join('./data/images', image_name.strip())
    try:
        with Image.open(src) as img:
            img.verify()   # integrity check beyond what open() already does
        dst = os.path.join(OUTPUT_IMAGE_PATH, image_name.strip())
        shutil.copy(src, dst)

    except FileNotFoundError:
        print(f"File not found: {src}")
        unusable_images.append(image_name)

    except UnidentifiedImageError:
        print(f"Cannot identify image file (corrupted?): {src}")
        unusable_images.append(image_name)

    except Exception as e:
        # Any other failure is logged and the image is excluded as well.
        print(f"Unexpected error for {src}: {e}")
        unusable_images.append(image_name)

if unusable_images:
    # Rebind df to a filtered copy (rather than dropping in place on a slice)
    # and persist the cleaned table.
    df = df[~df['image_path'].isin(unusable_images)].copy()
    df.to_csv(f"{OUTPUT_DIR}/{FILE_LABEL}.csv", index=False)

File not found: ./data/images/00270.png
File not found: ./data/images/04361.png
File not found: ./data/images/07697.png
Cannot identify image file (corrupted?): ./data/images/14532.png


In [12]:

from sklearn.model_selection import train_test_split

def _copy_split_images(split_df, src_dir, dst_dir):
    """Copy a split's images into a freshly created ``dst_dir``.

    Args:
        split_df: Frame with an ``image_path`` column of file names.
        src_dir: Directory the images are read from.
        dst_dir: Directory to (re)create and fill; existing content is removed.

    Returns:
        ``split_df`` without the rows whose image file was not found in
        ``src_dir``. The input frame itself is not modified.
    """
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.makedirs(dst_dir, exist_ok=True)

    missing_images = []
    for image_name in split_df['image_path']:
        src = os.path.join(src_dir, image_name)
        try:
            shutil.copy(src, os.path.join(dst_dir, image_name))
        except FileNotFoundError:
            print(f"File not found: {src}")
            missing_images.append(image_name)

    if not missing_images:
        return split_df
    return split_df[~split_df['image_path'].isin(missing_images)]


# Stratified 80 / 10 / 10 split into train / valid / test.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
valid_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42, stratify=holdout_df['label'])

# Copy the images first so rows with missing files are removed before the split
# CSVs are written. The notebook-wide `df` is deliberately left untouched so
# this cell can be re-run.
train_df = _copy_split_images(train_df, './data/images', os.path.join(OUTPUT_DIR, 'train'))
valid_df = _copy_split_images(valid_df, './data/images', os.path.join(OUTPUT_DIR, 'valid'))
test_df = _copy_split_images(test_df, './data/images', os.path.join(OUTPUT_DIR, 'test'))

train_df.to_csv(f"{OUTPUT_DIR}/train.csv", index=False)
valid_df.to_csv(f"{OUTPUT_DIR}/valid.csv", index=False)
test_df.to_csv(f"{OUTPUT_DIR}/test.csv", index=False)


In [13]:
# Per-class row counts of the train, valid and test splits, in that order.
for split_frame in (train_df, valid_df, test_df):
    print(split_frame['label'].value_counts())

label
접촉피부염    523
정상피부     465
두드러기     321
건선       209
백선증      202
Name: count, dtype: int64
label
접촉피부염    65
정상피부     58
두드러기     40
건선       26
백선증      26
Name: count, dtype: int64
label
접촉피부염    66
정상피부     59
두드러기     40
건선       26
백선증      25
Name: count, dtype: int64
