In [19]:
import os
import time
import random
import hashlib
from icrawler.builtin import GoogleImageCrawler

TARGET_IMAGES = 200
ADD_BREAD_SUFFIX = True
base_dir = "raw_images_crawled"

def read_keywords(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def hash_image_bytes(file_path):
    try:
        with open(file_path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()
    except Exception:
        return None

def get_existing_hashes(folder):
    if not os.path.exists(folder):
        return set()
    return set(filter(None, [
        hash_image_bytes(os.path.join(folder, f))
        for f in os.listdir(folder)
        if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif'))
    ]))

def simple_crawl(keyword, save_dir, target_count):
    existing_hashes = get_existing_hashes(save_dir)
    start_count = len(existing_hashes)
    print(f"🗂️  Existing images: {start_count}")

    crawler = GoogleImageCrawler(storage={'root_dir': save_dir})
    crawler.crawl(
        keyword=keyword,
        max_num=target_count + 50  # 약간 여유롭게 받아서 중복 제거 후 수량 확보
    )

    # 중복 제거: 다운로드 후 다시 체크
    all_files = [
        f for f in os.listdir(save_dir)
        if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif'))
    ]
    new_hashes = {}
    for f in all_files:
        path = os.path.join(save_dir, f)
        h = hash_image_bytes(path)
        if h in existing_hashes or h in new_hashes:
            os.remove(path)
        else:
            new_hashes[h] = f

    total = len(existing_hashes) + len(new_hashes)
    return total

# --- MAIN ---

bread_types = read_keywords("keywords.txt")
os.makedirs(base_dir, exist_ok=True)

for bread in bread_types:
    safe_name = bread.replace(" ", "_").replace("/", "_")
    image_dir = os.path.join(base_dir, safe_name, "images")
    os.makedirs(image_dir, exist_ok=True)

    print(f"▶ Processing: {bread}")
    keyword = f"{bread} bread" if ADD_BREAD_SUFFIX else bread
    total_images = simple_crawl(keyword, image_dir, TARGET_IMAGES)

    print(f"✅ Finished '{bread}' - Total images now: {total_images}")

    if bread != bread_types[-1]:
        time.sleep(random.uniform(3, 6))

print("\n🎉 All crawling completed successfully.")


2025-06-06 13:03:56,631 - INFO - icrawler.crawler - start crawling...
2025-06-06 13:03:56,631 - INFO - icrawler.crawler - starting 1 feeder threads...
2025-06-06 13:03:56,632 - INFO - feeder - thread feeder-001 exit
2025-06-06 13:03:56,633 - INFO - icrawler.crawler - starting 1 parser threads...
2025-06-06 13:03:56,634 - INFO - icrawler.crawler - starting 1 downloader threads...


▶ Processing: 크루아상
🗂️  Existing images: 0


2025-06-06 13:03:57,722 - INFO - parser - parsing result page https://www.google.com/search?q=%ED%81%AC%EB%A3%A8%EC%95%84%EC%83%81+bread&ijn=0&start=0&tbs=&tbm=isch
2025-06-06 13:03:57,907 - INFO - downloader - image #1	https://m.tlj.co.kr/data/product/2018-7-26_event(2).jpg
2025-06-06 13:03:58,039 - INFO - downloader - image #2	https://dapa-magazine.kr/page/vol104/img/sub10_02.jpg
2025-06-06 13:03:58,062 - INFO - downloader - image #3	https://m.tlj.co.kr/data/product/2024-9-30_event(29).jpg
2025-06-06 13:03:58,332 - INFO - downloader - image #4	https://cdn.crowdpic.net/detail-thumb/thumb_d_383929E031C622ECDA1EEDB79BFA5CFD.jpg
2025-06-06 13:03:58,459 - INFO - downloader - image #5	https://oasisprodproduct.edge.naverncp.com/56051/detail/detail_56051_0_7be457fb-46cc-4fe3-ab88-b5e149643111.jpg
2025-06-06 13:03:58,662 - INFO - downloader - image #6	https://cdn.crowdpic.net/detail-thumb/thumb_d_A9F1600561935A2248AD4B5BBEC524D6.jpg
2025-06-06 13:03:58,810 - INFO - downloader - image #7	https

✅ Finished '크루아상' - Total images now: 65


2025-06-06 13:04:35,100 - INFO - icrawler.crawler - start crawling...
2025-06-06 13:04:35,101 - INFO - icrawler.crawler - starting 1 feeder threads...
2025-06-06 13:04:35,102 - INFO - feeder - thread feeder-001 exit
2025-06-06 13:04:35,102 - INFO - icrawler.crawler - starting 1 parser threads...
2025-06-06 13:04:35,104 - INFO - icrawler.crawler - starting 1 downloader threads...


▶ Processing: 크로와상
🗂️  Existing images: 0


2025-06-06 13:04:36,031 - INFO - parser - parsing result page https://www.google.com/search?q=%ED%81%AC%EB%A1%9C%EC%99%80%EC%83%81+bread&ijn=0&start=0&tbs=&tbm=isch
2025-06-06 13:04:36,210 - INFO - downloader - image #1	https://m.tlj.co.kr/data/product/2018-7-26_event(2).jpg
2025-06-06 13:04:37,020 - INFO - downloader - image #2	https://i.ytimg.com/vi/ptzC8dN8hEU/hq720.jpg
2025-06-06 13:04:37,060 - INFO - downloader - image #3	http://imagescdn.gettyimagesbank.com/500/201402/a7348196.jpg
2025-06-06 13:04:37,278 - INFO - downloader - image #4	https://thingotr4652.cdn-nhncommerce.com/data/goods/23/01/03/1000030707/register_detail_077.jpg
2025-06-06 13:04:37,340 - INFO - downloader - image #5	https://www.dessertco.co.kr/data/item/1691459980/thumb-7IaM6riI_500x500.jpg
2025-06-06 13:04:37,572 - INFO - downloader - image #6	https://cdn.crowdpic.net/detail-thumb/thumb_d_2D7B66DBDD6B16834B6E88A4C03316AE.jpg
2025-06-06 13:04:37,806 - INFO - downloader - image #7	https://cdn.crowdpic.net/detail-t

✅ Finished '크로와상' - Total images now: 62


2025-06-06 13:05:18,464 - INFO - icrawler.crawler - start crawling...
2025-06-06 13:05:18,464 - INFO - icrawler.crawler - starting 1 feeder threads...
2025-06-06 13:05:18,465 - INFO - feeder - thread feeder-001 exit
2025-06-06 13:05:18,465 - INFO - icrawler.crawler - starting 1 parser threads...
2025-06-06 13:05:18,466 - INFO - icrawler.crawler - starting 1 downloader threads...


▶ Processing: 크루아상
🗂️  Existing images: 65


2025-06-06 13:05:19,399 - INFO - parser - parsing result page https://www.google.com/search?q=%ED%81%AC%EB%A3%A8%EC%95%84%EC%83%81+bread&ijn=0&start=0&tbs=&tbm=isch
2025-06-06 13:05:19,506 - INFO - downloader - skip downloading file 000001.jpg
2025-06-06 13:05:19,506 - INFO - downloader - skip downloading file 000002.jpg
2025-06-06 13:05:19,507 - INFO - downloader - skip downloading file 000003.jpg
2025-06-06 13:05:19,507 - INFO - downloader - skip downloading file 000004.jpg
2025-06-06 13:05:19,507 - INFO - downloader - skip downloading file 000005.jpg
2025-06-06 13:05:19,508 - INFO - downloader - skip downloading file 000006.jpg
2025-06-06 13:05:19,508 - INFO - downloader - skip downloading file 000007.jpg
2025-06-06 13:05:19,508 - INFO - downloader - skip downloading file 000008.jpg
2025-06-06 13:05:19,508 - INFO - downloader - skip downloading file 000009.jpg
2025-06-06 13:05:19,509 - INFO - downloader - skip downloading file 000010.jpg
2025-06-06 13:05:19,509 - INFO - downloader -

✅ Finished '크루아상' - Total images now: 66


2025-06-06 13:05:38,574 - INFO - icrawler.crawler - start crawling...
2025-06-06 13:05:38,575 - INFO - icrawler.crawler - starting 1 feeder threads...
2025-06-06 13:05:38,576 - INFO - feeder - thread feeder-001 exit
2025-06-06 13:05:38,576 - INFO - icrawler.crawler - starting 1 parser threads...
2025-06-06 13:05:38,578 - INFO - icrawler.crawler - starting 1 downloader threads...


▶ Processing: 크로아상
🗂️  Existing images: 0


2025-06-06 13:05:39,680 - INFO - parser - parsing result page https://www.google.com/search?q=%ED%81%AC%EB%A1%9C%EC%95%84%EC%83%81+bread&ijn=0&start=0&tbs=&tbm=isch
2025-06-06 13:05:39,855 - INFO - downloader - image #1	https://cdn.crowdpic.net/detail-thumb/thumb_d_383929E031C622ECDA1EEDB79BFA5CFD.jpg
2025-06-06 13:05:40,002 - INFO - downloader - image #2	https://cdn.crowdpic.net/detail-thumb/thumb_d_09475F3CEDA14856DDE305D342EF750F.jpg
2025-06-06 13:05:40,093 - INFO - downloader - image #3	https://m.goodsesang.com/web/product/big/202504/6bceaa0b79233d974bc1298927238789.jpg
2025-06-06 13:05:40,193 - INFO - downloader - image #4	https://blog.kakaocdn.net/dn/Lsedv/btsdxiT903z/ItMWGRqqpZN55l9VuwKiJ1/img.jpg
2025-06-06 13:05:40,361 - INFO - downloader - image #5	https://thingool123.godohosting.com/data/goods/20/10/42/1000015592/1000015592_detail_054.jpg
2025-06-06 13:05:40,408 - INFO - downloader - image #6	http://image.hnsmall.com/images/goods/424/21141424_g.jpg
2025-06-06 13:05:40,464 - 

✅ Finished '크로아상' - Total images now: 63


2025-06-06 13:06:16,722 - INFO - icrawler.crawler - start crawling...
2025-06-06 13:06:16,723 - INFO - icrawler.crawler - starting 1 feeder threads...
2025-06-06 13:06:16,723 - INFO - feeder - thread feeder-001 exit
2025-06-06 13:06:16,723 - INFO - icrawler.crawler - starting 1 parser threads...
2025-06-06 13:06:16,724 - INFO - icrawler.crawler - starting 1 downloader threads...


▶ Processing: croissant
🗂️  Existing images: 123


2025-06-06 13:06:17,747 - INFO - parser - parsing result page https://www.google.com/search?q=croissant+bread&ijn=0&start=0&tbs=&tbm=isch
2025-06-06 13:06:18,156 - INFO - downloader - image #1	https://sallysbakingaddiction.com/wp-content/uploads/2022/03/croissant-bread-3.jpg
2025-06-06 13:06:18,160 - INFO - downloader - skip downloading file 000002.jpg
2025-06-06 13:06:18,566 - INFO - downloader - image #3	https://thewoksoflife.com/wp-content/uploads/2021/12/milk-bread-croissants-34.jpg
2025-06-06 13:06:18,570 - INFO - downloader - skip downloading file 000004.jpg
2025-06-06 13:06:18,572 - INFO - downloader - skip downloading file 000005.jpg
2025-06-06 13:06:18,572 - INFO - downloader - skip downloading file 000006.jpg
2025-06-06 13:06:18,730 - INFO - downloader - image #7	https://breadsandsweets.com/wp-content/uploads/2021/08/croissant-pic-4.jpg
2025-06-06 13:06:18,733 - INFO - downloader - skip downloading file 000008.jpg
2025-06-06 13:06:18,734 - INFO - downloader - skip downloading

✅ Finished 'croissant' - Total images now: 132

🎉 All crawling completed successfully.


In [None]:
import os
import hashlib

def hash_file(filepath):
    try:
        with open(filepath, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None

def get_image_files(folder):
    return [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')) and os.path.isfile(os.path.join(folder, f))
    ]

def remove_duplicates(folder1, folder2):
    print(f"Scanning folder1: {folder1}")
    print(f"Scanning folder2: {folder2}")
    
    hashes_folder1 = {}
    for f in get_image_files(folder1):
        h = hash_file(f)
        if h:
            hashes_folder1[h] = f

    removed = 0
    for f in get_image_files(folder2):
        h = hash_file(f)
        if h in hashes_folder1:
            print(f"🗑️ Removing duplicate: {f}")
            os.remove(f)
            removed += 1

    print(f"\n✅ Done. {removed} duplicate files removed from '{folder2}'.")

# 사용 예시
folder_path1 = "/Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/cookie"
folder_path2 = "/Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/chuross/images"

remove_duplicates(folder_path1, folder_path2)


Scanning folder1: /Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/chuross
Scanning folder2: /Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/churros/images

✅ Done. 0 duplicate files removed from '/Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/churros/images'.


In [20]:
import os
import cv2
import numpy as np
import hashlib
from glob import glob
from tqdm import tqdm

# ----------- 중복 제거 함수들 -----------

def hash_file(filepath):
    try:
        with open(filepath, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()
    except Exception as e:
        print(f"❌ Error reading {filepath}: {e}")
        return None

def mse(imageA, imageB):
    # 이미지 크기 맞추기
    if imageA.shape != imageB.shape:
        imageB = cv2.resize(imageB, (imageA.shape[1], imageA.shape[0]))
    err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
    err /= float(imageA.shape[0] * imageA.shape[1])
    return err

def get_image_files(folder):
    return [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')) and os.path.isfile(os.path.join(folder, f))
    ]

def remove_internal_duplicates(folder, mse_threshold=50.0):
    print(f"🔍 Removing duplicates in: {folder}")
    
    files = get_image_files(folder)
    hashes = {}
    removed = 0

    for path in tqdm(files, desc="MD5 duplicate check"):
        h = hash_file(path)
        if not h:
            continue

        if h in hashes:
            os.remove(path)
            removed += 1
        else:
            hashes[h] = path

    # 이미지 유사도 기반 중복 제거 (비슷하지만 해시 다른 경우)
    files = get_image_files(folder)
    images = [(f, cv2.imread(f)) for f in files]
    kept = []

    for i in range(len(images)):
        keep = True
        path_i, img_i = images[i]
        if img_i is None:
            continue

        for j in range(i):
            path_j, img_j = images[j]
            if img_j is None:
                continue

            error = mse(img_i, img_j)
            if error < mse_threshold:
                print(f"🗑️ Removing visually duplicate: {path_i} ≈ {path_j} (MSE={error:.2f})")
                os.remove(path_i)
                removed += 1
                keep = False
                break

        if keep:
            kept.append(images[i])

    print(f"✅ {removed} duplicates removed in '{folder}'.")


# ----------- 전처리 함수들 -----------

def is_blurry(image, threshold=100.0):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.Laplacian(gray, cv2.CV_64F).var() < threshold

def is_too_dark_or_bright(image, dark_threshold=30, bright_threshold=220):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    mean_brightness = np.mean(gray)
    return mean_brightness < dark_threshold or mean_brightness > bright_threshold

def has_sufficient_content(image, min_std=15):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return np.std(gray) > min_std

def process_image(input_path, output_path, target_size=(640, 640)):
    try:
        image = cv2.imread(input_path)
        if image is None:
            return False

        h, w = image.shape[:2]
        if h < 300 or w < 300 or h < target_size[0]*0.75 or w < target_size[1]*0.75:
            return False

        if is_blurry(image) or is_too_dark_or_bright(image) or not has_sufficient_content(image):
            return False

        scale = min(target_size[0]/w, target_size[1]/h)
        new_w, new_h = int(w * scale), int(h * scale)
        resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)

        canvas = np.full((target_size[1], target_size[0], 3), 114, dtype=np.uint8)
        x_offset = (target_size[0] - new_w) // 2
        y_offset = (target_size[1] - new_h) // 2
        canvas[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized

        cv2.imwrite(output_path, canvas)
        return True
    except Exception as e:
        print(f"❌ Error processing {os.path.basename(input_path)}: {e}")
        return False

def preprocess_category_images(bread_name, image_dir, target_size=(640, 640)):
    print(f"\n📦 Preprocessing images for: {bread_name}")
    
    remove_internal_duplicates(image_dir)

    image_paths = sorted(glob(os.path.join(image_dir, "*.jpg")))
    if not image_paths:
        print(f"⚠️ No images found for {bread_name}")
        return 0

    processed_dir = os.path.join(os.path.dirname(image_dir), "processed")
    os.makedirs(processed_dir, exist_ok=True)

    success = 0
    for idx, input_path in enumerate(tqdm(image_paths, desc=f"Processing {bread_name}")):
        output_filename = f"{idx:04d}.jpg"
        output_path = os.path.join(processed_dir, output_filename)

        if process_image(input_path, output_path, target_size):
            success += 1
        else:
            if os.path.exists(output_path):
                os.remove(output_path)

    print(f"✅ {bread_name}: {success} images successfully processed.\n")
    return success


# ----------- 실행 -----------

# 기본 설정
base_dir = "/Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled"
bread_types = sorted([name for name in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, name))])

total_processed = 0

for bread in bread_types:
    image_dir = os.path.join(base_dir, bread, "images")
    if os.path.exists(image_dir):
        count = preprocess_category_images(bread, image_dir)
        total_processed += count
    else:
        print(f"⚠️ Directory not found: {image_dir}")

print(f"\n🎉 All preprocessing completed. Total images: {total_processed}")



📦 Preprocessing images for: croissant
🔍 Removing duplicates in: /Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/croissant/images


MD5 duplicate check: 100%|██████████| 196/196 [00:00<00:00, 1070.91it/s]


🗑️ Removing visually duplicate: /Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/croissant/images/000010.jpg ≈ /Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/croissant/images/ 00013.jpg (MSE=2.41)
✅ 15 duplicates removed in '/Users/hong/Desktop/25-1/AI intro/AI project/Image-Crawler/v2/raw_images_crawled/croissant/images'.


Processing croissant: 100%|██████████| 172/172 [00:02<00:00, 75.77it/s]

✅ croissant: 130 images successfully processed.


🎉 All preprocessing completed. Total images: 130



