In [None]:
!pip install beautifulsoup4



In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# KOBIS Open API key, read from the environment so it is not hard-coded here.
# NOTE(review): os.getenv returns None when the variable is unset and nothing
# in this cell checks for that — a missing key only shows up once a request
# is made.
KOBIS_API_KEY = os.getenv("KOBIS_API_KEY")

# KOBIS Open API REST endpoints (JSON variants) used by the helpers below.
API_MOVIE_LIST = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json"
API_MOVIE_INFO = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json"
API_PEOPLE_LIST = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/people/searchPeopleList.json"

# NOTE(review): not referenced by any code visible in this notebook, and
# despite its name the URL points at searchMovieList.do — confirm whether it
# is still needed.
PEOPLE_PAGE = "https://www.kobis.or.kr/kobis/business/mast/mvie/searchMovieList.do"

# One shared HTTP session so every request is sent with the same headers.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; KOBIS-crawler/1.0)"
})

In [None]:
def get_movie_cd(movie_name: str, timeout: float = 10.0) -> str | None:
    """Look up the KOBIS movie code (movieCd) for a movie title.

    Args:
        movie_name: Title sent to the movie-list API as ``movieNm``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``movieCd`` of the first search hit, or ``None`` when the
        response contains no movies (or the hit has no ``movieCd``).

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "key": KOBIS_API_KEY,
        "movieNm": movie_name,
        "itemPerPage": 10,
    }
    # requests waits indefinitely unless a timeout is passed; without it a
    # stalled connection would hang the whole notebook.
    r = session.get(API_MOVIE_LIST, params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    # `or {}` / `or []` also cover keys that are present but null.
    movies = (data.get("movieListResult") or {}).get("movieList") or []
    if not movies:
        return None

    # Use the first hit (filter further on prdtYear, nation, etc. if needed).
    return movies[0].get("movieCd")


def get_main_actors(movie_cd: str, max_actors: int = 10, timeout: float = 10.0) -> list[dict]:
    """Return the first ``max_actors`` actor entries of a movie.

    Args:
        movie_cd: KOBIS movie code sent to the movie-info API as ``movieCd``.
        max_actors: Maximum number of actor entries to return.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The leading ``max_actors`` items of the movie's ``actors`` list, in
        API order; an empty list when the movie has no actors listed.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response lacks ``movieInfoResult`` / ``movieInfo``.
    """
    params = {"key": KOBIS_API_KEY, "movieCd": movie_cd}
    # requests waits indefinitely unless a timeout is passed.
    r = session.get(API_MOVIE_INFO, params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    movie_info = data["movieInfoResult"]["movieInfo"]
    # `or []` also covers an "actors" key that is present but null, which
    # would otherwise make the slice below fail.
    actors = movie_info.get("actors") or []

    # Per the original author, the KOBIS API does not separate lead from
    # supporting roles, so the first N entries stand in for the main cast.
    return actors[:max_actors]


def get_people_cd(actor_name: str, movie_name: str, prefer_role: str = "배우",
                  timeout: float = 10.0) -> str | None:
    """Resolve a person's KOBIS code (peopleCd) from a name and a movie title.

    The people-list API is queried with both the person name and the movie
    title, then one entry is picked by priority:

    1. entries whose ``filmoNames`` contains ``movie_name`` and whose
       ``repRoleNm`` equals ``prefer_role``;
    2. entries whose ``repRoleNm`` equals ``prefer_role``;
    3. any entry.

    Within a tier the first entry in API order wins.

    Args:
        actor_name: Person name sent as ``peopleNm``.
        movie_name: Movie title sent as ``filmoNames`` and used for tier 1.
        prefer_role: Preferred value of ``repRoleNm``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The chosen entry's ``peopleCd``, or ``None`` when the API returns no
        people (or the chosen entry has no ``peopleCd``).

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    params = {
        "key": KOBIS_API_KEY,
        "peopleNm": actor_name,
        "filmoNames": movie_name,
        "itemPerPage": 10,
    }
    # requests waits indefinitely unless a timeout is passed.
    r = session.get(API_PEOPLE_LIST, params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    # `or {}` / `or []` also cover keys that are present but null.
    people_list = (data.get("peopleListResult") or {}).get("peopleList") or []
    if not people_list:
        return None

    # Tier 2: representative role matches.
    role_matches = [p for p in people_list if p.get("repRoleNm") == prefer_role]
    # Tier 1: role matches AND the movie title appears in the filmography.
    title_and_role_matches = [
        p for p in role_matches
        if movie_name in (p.get("filmoNames") or "")
    ]

    # The first non-empty tier wins; people_list is non-empty at this point.
    chosen = (title_and_role_matches or role_matches or people_list)[0]
    return chosen.get("peopleCd")

def get_actor_images_from_people_page(people_cd: str, timeout: float = 10.0) -> list[str]:
    """Collect portrait image URLs from a person's KOBIS mobile detail page.

    Args:
        people_cd: KOBIS person code sent as the ``peopleCd`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Absolute URLs of every ``<img>`` whose ``src`` contains
        ``/common/mast/people/``, de-duplicated, in page order.

    Raises:
        requests.HTTPError: If the page answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    DETAIL_URL = "https://www.kobis.or.kr/kobis/mobile/mast/peop/searchPeopleDtl.do"

    # requests waits indefinitely unless a timeout is passed.
    r = session.get(DETAIL_URL, params={"peopleCd": people_cd}, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")

    # A dict keeps insertion order and gives O(1) duplicate checks.
    unique_urls: dict[str, None] = {}
    for img in soup.find_all("img"):
        src = img.get("src") or ""
        # Keep only images served from the person-photo path.
        if "/common/mast/people/" in src:
            # Resolve relative paths against the final (post-redirect) URL.
            unique_urls.setdefault(urljoin(r.url, src))

    return list(unique_urls)

def get_main_actor_images(movie_name: str) -> dict[str, dict[str, str]]:
    """Map each listed actor of a movie to a role name and one portrait URL.

    Pipeline: movie title -> movieCd -> actor entries -> peopleCd -> portrait
    URLs. Actors for whom no peopleCd or no portrait is found are skipped
    after printing a notice.

    Example return value::

        {
            "송강호": {
                "cast": "김기택",
                "image_url": "https://....jpg"
            },
            ...
        }

    Args:
        movie_name: Movie title to search for.

    Returns:
        ``{actor name: {"cast": role name, "image_url": first portrait URL}}``.

    Raises:
        ValueError: If no movie code is found for ``movie_name``.
    """
    movie_cd = get_movie_cd(movie_name)
    if not movie_cd:
        raise ValueError(f"영화명을 찾을 수 없음: {movie_name}")

    portraits: dict[str, dict[str, str]] = {}

    # Actor entries come from the KOBIS movie-info API.
    for entry in get_main_actors(movie_cd):
        actor_name = entry["peopleNm"]
        # Role (character) name; missing or null becomes an empty string.
        role_name = entry.get("cast", "") or ""

        print(f"[+] 주연 배우 처리 중: {actor_name} / 배역: {role_name}")

        # Resolve the person code from actor name + movie title.
        people_cd = get_people_cd(actor_name, movie_name)
        if people_cd:
            # Scrape portrait URLs from the person's detail page.
            photo_urls = get_actor_images_from_people_page(people_cd)
            if photo_urls:
                # The first image serves as the representative portrait.
                portraits[actor_name] = {
                    "cast": role_name,
                    "image_url": photo_urls[0],
                }
            else:
                print(f"    -> 배우 사진 URL을 찾지 못했습니다.")
        else:
            print(f"    -> peopleCd를 찾지 못했습니다.")

    return portraits

In [None]:
# Demo run: crawl the cast of one movie and print the resulting
# {actor name: {"cast": ..., "image_url": ...}} mapping.
movie_title = "기생충"
images_by_actor = get_main_actor_images(movie_title)
print(images_by_actor)

[+] 주연 배우 처리 중: 송강호 / 배역: 기택
[+] 주연 배우 처리 중: 이선균 / 배역: 동익
[+] 주연 배우 처리 중: 조여정 / 배역: 연교
[+] 주연 배우 처리 중: 최우식 / 배역: 기우
[+] 주연 배우 처리 중: 박소담 / 배역: 기정
[+] 주연 배우 처리 중: 이정은 / 배역: 문광
[+] 주연 배우 처리 중: 장혜진 / 배역: 충숙
[+] 주연 배우 처리 중: 박명훈 / 배역: 근세
[+] 주연 배우 처리 중: 정지소 / 배역: 다혜
[+] 주연 배우 처리 중: 정현준 / 배역: 다송
{'송강호': {'cast': '기택', 'image_url': 'https://www.kobis.or.kr/common/mast/people/2017/07/thumb_x110/thn_212ae19d375049fa86ffa72bed520a28.jpg'}, '이선균': {'cast': '동익', 'image_url': 'https://www.kobis.or.kr/common/mast/people/2019/09/thumb_x110/thn_0891637b67014d9a9d6b47556c2ad355.jpg'}, '조여정': {'cast': '연교', 'image_url': 'https://www.kobis.or.kr/common/mast/people/2020/02/thumb_x110/thn_352368f0358b4932974e3578330679a6.jpg'}, '최우식': {'cast': '기우', 'image_url': 'https://www.kobis.or.kr/common/mast/people/2020/01/thumb_x110/thn_1f768834f6834c75b9249328e69a18a9.jpg'}, '박소담': {'cast': '기정', 'image_url': 'https://www.kobis.or.kr/common/mast/people/2019/10/thumb_x110/thn_489ac59594a84cbf8f6d76238f97fbf2.jpg'},

In [None]:
import os

os.makedirs("actor_images", exist_ok=True)

movie_title = "기생충"
actors_info = get_main_actor_images(movie_title)
# actors_info: {actor name: {"cast": role name, "image_url": ...}}

# Spaces, path separators and other characters that are unsafe in file names
# are each replaced by "_". Without this, a "/" in a role name would make
# open() below target a directory that does not exist.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in ' /\\:*?"<>|'})

for actor, info in actors_info.items():
    cast = info.get("cast", "")
    url = info["image_url"]

    # requests waits indefinitely unless a timeout is passed.
    resp = session.get(url, timeout=10)
    resp.raise_for_status()

    # Take the extension from the URL path only, so a query string or
    # fragment can never end up inside the file name.
    url_path = url.split("?", 1)[0].split("#", 1)[0]
    ext = os.path.splitext(url_path)[1] or ".jpg"

    # File name = actor name + role name, e.g. 송강호_김기택.jpg
    # NOTE(review): the gallery-building cell splits the stem on the first
    # "_" and only globs *.jpg, so actor names that contain a space (turned
    # into "_" here) or non-.jpg images will not round-trip cleanly.
    safe_actor = actor.translate(_UNSAFE_FILENAME_CHARS)
    safe_cast = cast.translate(_UNSAFE_FILENAME_CHARS) if cast else ""
    if safe_cast:
        filename = f"actor_images/{safe_actor}_{safe_cast}{ext}"
    else:
        filename = f"actor_images/{safe_actor}{ext}"

    with open(filename, "wb") as f:
        f.write(resp.content)

    print(f"saved: {filename} (배우: {actor}, 배역: {cast})")



[+] 주연 배우 처리 중: 송강호 / 배역: 기택
[+] 주연 배우 처리 중: 이선균 / 배역: 동익
[+] 주연 배우 처리 중: 조여정 / 배역: 연교
[+] 주연 배우 처리 중: 최우식 / 배역: 기우
[+] 주연 배우 처리 중: 박소담 / 배역: 기정
[+] 주연 배우 처리 중: 이정은 / 배역: 문광
[+] 주연 배우 처리 중: 장혜진 / 배역: 충숙
[+] 주연 배우 처리 중: 박명훈 / 배역: 근세
[+] 주연 배우 처리 중: 정지소 / 배역: 다혜
[+] 주연 배우 처리 중: 정현준 / 배역: 다송
saved: actor_images/송강호_기택.jpg (배우: 송강호, 배역: 기택)
saved: actor_images/이선균_동익.jpg (배우: 이선균, 배역: 동익)
saved: actor_images/조여정_연교.jpg (배우: 조여정, 배역: 연교)
saved: actor_images/최우식_기우.jpg (배우: 최우식, 배역: 기우)
saved: actor_images/박소담_기정.jpg (배우: 박소담, 배역: 기정)
saved: actor_images/이정은_문광.jpg (배우: 이정은, 배역: 문광)
saved: actor_images/장혜진_충숙.jpg (배우: 장혜진, 배역: 충숙)
saved: actor_images/박명훈_근세.jpg (배우: 박명훈, 배역: 근세)
saved: actor_images/정지소_다혜.jpg (배우: 정지소, 배역: 다혜)
saved: actor_images/정현준_다송.jpg (배우: 정현준, 배역: 다송)


Appendix: Person labeling

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read
# thumbnails from and write results to paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

!pip install insightface onnxruntime-gpu opencv-python

Mounted at /content/drive
Collecting insightface
  Downloading insightface-0.7.3.tar.gz (439 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.5/439.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting onnx (from insightface)
  Downloading onnx-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime_gpu-1.23.2-cp312-cp312-manylinux_2_27_x86_64.m

In [None]:
import cv2
import json
import numpy as np
from pathlib import Path
from insightface.app import FaceAnalysis

# Face detection + recognition models
# NOTE(review): the saved output of this cell shows a CUDA execution-provider
# error with ctx_id=0 — confirm the runtime really has a usable GPU, or use -1.
app = FaceAnalysis(
    name='buffalo_l',
    root='/content/insightface_model',       # model cache directory
    allowed_modules=['detection', 'recognition']
)
app.prepare(
    ctx_id=0,            # 0 when using a GPU, -1 for CPU only
    det_size=(640, 640)  # reference size input images are resized to
)

download_path: /content/insightface_model/models/buffalo_l
Downloading /content/insightface_model/models/buffalo_l.zip from https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip...


100%|██████████| 281857/281857 [00:05<00:00, 56343.86KB/s]


*************** EP Error ***************
EP Error /onnxruntime_src/onnxruntime/core/providers/cuda/cuda_call.cc:129 std::conditional_t<THRW, void, onnxruntime::common::Status> onnxruntime::CudaCall(ERRTYPE, const char*, const char*, SUCCTYPE, const char*, const char*, int) [with ERRTYPE = cudaError; bool THRW = true; SUCCTYPE = cudaError; std::conditional_t<THRW, void, common::Status> = void] /onnxruntime_src/onnxruntime/core/providers/cuda/cuda_call.cc:121 std::conditional_t<THRW, void, onnxruntime::common::Status> onnxruntime::CudaCall(ERRTYPE, const char*, const char*, SUCCTYPE, const char*, const char*, int) [with ERRTYPE = cudaError; bool THRW = true; SUCCTYPE = cudaError; std::conditional_t<THRW, void, common::Status> = void] CUDA failure 35: CUDA driver version is insufficient for CUDA runtime version ; GPU=-1 ; hostname=a33fec4e780e ; file=/onnxruntime_src/onnxruntime/core/providers/cuda/cuda_execution_provider.cc ; line=282 ; expr=cudaSetDevice(info_.device_id); 

 when using 

In [None]:
ACTOR_DIR = Path("/content/actor_images")

def get_first_face_embedding(img_path: Path, app: FaceAnalysis):
    """Return the embedding of the first face detected in an image file.

    Args:
        img_path: Image file to read with OpenCV.
        app: Prepared face-analysis pipeline whose ``get`` returns the faces.

    Returns:
        ``normed_embedding`` of the first returned face, or ``None`` (after
        printing a warning) when the image cannot be read or has no face.
    """
    image = cv2.imread(str(img_path))
    if image is None:
        print(f"[WARN] 이미지 로드 실패: {img_path}")
        return None

    detected = app.get(image)
    if len(detected) > 0:
        # Only the first detection is used; any further faces are ignored.
        return detected[0].normed_embedding

    print(f"[WARN] 얼굴을 찾지 못함: {img_path}")
    return None

# Build the reference gallery: one embedding per actor portrait, with
# parallel lists holding the actor name and the role name.
gallery_embeddings = []
gallery_actor_labels = []  # actor names
gallery_cast_labels  = []  # role (character) names

for img_file in sorted(ACTOR_DIR.glob("*.jpg")):
    # File stem is '<actor>_<role>', e.g. '송강호_기택'. Split on the first
    # "_" only; a stem without "_" yields an empty role name.
    actor_name, _, cast_name = img_file.stem.partition("_")

    emb = get_first_face_embedding(img_file, app)
    if emb is None:
        continue

    # Defensive re-normalisation so the dot products used for matching are
    # cosine similarities (normed_embedding is presumably unit-length already).
    emb = emb / np.linalg.norm(emb)
    gallery_embeddings.append(emb)
    gallery_actor_labels.append(actor_name)
    gallery_cast_labels.append(cast_name)

# np.vstack raises an opaque ValueError on an empty list; fail with a message
# that says what is actually wrong instead.
if not gallery_embeddings:
    raise RuntimeError(
        f"No face embeddings could be built from *.jpg files in {ACTOR_DIR}"
    )

gallery_embeddings = np.vstack(gallery_embeddings)   # one row per portrait

np.save("/content/gallery_embeddings.npy", gallery_embeddings)
with open("/content/gallery_actor_labels.json", "w", encoding="utf-8") as f:
    json.dump(gallery_actor_labels, f, ensure_ascii=False, indent=2)
with open("/content/gallery_cast_labels.json", "w", encoding="utf-8") as f:
    json.dump(gallery_cast_labels, f, ensure_ascii=False, indent=2)

print("등장인물 DB 인원 수:", len(gallery_actor_labels))
print("배우 라벨:", gallery_actor_labels)
print("배역 라벨:", gallery_cast_labels)

등장인물 DB 인원 수: 10
배우 라벨: ['박명훈', '박소담', '송강호', '이선균', '이정은', '장혜진', '정지소', '정현준', '조여정', '최우식']
배역 라벨: ['근세', '기정', '기택', '동익', '문광', '충숙', '다혜', '다송', '연교', '기우']


In [None]:
def match_embedding_to_gallery(face_emb,
                               gallery_embeddings,
                               gallery_actor_labels,
                               gallery_cast_labels,
                               sim_threshold=0.5):
    """Find the gallery entry most similar to a face embedding.

    Similarity is the dot product between ``face_emb`` and each gallery row,
    which is the cosine similarity when both sides are unit-length.

    Args:
        face_emb: 1-D embedding of the query face.
        gallery_embeddings: One embedding per row; a single 1-D vector is
            treated as a one-row gallery.
        gallery_actor_labels: Actor name for each gallery row.
        gallery_cast_labels: Role name for each gallery row.
        sim_threshold: Minimum similarity for a match to be accepted.

    Returns:
        ``(actor, cast, similarity)`` for the best row. When the best
        similarity is below ``sim_threshold`` the labels are
        ``("Unknown", "")`` and the similarity is still reported. For an
        empty gallery the result is ``("Unknown", "", -inf)``.
    """
    gallery = np.asarray(gallery_embeddings)
    if gallery.size == 0:
        # Nothing to compare against; np.argmax would raise on an empty array.
        return "Unknown", "", float("-inf")

    # Promote a lone 1-D vector to a (1, D) matrix so indexing works below.
    gallery = np.atleast_2d(gallery)

    sims = gallery @ face_emb  # one similarity per gallery row
    best_idx = int(np.argmax(sims))
    best_sim = float(sims[best_idx])

    if best_sim < sim_threshold:
        # Unknown actor, empty role name; keep the score for inspection.
        return "Unknown", "", best_sim
    return gallery_actor_labels[best_idx], gallery_cast_labels[best_idx], best_sim

In [None]:
def label_faces_in_thumbnail(img_path: Path,
                             app: FaceAnalysis,
                             gallery_embeddings: np.ndarray,
                             gallery_actor_labels,
                             gallery_cast_labels,
                             sim_threshold: float = 0.2):
    """Detect every face in a thumbnail and label it against the gallery.

    Args:
        img_path: Thumbnail image to read with OpenCV.
        app: Prepared face-analysis pipeline whose ``get`` returns the faces.
        gallery_embeddings: Reference embeddings, passed through to
            ``match_embedding_to_gallery``.
        gallery_actor_labels: Actor name per gallery entry.
        gallery_cast_labels: Role name per gallery entry.
        sim_threshold: Minimum similarity for a face to receive a label.

    Returns:
        One dict per detected face with keys ``face_id`` (detection index),
        ``bbox`` (four floats taken from the face's ``bbox``), ``actor``,
        ``cast`` and ``similarity``. An empty list when the image cannot be
        read (a warning is printed) or no face is returned.
    """
    image = cv2.imread(str(img_path))
    if image is None:
        print(f"[WARN] 썸네일 로드 실패: {img_path}")
        return []

    labelled = []
    for face_id, face in enumerate(app.get(image)):
        unit_emb = face.normed_embedding
        unit_emb = unit_emb / np.linalg.norm(unit_emb)

        # The original author documents bbox as [x1, y1, x2, y2].
        box = face.bbox
        actor, cast, similarity = match_embedding_to_gallery(
            unit_emb,
            gallery_embeddings,
            gallery_actor_labels,
            gallery_cast_labels,
            sim_threshold
        )

        labelled.append({
            "face_id": face_id,
            "bbox": [float(box[k]) for k in range(4)],
            "actor": actor,
            "cast": cast,
            "similarity": similarity
        })

    return labelled

In [None]:
THUMB_DIR = Path("/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/split_scenes/thumbnails_2")

import csv

csv_path = "/content/face_labeling_0.2.csv"

# One CSV row per detected face; boxes are stored as x, y, width, height.
output_rows = []
for thumb in sorted(THUMB_DIR.glob("*.jpg")):
    detections = label_faces_in_thumbnail(
        thumb,
        app,
        gallery_embeddings,
        gallery_actor_labels,
        gallery_cast_labels,
        sim_threshold=0.2
    )
    for det in detections:
        left, top, right, bottom = det["bbox"]
        output_rows.append([
            thumb.stem,            # shot id = file stem, e.g. '001'
            det["face_id"],
            left, top, right - left, bottom - top,
            det["actor"],          # actor name
            det["cast"],           # role name
            det["similarity"]
        ])

with open(csv_path, "w", newline="", encoding="utf-8") as out_file:
    table = csv.writer(out_file)
    table.writerow([
        "shot_id", "face_id", "x", "y", "w", "h",
        "actor", "cast", "similarity"
    ])
    table.writerows(output_rows)

In [None]:
# Colab 에 나눔폰트 설치
!apt-get -y install fonts-nanum

from PIL import ImageFont, ImageDraw, Image

# Path of the Korean font to use (Colab layout; the font package is installed
# by the apt-get line at the top of this cell)
FONT_PATH = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
KOREAN_FONT = ImageFont.truetype(FONT_PATH, 20)  # adjust the font size as needed

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 2s (6,065 kB/s)
Selecting previously unselected package fonts-nanum.
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...


In [None]:
OUT_DIR = Path("/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def visualize_one_thumbnail(img_path: Path,
                            app: FaceAnalysis,
                            gallery_embeddings,
                            gallery_actor_labels,
                            gallery_cast_labels,
                            sim_threshold: float = 0.2):
    """Draw labelled face boxes on a thumbnail and save it under ``OUT_DIR``.

    Each detected face gets a green rectangle and the caption
    ``"<cast>(<actor>) <similarity>"`` rendered with ``KOREAN_FONT``. A
    thumbnail with no faces is saved unchanged.

    Args:
        img_path: Thumbnail image to read with OpenCV.
        app: Prepared face-analysis pipeline whose ``get`` returns the faces.
        gallery_embeddings: Reference embeddings, passed through to
            ``match_embedding_to_gallery``.
        gallery_actor_labels: Actor name per gallery entry.
        gallery_cast_labels: Role name per gallery entry.
        sim_threshold: Minimum similarity for a face to receive a label.

    Returns:
        Path of the written image (``OUT_DIR / img_path.name``), or ``None``
        (after printing a warning) when the image cannot be read or written.
    """
    # 1) Load the image
    img_bgr = cv2.imread(str(img_path))
    if img_bgr is None:
        # Same warning as label_faces_in_thumbnail, instead of failing silently.
        print(f"[WARN] 썸네일 로드 실패: {img_path}")
        return None

    faces = app.get(img_bgr)
    img_out = img_bgr  # with no faces the original image is saved as-is

    if faces:
        # 2) OpenCV (BGR) -> Pillow (RGB); Pillow draws text with KOREAN_FONT
        img_pil = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
        draw = ImageDraw.Draw(img_pil)

        # 3) Draw a box and a caption for every face
        for f in faces:
            emb = f.normed_embedding
            emb = emb / np.linalg.norm(emb)
            actor_label, cast_label, sim = match_embedding_to_gallery(
                emb,
                gallery_embeddings,
                gallery_actor_labels,
                gallery_cast_labels,
                sim_threshold
            )

            x1, y1, x2, y2 = f.bbox.astype(int)
            # Box (green)
            draw.rectangle([(x1, y1), (x2, y2)], outline=(0, 255, 0), width=2)

            # Caption just above the box, clamped to the top edge of the image
            text = f"{cast_label}({actor_label}) {sim:.2f}"
            text_pos = (x1, max(0, y1 - 25))
            draw.text(text_pos, text, font=KOREAN_FONT, fill=(0, 255, 0))

        # 4) Back to OpenCV (BGR) for saving
        img_out = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    out_path = OUT_DIR / img_path.name
    # cv2.imwrite reports failure through its return value, so check it
    # rather than returning a path to a file that was never written.
    if not cv2.imwrite(str(out_path), img_out):
        print(f"[WARN] 이미지 저장 실패: {out_path}")
        return None
    return out_path

# Visualise every thumbnail and save the result to Drive; the value printed
# per file is whatever visualize_one_thumbnail returns for it.
for img_file in sorted(THUMB_DIR.glob("*.jpg")):
    print(visualize_one_thumbnail(
        img_file,
        app,
        gallery_embeddings,
        gallery_actor_labels,
        gallery_cast_labels,
        sim_threshold=0.2
    ))

/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/001.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/002.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/003.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/004.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/005.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/006.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/007.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/008.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/009.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/TAVE 16th/data/total_thumbnail_face_labeling_0.2/010.jpg
/content/drive/MyDrive/Tave 16기 심화프로젝트/T