In [1]:
import os
import time
import math
import requests
from requests.adapters import HTTPAdapter, Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from dotenv import load_dotenv

# Load variables from the local .env file, then read TMDB_API_KEY.
load_dotenv()
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")

# Fail fast: every request below needs a non-empty key.
if TMDB_API_KEY is None or TMDB_API_KEY == "":
    raise ValueError("TMDB_API_KEY가 설정되지 않았습니다. .env 파일을 확인하세요.")

BASE_URL = "https://api.themoviedb.org/3"

# One shared session whose adapter retries failed requests
# (up to 5 attempts with backoff, also on 429 and 5xx responses).
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session = requests.Session()
for scheme in ("https://", "http://"):
    session.mount(scheme, adapter)


In [2]:
def tmdb_get(path, params=None):
    """
    Shared helper for TMDB GET requests.

    Args:
        path: API path appended to BASE_URL (e.g. "/discover/movie").
        params: Optional dict of query parameters. It is copied, so the
            caller's dict is never modified.

    Returns:
        The decoded JSON body, or None when the request or the JSON
        decoding fails (the error is printed, not raised).

    The api_key is always attached, and "language" defaults to "en-US"
    unless the caller supplies one.
    """
    # Work on a copy: writing into the caller's dict would leak the API key
    # into it and change what the caller sees after the call.
    query = dict(params) if params else {}
    query["api_key"] = TMDB_API_KEY
    query.setdefault("language", "en-US")

    url = f"{BASE_URL}{path}"

    try:
        resp = session.get(url, params=query, timeout=10)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        # Deliberately best-effort: callers treat None as "skip this item".
        # The exception text can include the full request URL (query string
        # with api_key), so redact the key before it reaches the cell output.
        message = str(e)
        if TMDB_API_KEY:
            message = message.replace(TMDB_API_KEY, "***")
        print(f"[tmdb_get error] {url} / {message}")
        return None


In [3]:
def fetch_movies_by_year(year, top_n=500, start_date=None, end_date=None):
    """
    Collect the most-voted movies for one year via TMDB /discover/movie.

    Args:
        year: Release year; used as the filter only when neither
            start_date nor end_date is given.
        top_n: Maximum number of movies to return.
        start_date: Optional lower bound for primary_release_date
            ("YYYY-MM-DD").
        end_date: Optional upper bound for primary_release_date
            ("YYYY-MM-DD").

    Returns:
        A list of at most top_n dicts with the basic listing fields, in
        the order the API returned them (sorted by vote_count descending).
        Paging stops early when a request fails or a page is empty, so
        the list may be shorter than top_n.
    """
    # TMDB discover rejects page numbers above 500; stop there instead of
    # issuing a request that can only fail.
    max_page = 500

    # These filters are identical for every page, so build them once.
    base_params = {
        "sort_by": "vote_count.desc",  # most-voted first (not popularity)
        "include_adult": "false",
        "include_video": "false",
    }
    if start_date or end_date:
        # An explicit date range replaces the plain year filter.
        if start_date:
            base_params["primary_release_date.gte"] = start_date
        if end_date:
            base_params["primary_release_date.lte"] = end_date
    else:
        base_params["primary_release_year"] = year

    results = []
    page = 1

    while len(results) < top_n and page <= max_page:
        data = tmdb_get("/discover/movie", params={**base_params, "page": page})
        if not data or "results" not in data:
            break

        page_results = data["results"]
        if not page_results:
            break

        rows = [
            {
                "id": m.get("id"),
                "title": m.get("title"),
                "original_title": m.get("original_title"),
                "original_language": m.get("original_language"),
                "release_date": m.get("release_date"),
                "popularity": m.get("popularity"),
                "vote_count": m.get("vote_count"),
                "vote_average": m.get("vote_average"),
                "genre_ids": m.get("genre_ids", []),
                "adult": m.get("adult"),
                "video": m.get("video"),
            }
            for m in page_results
        ]
        # Take only as many rows as are still needed to reach top_n.
        results.extend(rows[: top_n - len(results)])

        total_pages = data.get("total_pages", 1)
        if page >= total_pages:
            break

        page += 1
        time.sleep(0.05)  # light throttle between discover pages

    return results


In [4]:
def fetch_movie_full(movie_id: int):
    """
    Fetch and flatten the detail record for a single movie.

    Calls /movie/{id} with append_to_response=credits,keywords,release_dates
    and extracts scalar fields plus name lists (genres, production
    companies/countries, spoken languages, keywords, top three cast
    members, directors). The release_dates block is requested but not
    parsed here.

    Args:
        movie_id: TMDB movie id.

    Returns:
        A flat dict of the extracted fields, or None when the request
        returned nothing.
    """
    params = {
        "append_to_response": "credits,keywords,release_dates"
    }

    data = tmdb_get(f"/movie/{movie_id}", params=params)
    if not data:
        return None

    def _names(items):
        # `or []` also covers a key that is present but null, which a
        # plain .get(key, []) default does not.
        return [item.get("name") for item in (items or [])]

    keywords_block = data.get("keywords")
    if isinstance(keywords_block, dict):
        keywords_list = _names(keywords_block.get("keywords"))
    else:
        keywords_list = []

    credits = data.get("credits")
    if not isinstance(credits, dict):
        credits = {}
    cast_list = credits.get("cast") or []
    crew_list = credits.get("crew") or []

    # First three billed cast members.
    top_cast = _names(cast_list[:3])
    # Every crew member whose job is exactly "Director".
    directors = [c.get("name") for c in crew_list if c.get("job") == "Director"]

    result = {
        "id": data.get("id"),
        "imdb_id": data.get("imdb_id"),
        "title": data.get("title"),
        "original_title": data.get("original_title"),
        "original_language": data.get("original_language"),
        "overview": data.get("overview"),
        "tagline": data.get("tagline"),
        "status": data.get("status"),
        "release_date": data.get("release_date"),
        "runtime": data.get("runtime"),
        "budget": data.get("budget"),
        "revenue": data.get("revenue"),
        "popularity": data.get("popularity"),
        "vote_count": data.get("vote_count"),
        "vote_average": data.get("vote_average"),
        "adult": data.get("adult"),
        "homepage": data.get("homepage"),
        "genres": _names(data.get("genres")),
        "production_companies": _names(data.get("production_companies")),
        "production_countries": _names(data.get("production_countries")),
        "spoken_languages": _names(data.get("spoken_languages")),
        "keywords": keywords_list,
        "top_cast": top_cast,
        # First listed director, kept separately for convenience.
        "director": directors[0] if directors else None,
        "all_directors": directors,
    }

    return result


In [5]:
TOP_N_PER_YEAR = 500

all_list_rows = []

# Full calendar years: 2005 through 2024, one discover sweep per year.
for year in range(2005, 2025):
    print(f"[list] {year}년 수집 시작...")
    movies = fetch_movies_by_year(year, top_n=TOP_N_PER_YEAR)
    # Tag each row with the year it was collected under.
    for m in movies:
        m.update(release_year=year)
    all_list_rows += movies
    print(f"[list] {year}년 수집 완료: {len(movies)}편")

# 2025 is a partial year: January 1 through November 26.
print("[list] 2025년(부분 기간) 수집 시작...")
partial_2025 = {"start_date": "2025-01-01", "end_date": "2025-11-26"}
movies_2025 = fetch_movies_by_year(2025, top_n=TOP_N_PER_YEAR, **partial_2025)
for m in movies_2025:
    m.update(release_year=2025)
all_list_rows += movies_2025
print(f"[list] 2025년 수집 완료: {len(movies_2025)}편")

df_list = pd.DataFrame(all_list_rows)
print("연도별 Top 리스트(중복 포함) 행 수:", len(df_list))

# Keep only the first occurrence of each movie id.
before = len(df_list)
df_list = df_list.drop_duplicates(subset="id", keep="first")
after = len(df_list)
print(f"id 기준 중복 제거: {before} → {after}")

preview_cols = ["id", "title", "release_date", "release_year", "popularity"]
df_list.loc[:, preview_cols].head()


[list] 2005년 수집 시작...
[list] 2005년 수집 완료: 500편
[list] 2006년 수집 시작...
[list] 2006년 수집 완료: 500편
[list] 2007년 수집 시작...
[list] 2007년 수집 완료: 500편
[list] 2008년 수집 시작...
[list] 2008년 수집 완료: 500편
[list] 2009년 수집 시작...
[list] 2009년 수집 완료: 500편
[list] 2010년 수집 시작...
[list] 2010년 수집 완료: 500편
[list] 2011년 수집 시작...
[list] 2011년 수집 완료: 500편
[list] 2012년 수집 시작...
[list] 2012년 수집 완료: 500편
[list] 2013년 수집 시작...
[list] 2013년 수집 완료: 500편
[list] 2014년 수집 시작...
[list] 2014년 수집 완료: 500편
[list] 2015년 수집 시작...
[list] 2015년 수집 완료: 500편
[list] 2016년 수집 시작...
[list] 2016년 수집 완료: 500편
[list] 2017년 수집 시작...
[list] 2017년 수집 완료: 500편
[list] 2018년 수집 시작...
[list] 2018년 수집 완료: 500편
[list] 2019년 수집 시작...
[list] 2019년 수집 완료: 500편
[list] 2020년 수집 시작...
[list] 2020년 수집 완료: 500편
[list] 2021년 수집 시작...
[list] 2021년 수집 완료: 500편
[list] 2022년 수집 시작...
[list] 2022년 수집 완료: 500편
[list] 2023년 수집 시작...
[list] 2023년 수집 완료: 500편
[list] 2024년 수집 시작...
[list] 2024년 수집 완료: 500편
[list] 2025년(부분 기간) 수집 시작...
[list] 2025년 수집 완료: 500편
연도별 To

Unnamed: 0,id,title,release_date,release_year,popularity
0,272,Batman Begins,2005-06-10,2005,14.3261
1,674,Harry Potter and the Goblet of Fire,2005-11-16,2005,23.2812
2,118,Charlie and the Chocolate Factory,2005-07-13,2005,14.6816
3,1895,Star Wars: Episode III - Revenge of the Sith,2005-05-17,2005,10.4083
4,953,Madagascar,2005-05-25,2005,14.5757


In [6]:
# Every unique movie id as a plain Python int, plus the accumulators
# and the start timestamp used by the detail-collection loop.
movie_ids = df_list["id"].astype(int).to_list()
total = len(movie_ids)

detail_rows = list()
start = time.time()

def fetch_one(mid):
    """
    Thread-pool wrapper around fetch_movie_full.

    Returns whatever fetch_movie_full returns for `mid`; if it raises,
    the error is printed and None is returned instead.
    """
    try:
        movie = fetch_movie_full(mid)
    except Exception as exc:
        print(f"[error] id={mid}, {exc}")
        movie = None
    return movie

# Keep the pool small (roughly 4-6 workers) with TMDB's rate limit in mind.
MAX_WORKERS = 5

print(f"[detail] 전체 대상 편수: {total}, 워커 수: {MAX_WORKERS}")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit every id up front, remembering which future belongs to which id.
    future_to_id = {}
    for mid in movie_ids:
        future_to_id[executor.submit(fetch_one, mid)] = mid

    done = 0
    for future in as_completed(future_to_id):
        done += 1
        row = future.result()
        # Failed fetches come back as None and are left out.
        if row is not None:
            detail_rows.append(row)

        # Progress line every 50 movies and once more at the very end.
        if done == total or done % 50 == 0:
            minutes = (time.time() - start) / 60
            print(f"[detail] {done} / {total} 편 처리 완료 (경과 {minutes:.1f}분)")

df_detail = pd.DataFrame(detail_rows)
print("상세정보 수집 행 수:", len(df_detail))
df_detail.head()


[detail] 전체 대상 편수: 10500, 워커 수: 5
[detail] 50 / 10500 편 처리 완료 (경과 0.1분)
[detail] 100 / 10500 편 처리 완료 (경과 0.1분)
[detail] 150 / 10500 편 처리 완료 (경과 0.1분)
[detail] 200 / 10500 편 처리 완료 (경과 0.2분)
[detail] 250 / 10500 편 처리 완료 (경과 0.2분)
[detail] 300 / 10500 편 처리 완료 (경과 0.3분)
[detail] 350 / 10500 편 처리 완료 (경과 0.3분)
[detail] 400 / 10500 편 처리 완료 (경과 0.3분)
[detail] 450 / 10500 편 처리 완료 (경과 0.4분)
[detail] 500 / 10500 편 처리 완료 (경과 0.4분)
[detail] 550 / 10500 편 처리 완료 (경과 0.5분)
[detail] 600 / 10500 편 처리 완료 (경과 0.5분)
[detail] 650 / 10500 편 처리 완료 (경과 0.6분)
[detail] 700 / 10500 편 처리 완료 (경과 0.6분)
[detail] 750 / 10500 편 처리 완료 (경과 0.6분)
[detail] 800 / 10500 편 처리 완료 (경과 0.7분)
[detail] 850 / 10500 편 처리 완료 (경과 0.7분)
[detail] 900 / 10500 편 처리 완료 (경과 0.8분)
[detail] 950 / 10500 편 처리 완료 (경과 0.8분)
[detail] 1000 / 10500 편 처리 완료 (경과 0.8분)
[detail] 1050 / 10500 편 처리 완료 (경과 0.9분)
[detail] 1100 / 10500 편 처리 완료 (경과 0.9분)
[detail] 1150 / 10500 편 처리 완료 (경과 1.0분)
[detail] 1200 / 10500 편 처리 완료 (경과 1.0분)
[detail] 1250 / 10500 편 처리

Unnamed: 0,id,imdb_id,title,original_title,original_language,overview,tagline,status,release_date,runtime,...,adult,homepage,genres,production_companies,production_countries,spoken_languages,keywords,top_cast,director,all_directors
0,272,tt0372784,Batman Begins,Batman Begins,en,"Driven by tragedy, billionaire Bruce Wayne ded...",Evil fears the knight.,Released,2005-06-10,140,...,False,https://www.warnerbros.com/movies/batman-begins/,"[Drama, Crime, Action]","[Warner Bros. Pictures, DC, Syncopy, Legendary...","[United Kingdom, United States of America]","[English, اردو, 普通话]","[martial arts, undercover, loss of loved one, ...","[Christian Bale, Michael Caine, Liam Neeson]",Christopher Nolan,[Christopher Nolan]
1,411,tt0363771,"The Chronicles of Narnia: The Lion, the Witch ...","The Chronicles of Narnia: The Lion, the Witch ...",en,"Siblings Lucy, Edmund, Susan and Peter step th...",The beloved masterpiece comes to life.,Released,2005-12-07,143,...,False,,"[Adventure, Family, Fantasy]","[Walt Disney Pictures, Walden Media, C.S. Lewi...","[United Kingdom, United States of America]","[English, Deutsch]","[witch, sibling relationship, saving the world...","[William Moseley, Anna Popplewell, Skandar Key...",Andrew Adamson,[Andrew Adamson]
2,1895,tt0121766,Star Wars: Episode III - Revenge of the Sith,Star Wars: Episode III - Revenge of the Sith,en,The evil Darth Sidious enacts his final plan f...,Fear leads to anger. Anger leads to hate. Hate...,Released,2005-05-17,140,...,False,http://www.starwars.com/films/star-wars-episod...,"[Adventure, Action, Science Fiction]",[Lucasfilm Ltd.],[United States of America],[English],"[showdown, lava, fight, politics, volcano, cho...","[Hayden Christensen, Ewan McGregor, Natalie Po...",George Lucas,[George Lucas]
3,118,tt0367594,Charlie and the Chocolate Factory,Charlie and the Chocolate Factory,en,A young boy wins a tour through the most magni...,Prepare for a taste of adventure.,Released,2005-07-13,115,...,False,https://www.warnerbros.com/charlie-and-chocola...,"[Adventure, Comedy, Family, Fantasy]","[Warner Bros. Pictures, Village Roadshow Pictu...","[United Kingdom, United States of America]",[English],"[factory worker, london, england, based on nov...","[Johnny Depp, Freddie Highmore, David Kelly]",Tim Burton,[Tim Burton]
4,953,tt0351283,Madagascar,Madagascar,en,Four animal friends get a taste of the wild li...,They weren't born in the wild... They were shi...,Released,2005-05-25,86,...,False,https://www.dreamworks.com/movies/madagascar,"[Family, Animation, Adventure, Comedy]","[Pacific Data Images, DreamWorks Animation]",[United States of America],"[English, Français, Deutsch, Português, Español]","[friendship, island, escape, africa, lion, zoo...","[Ben Stiller, Chris Rock, David Schwimmer]",Eric Darnell,"[Eric Darnell, Tom McGrath]"


In [7]:
# df_list holds release_year plus the basic listing fields;
# df_detail holds the per-movie detail fields.
# Left-merge on id so every detail row is kept. Columns present in both
# frames get the "_detail" / "_list" suffixes.
list_columns = [
    "id",
    "release_year",
    "title",
    "original_title",
    "popularity",
    "vote_count",
    "vote_average",
]

df_final = pd.merge(
    df_detail,
    df_list[list_columns],
    on="id",
    how="left",
    suffixes=("_detail", "_list"),
    # df_list was de-duplicated on id earlier; fail loudly if that ever
    # stops being true rather than silently multiplying rows.
    validate="many_to_one",
)

print("최종 데이터프레임 행/열:", df_final.shape)

# Save as CSV (change the file name as needed); utf-8-sig writes a BOM.
output_path = "tmdb_top500_per_year_full_2005_2025.csv"
df_final.to_csv(output_path, index=False, encoding="utf-8-sig")
print("저장 완료:", output_path)

# The preview must be the cell's last expression to be rendered;
# in the middle of the cell it was silently discarded.
df_final.head()


최종 데이터프레임 행/열: (10500, 31)
저장 완료: tmdb_top500_per_year_full_2005_2025.csv


In [13]:
# Shape of the subset where both revenue and budget are greater than 1.
has_financials = df_final["revenue"].gt(1) & df_final["budget"].gt(1)
df_final.loc[has_financials].shape

(4533, 31)

In [15]:
# Show the merged DataFrame as the cell output.
# NOTE(review): consider df_final.head() or .sample(5) to keep the saved output small.
df_final

Unnamed: 0,id,imdb_id,title_detail,original_title_detail,original_language,overview,tagline,status,release_date,runtime,...,keywords,top_cast,director,all_directors,release_year,title_list,original_title_list,popularity_list,vote_count_list,vote_average_list
0,272,tt0372784,Batman Begins,Batman Begins,en,"Driven by tragedy, billionaire Bruce Wayne ded...",Evil fears the knight.,Released,2005-06-10,140,...,"[martial arts, undercover, loss of loved one, ...","[Christian Bale, Michael Caine, Liam Neeson]",Christopher Nolan,[Christopher Nolan],2005,Batman Begins,Batman Begins,14.3261,21981,7.718
1,411,tt0363771,"The Chronicles of Narnia: The Lion, the Witch ...","The Chronicles of Narnia: The Lion, the Witch ...",en,"Siblings Lucy, Edmund, Susan and Peter step th...",The beloved masterpiece comes to life.,Released,2005-12-07,143,...,"[witch, sibling relationship, saving the world...","[William Moseley, Anna Popplewell, Skandar Key...",Andrew Adamson,[Andrew Adamson],2005,"The Chronicles of Narnia: The Lion, the Witch ...","The Chronicles of Narnia: The Lion, the Witch ...",19.6752,11165,7.100
2,1895,tt0121766,Star Wars: Episode III - Revenge of the Sith,Star Wars: Episode III - Revenge of the Sith,en,The evil Darth Sidious enacts his final plan f...,Fear leads to anger. Anger leads to hate. Hate...,Released,2005-05-17,140,...,"[showdown, lava, fight, politics, volcano, cho...","[Hayden Christensen, Ewan McGregor, Natalie Po...",George Lucas,[George Lucas],2005,Star Wars: Episode III - Revenge of the Sith,Star Wars: Episode III - Revenge of the Sith,10.4083,14480,7.455
3,118,tt0367594,Charlie and the Chocolate Factory,Charlie and the Chocolate Factory,en,A young boy wins a tour through the most magni...,Prepare for a taste of adventure.,Released,2005-07-13,115,...,"[factory worker, london, england, based on nov...","[Johnny Depp, Freddie Highmore, David Kelly]",Tim Burton,[Tim Burton],2005,Charlie and the Chocolate Factory,Charlie and the Chocolate Factory,14.6816,15763,7.040
4,953,tt0351283,Madagascar,Madagascar,en,Four animal friends get a taste of the wild li...,They weren't born in the wild... They were shi...,Released,2005-05-25,86,...,"[friendship, island, escape, africa, lion, zoo...","[Ben Stiller, Chris Rock, David Schwimmer]",Eric Darnell,"[Eric Darnell, Tom McGrath]",2005,Madagascar,Madagascar,14.5757,11368,6.916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10495,1429738,tt35669044,Tee Yai: Born to Be Bad,ตี๋ใหญ่ ฤกษ์ดาวโจร,th,"In 1980s Bangkok, a wily thief stages a series...","Meet the ""King of Thieves"", whose life is comi...",Released,2025-11-12,117,...,[],"[Nattawin Wattanagitiphat, Witsarut Himmarat, ...",Nonzee Nimibutr,[Nonzee Nimibutr],2025,Tee Yai: Born to Be Bad,ตี๋ใหญ่ ฤกษ์ดาวโจร,66.4793,35,4.800
10496,1313229,tt35519445,Nino,Nino,fr,Nino is a young man on a journey to reconnect ...,,Released,2025-09-17,96,...,"[paris, france, cancer, paris, radiology]","[Théodore Pellerin, Salomé Dewaels, Jeanne Bal...",Pauline Loquès,[Pauline Loquès],2025,Nino,Nino,4.6970,35,7.429
10497,1313003,tt32478708,Close To Me,Muori di lei,it,Luca is a forty-year-old teacher experiencing ...,,Released,2025-03-20,103,...,[],"[Riccardo Scamarcio, Mariela Garriga, Maria Ch...",Stefano Sardo,[Stefano Sardo],2025,Close To Me,Muori di lei,3.4445,35,5.686
10498,1309410,tt35404638,100 Million!,100 Millions !,fr,A printing house worker and union leader inher...,,Released,2025-03-26,97,...,[],"[Kad Merad, Michèle Laroque, Martin Karmann]",Nath Dumont,[Nath Dumont],2025,100 Million!,100 Millions !,2.1742,35,4.257
