영화 드라마 BERTopic 분석 파일

실행 확인 완료

### 설정

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
from preprocessing.topics.topic_modeler import TopicModeler
# 프로젝트 모듈
from preprocessing.topics.utils import (
    cluster_topics
)

# Configuration: input parquet files (paths are relative to the working directory).
DRAMA_EMBEDDINGS_PATH = "files/drama/08_drama_text_embedding.parquet"
MOVIE_EMBEDDINGS_PATH = "files/movie/09_movie_text_embedding.parquet"
HIT_SCORE_PATH = "files/00_hit_score.parquet"

# Root directory used when writing the topic-cluster CSVs below; created up front
# (including parents) so a fresh checkout does not fail on a missing folder.
OUTPUT_DIR = "files/bertopic_result"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [2]:
def load_data_for_bertopic(embeddings_path, hit_score_path, content_type='drama'):
    """Load an embeddings table and join it with hit scores.

    Args:
        embeddings_path: Parquet file with the embeddings table. It is joined
            on its ``imdb_id`` column.
        hit_score_path: Parquet file providing ``imdb_id`` and ``hit_score``.
        content_type: Name of the dataset being loaded (e.g. ``'drama'`` or
            ``'movie'``). Used only to identify the dataset in error messages.

    Returns:
        pandas.DataFrame: the inner join of both tables on ``imdb_id`` with
        every row removed where ``overview``, ``embedding`` or ``hit_score``
        is null.

    Raises:
        KeyError: if ``overview``, ``embedding`` or ``hit_score`` is not
            present after the merge (null filtering would be impossible).
    """
    # 1. Load the embeddings table.
    df_embeddings = pd.read_parquet(embeddings_path)

    # 2. Load the hit scores.
    df_hit = pd.read_parquet(hit_score_path)

    # 3. Keep only titles present in both tables.
    df_merged = df_embeddings.merge(df_hit[['imdb_id', 'hit_score']], on='imdb_id', how='inner')

    # 4. Report whether every expected column is available.
    required_cols = ['imdb_id', 'title', 'overview', 'embedding', 'hit_score']
    non_null_cols = ['overview', 'embedding', 'hit_score']

    missing_cols = set(required_cols) - set(df_merged.columns)
    if missing_cols:
        print(f"누락된 컬럼: {missing_cols}")
    else:
        print("필수 컬럼 모두 존재")

    # Fail with an explicit message instead of letting dropna() raise an
    # opaque KeyError a few lines later. A missing 'title' is only reported,
    # as before, because nothing in this function needs it.
    unusable_cols = sorted(missing_cols.intersection(non_null_cols))
    if unusable_cols:
        raise KeyError(
            f"[{content_type}] cannot filter null rows, missing columns: {unusable_cols}"
        )

    # 5. Drop rows that lack any of the values needed downstream.
    before_len = len(df_merged)
    df_merged = df_merged.dropna(subset=non_null_cols)
    after_len = len(df_merged)

    if before_len > after_len:
        print(f"결측치 제거: {before_len - after_len:,}개")
    else:
        print("결측치 없음")

    return df_merged

### 파일 로드 및 라벨링

In [3]:
# Load the drama embeddings joined with their hit scores.
drama_data = load_data_for_bertopic(
    DRAMA_EMBEDDINGS_PATH,
    HIT_SCORE_PATH,
    content_type='drama',
)

# Cut-offs taken from the hit_score distribution of this dataset.
hit_threshold = drama_data['hit_score'].quantile(0.8)   # at or above -> 'hit' (top 20%)
flop_threshold = drama_data['hit_score'].quantile(0.4)  # at or below -> 'flop' (bottom 40%)

# Label every row; the flop rule is applied last, so it wins if both rules match.
drama_data['label'] = 'normal'
drama_data.loc[drama_data['hit_score'].ge(hit_threshold), 'label'] = 'hit'
drama_data.loc[drama_data['hit_score'].le(flop_threshold), 'label'] = 'flop'

필수 컬럼 모두 존재
결측치 없음


In [4]:
# Load the movie embeddings joined with their hit scores.
movie_data = load_data_for_bertopic(
    MOVIE_EMBEDDINGS_PATH,
    HIT_SCORE_PATH,
    content_type='movie',
)

# Cut-offs taken from the hit_score distribution of this dataset.
hit_threshold = movie_data['hit_score'].quantile(0.8)   # at or above -> 'hit' (top 20%)
flop_threshold = movie_data['hit_score'].quantile(0.4)  # at or below -> 'flop' (bottom 40%)

# Label every row; the flop rule is applied last, so it wins if both rules match.
movie_data['label'] = 'normal'
movie_data.loc[movie_data['hit_score'].ge(hit_threshold), 'label'] = 'hit'
movie_data.loc[movie_data['hit_score'].le(flop_threshold), 'label'] = 'flop'

필수 컬럼 모두 존재
결측치 없음


### 드라마

In [5]:
# Topic model over ALL dramas (no hit/flop filtering in this cell).
drama_modeler = TopicModeler(data=drama_data, type_name='drama')
drama_modeler.fit_transform()

# Group the fitted topics with cluster_topics (n_groups=5).
drama_clusters, drama_summary = cluster_topics(
    topic_model=drama_modeler.bertopic_model,
    n_groups=5,
)

# Persist the modeler's results, then write the topic-cluster table next to them.
drama_modeler.save_results(save_point='drama_total')
drama_clusters.to_csv(
    Path(OUTPUT_DIR) / "drama_total" / "topic_clusters.csv",
    index=False,
    encoding='utf-8-sig',
)

2025-12-29 13:27:17,461 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-29 13:27:35,227 - BERTopic - Dimensionality - Completed ✓
2025-12-29 13:27:35,229 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-29 13:27:35,303 - BERTopic - Cluster - Completed ✓
2025-12-29 13:27:35,305 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-29 13:27:35,566 - BERTopic - Representation - Completed ✓



[클러스터 요약]
 cluster                    topic_num  cnt
       0 [1, 2, 7, 8, 11, 12, 18, 19] 1157
       1   [0, 6, 10, 14, 15, 16, 20] 1227
       2                       [5, 9]  237
       3                   [3, 4, 17]  471
       4                         [13]   52
  ✓ files/bertopic_result/drama_total/drama_total_topics.parquet
  ✓ files/bertopic_result/drama_total/drama_total_topic_info.csv
  ✓ files/bertopic_result/drama_total/bertopic_model/
  ✓ files/bertopic_result/drama_total/topics_barchart.html
  ✓ files/bertopic_result/drama_total/topics_intertopic.html


100%|██████████| 20/20 [00:00<00:00, 409.45it/s]


  ✓ files/bertopic_result/drama_total/topics_hierarchy.html
  ✓ files/bertopic_result/drama_total/topics_heatmap.html
  ✓ files/bertopic_result/drama_total/topics_documents.html


In [6]:
# Keep only the dramas labelled 'hit' (copied so the subset is independent of drama_data).
drama_hit = drama_data.loc[drama_data['label'].eq('hit')].copy()

# Fit a topic model on the hit subset.
drama_hit_modeler = TopicModeler(data=drama_hit, type_name='drama')
drama_hit_modeler.fit_transform()

# Group the fitted topics with cluster_topics (n_groups=5).
drama_hit_clusters, drama_hit_summary = cluster_topics(
    topic_model=drama_hit_modeler.bertopic_model,
    n_groups=5,
)

# Persist the modeler's results, then write the topic-cluster table next to them.
drama_hit_modeler.save_results(save_point='drama_hit')
drama_hit_clusters.to_csv(
    Path(OUTPUT_DIR) / "drama_hit" / "topic_clusters.csv",
    index=False,
    encoding='utf-8-sig',
)

2025-12-29 13:27:48,213 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-29 13:27:49,329 - BERTopic - Dimensionality - Completed ✓
2025-12-29 13:27:49,333 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-29 13:27:49,350 - BERTopic - Cluster - Completed ✓
2025-12-29 13:27:49,355 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-29 13:27:49,409 - BERTopic - Representation - Completed ✓



[클러스터 요약]
 cluster topic_num  cnt
       0 [0, 4, 7]  223
       1 [3, 6, 8]  176
       2       [2]   65
       3       [5]   44
       4       [1]  119
  ✓ files/bertopic_result/drama_hit/drama_hit_topics.parquet
  ✓ files/bertopic_result/drama_hit/drama_hit_topic_info.csv
  ✓ files/bertopic_result/drama_hit/bertopic_model/
  ✓ files/bertopic_result/drama_hit/topics_barchart.html
  ✓ files/bertopic_result/drama_hit/topics_intertopic.html


100%|██████████| 8/8 [00:00<00:00, 481.23it/s]


  ✓ files/bertopic_result/drama_hit/topics_hierarchy.html
  ✓ files/bertopic_result/drama_hit/topics_heatmap.html
  ✓ files/bertopic_result/drama_hit/topics_documents.html


In [7]:
# Keep only the dramas labelled 'flop' (copied so the subset is independent of drama_data).
drama_flop = drama_data.loc[drama_data['label'].eq('flop')].copy()

# Fit a topic model on the flop subset.
drama_flop_modeler = TopicModeler(data=drama_flop, type_name='drama')
drama_flop_modeler.fit_transform()

# Group the fitted topics with cluster_topics (n_groups=5).
drama_flop_clusters, drama_flop_summary = cluster_topics(
    topic_model=drama_flop_modeler.bertopic_model,
    n_groups=5,
)

# Persist the modeler's results, then write the topic-cluster table next to them.
drama_flop_modeler.save_results(save_point='drama_flop')
drama_flop_clusters.to_csv(
    Path(OUTPUT_DIR) / "drama_flop" / "topic_clusters.csv",
    index=False,
    encoding='utf-8-sig',
)

2025-12-29 13:27:50,454 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-29 13:27:53,503 - BERTopic - Dimensionality - Completed ✓
2025-12-29 13:27:53,508 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-29 13:27:53,547 - BERTopic - Cluster - Completed ✓
2025-12-29 13:27:53,552 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-29 13:27:53,692 - BERTopic - Representation - Completed ✓



[클러스터 요약]
 cluster                    topic_num  cnt
       0                   [2, 7, 13]  216
       1 [1, 3, 6, 8, 12, 14, 15, 17]  598
       2            [0, 4, 9, 10, 11]  361
       3                          [5]   67
       4                         [16]   22
  ✓ files/bertopic_result/drama_flop/drama_flop_topics.parquet
  ✓ files/bertopic_result/drama_flop/drama_flop_topic_info.csv
  ✓ files/bertopic_result/drama_flop/bertopic_model/
  ✓ files/bertopic_result/drama_flop/topics_barchart.html
  ✓ files/bertopic_result/drama_flop/topics_intertopic.html


100%|██████████| 17/17 [00:00<00:00, 287.04it/s]


  ✓ files/bertopic_result/drama_flop/topics_hierarchy.html
  ✓ files/bertopic_result/drama_flop/topics_heatmap.html
  ✓ files/bertopic_result/drama_flop/topics_documents.html


### 영화

In [10]:

# Topic model over ALL movies (no hit/flop filtering in this cell).
movie_modeler = TopicModeler(
    data=movie_data,
    type_name='movie'
)

movie_modeler.fit_transform()

# The results describe the full movie set, so they get their own names
# (mirroring drama_clusters / drama_summary). Binding them to the
# movie_hit_* names would let the hit-only cell below silently overwrite them.
movie_clusters, movie_summary = cluster_topics(
    topic_model=movie_modeler.bertopic_model,
    n_groups=5
)

# Persist the modeler's results, then write the topic-cluster table next to them.
movie_modeler.save_results(save_point="movie_total")
movie_clusters.to_csv(f"{OUTPUT_DIR}/movie_total/topic_clusters.csv", index=False, encoding='utf-8-sig')

2025-12-29 13:29:43,479 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-29 13:30:01,971 - BERTopic - Dimensionality - Completed ✓
2025-12-29 13:30:01,994 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-29 13:30:03,204 - BERTopic - Cluster - Completed ✓
2025-12-29 13:30:03,228 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-29 13:30:06,575 - BERTopic - Representation - Completed ✓



[클러스터 요약]
 cluster                            topic_num   cnt
       0 [0, 5, 6, 7, 12, 13, 16, 18, 19, 20] 11677
       1                        [1, 3, 9, 17]  5660
       2                      [4, 11, 14, 15]  5021
       3                                  [2]  1579
       4                              [8, 10]  1515




  ✓ files/bertopic_result/movie_total/movie_total_topics.parquet
  ✓ files/bertopic_result/movie_total/movie_total_topic_info.csv
  ✓ files/bertopic_result/movie_total/bertopic_model/
  ✓ files/bertopic_result/movie_total/topics_barchart.html
  ✓ files/bertopic_result/movie_total/topics_intertopic.html


100%|██████████| 20/20 [00:00<00:00, 79.16it/s]


  ✓ files/bertopic_result/movie_total/topics_hierarchy.html
  ✓ files/bertopic_result/movie_total/topics_heatmap.html
  ✓ files/bertopic_result/movie_total/topics_documents.html


In [11]:
# Keep only the movies labelled 'hit' (copied so the subset is independent of movie_data).
movie_hit = movie_data.loc[movie_data['label'].eq('hit')].copy()

# Fit a topic model on the hit subset.
movie_hit_modeler = TopicModeler(data=movie_hit, type_name='movie')
movie_hit_modeler.fit_transform()

# Group the fitted topics with cluster_topics (n_groups=5).
movie_hit_clusters, movie_hit_summary = cluster_topics(
    topic_model=movie_hit_modeler.bertopic_model,
    n_groups=5,
)

# Persist the modeler's results, then write the topic-cluster table next to them.
movie_hit_modeler.save_results(save_point="movie_hit")
movie_hit_clusters.to_csv(
    Path(OUTPUT_DIR) / "movie_hit" / "topic_clusters.csv",
    index=False,
    encoding='utf-8-sig',
)

2025-12-29 13:30:20,973 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-29 13:30:26,701 - BERTopic - Dimensionality - Completed ✓
2025-12-29 13:30:26,702 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-29 13:30:26,823 - BERTopic - Cluster - Completed ✓
2025-12-29 13:30:26,825 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-29 13:30:27,241 - BERTopic - Representation - Completed ✓



[클러스터 요약]
 cluster                             topic_num  cnt
       0 [0, 3, 4, 5, 6, 8, 9, 11, 12, 16, 18] 3247
       1                          [14, 15, 19]  487
       2                                  [17]   91
       3                              [10, 13]  314
       4                             [1, 2, 7]  949
  ✓ files/bertopic_result/movie_hit/movie_hit_topics.parquet
  ✓ files/bertopic_result/movie_hit/movie_hit_topic_info.csv
  ✓ files/bertopic_result/movie_hit/bertopic_model/
  ✓ files/bertopic_result/movie_hit/topics_barchart.html
  ✓ files/bertopic_result/movie_hit/topics_intertopic.html


100%|██████████| 19/19 [00:00<00:00, 288.64it/s]


  ✓ files/bertopic_result/movie_hit/topics_hierarchy.html
  ✓ files/bertopic_result/movie_hit/topics_heatmap.html
  ✓ files/bertopic_result/movie_hit/topics_documents.html


In [12]:
# Keep only the movies labelled 'flop' (copied so the subset is independent of movie_data).
movie_flop = movie_data.loc[movie_data['label'].eq('flop')].copy()

# Fit a topic model on the flop subset.
movie_flop_modeler = TopicModeler(data=movie_flop, type_name='movie')
movie_flop_modeler.fit_transform()

# Group the fitted topics with cluster_topics (n_groups=5).
movie_flop_clusters, movie_flop_summary = cluster_topics(
    topic_model=movie_flop_modeler.bertopic_model,
    n_groups=5,
)

# Persist the modeler's results, then write the topic-cluster table next to them.
movie_flop_modeler.save_results(save_point='movie_flop')
movie_flop_clusters.to_csv(
    Path(OUTPUT_DIR) / "movie_flop" / "topic_clusters.csv",
    index=False,
    encoding='utf-8-sig',
)

2025-12-29 13:30:30,709 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-29 13:30:36,308 - BERTopic - Dimensionality - Completed ✓
2025-12-29 13:30:36,309 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-29 13:30:36,538 - BERTopic - Cluster - Completed ✓
2025-12-29 13:30:36,541 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-29 13:30:37,501 - BERTopic - Representation - Completed ✓



[클러스터 요약]
 cluster                    topic_num  cnt
       0 [0, 1, 2, 3, 10, 11, 12, 18] 5465
       1                      [9, 17]  826
       2    [5, 6, 8, 13, 14, 15, 16] 3191
       3                          [7]  348
       4                          [4]  410
  ✓ files/bertopic_result/movie_flop/movie_flop_topics.parquet
  ✓ files/bertopic_result/movie_flop/movie_flop_topic_info.csv
  ✓ files/bertopic_result/movie_flop/bertopic_model/
  ✓ files/bertopic_result/movie_flop/topics_barchart.html
  ✓ files/bertopic_result/movie_flop/topics_intertopic.html


100%|██████████| 18/18 [00:00<00:00, 162.53it/s]


  ✓ files/bertopic_result/movie_flop/topics_hierarchy.html
  ✓ files/bertopic_result/movie_flop/topics_heatmap.html
  ✓ files/bertopic_result/movie_flop/topics_documents.html
