### 설정

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# 프로젝트 모듈
from preprocessing.topics.topic_modeler import TopicModeler
from preprocessing.topics.utils import (
    cluster_topics,
    create_topic_summary,
    save_topic_keywords,
    create_drama_umap_map
)

# Configuration: input parquet files (text embeddings per content type, shared hit scores).
DRAMA_EMBEDDINGS_PATH = "files/drama/drama_text_embedding_qwen3.parquet"
MOVIE_EMBEDDINGS_PATH = "files/movie/movie_text_embedding_qwen3.parquet"
HIT_SCORE_PATH = "files/00_hit_score.parquet"

# Root directory used by the cells below for the topic-cluster CSVs; created up front.
# NOTE(review): the recorded TopicModeler.save_results output in this notebook lists
# "files/bertopic_result/..." (no trailing "s") — confirm the two roots are meant to differ.
OUTPUT_DIR = "files/bertopic_results"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [2]:
def load_data_for_bertopic(embeddings_path, hit_score_path, content_type='drama'):
    """Load text embeddings, attach hit scores, and drop incomplete rows.

    Args:
        embeddings_path: Parquet file with the embeddings table. It must contain
            ``imdb_id`` (the join key) and is expected to carry ``title``,
            ``overview`` and ``embedding``.
        hit_score_path: Parquet file containing ``imdb_id`` and ``hit_score``.
        content_type: Label of the dataset being loaded (``'drama'`` /
            ``'movie'``). Accepted for call-site readability; it does not
            influence loading.

    Returns:
        The inner join of both tables on ``imdb_id`` with every row that has a
        missing ``overview``, ``embedding`` or ``hit_score`` removed.

    Raises:
        KeyError: If ``overview``, ``embedding`` or ``hit_score`` is absent
            from the merged table (for example because the embeddings file
            already had its own ``hit_score`` column and the merge suffixed it).
    """
    # 1. Load the embeddings.
    df_embeddings = pd.read_parquet(embeddings_path)

    # 2. Load the hit scores.
    df_hit = pd.read_parquet(hit_score_path)

    # 3. Merge: keep only titles present in both tables.
    df_merged = df_embeddings.merge(df_hit[['imdb_id', 'hit_score']], on='imdb_id', how='inner')

    # 4. Check the required columns.
    required_cols = ['imdb_id', 'title', 'overview', 'embedding', 'hit_score']
    dropna_cols = ['overview', 'embedding', 'hit_score']

    missing_cols = set(required_cols) - set(df_merged.columns)
    if missing_cols:
        print(f"누락된 컬럼: {missing_cols}")
        # dropna() below cannot run without these columns; fail here with a
        # message that names them instead of an opaque pandas KeyError.
        blocking = sorted(missing_cols.intersection(dropna_cols))
        if blocking:
            raise KeyError(
                f"load_data_for_bertopic: merged data is missing required column(s) {blocking}"
            )
    else:
        print("필수 컬럼 모두 존재")

    # 5. Drop rows with missing values in the columns the topic model needs.
    before_len = len(df_merged)
    df_merged = df_merged.dropna(subset=dropna_cols)
    after_len = len(df_merged)

    if before_len > after_len:
        print(f"결측치 제거: {before_len - after_len:,}개")
    else:
        print("결측치 없음")

    return df_merged

### 드라마

In [3]:
drama_data = load_data_for_bertopic(
    embeddings_path=DRAMA_EMBEDDINGS_PATH,
    hit_score_path=HIT_SCORE_PATH,
    content_type='drama',
)

# Cut-offs on hit_score: 80th percentile and above is a hit,
# 40th percentile and below is a flop.
hit_threshold = drama_data['hit_score'].quantile(0.8)
flop_threshold = drama_data['hit_score'].quantile(0.4)

# Label every row in one pass. The flop condition is listed first so that it
# takes precedence if a score satisfies both cut-offs; anything else is 'normal'.
drama_data['label'] = np.select(
    [
        (drama_data['hit_score'] <= flop_threshold).to_numpy(),
        (drama_data['hit_score'] >= hit_threshold).to_numpy(),
    ],
    ['flop', 'hit'],
    default='normal',
)

필수 컬럼 모두 존재
결측치 없음


In [4]:
movie_data = load_data_for_bertopic(
    embeddings_path=MOVIE_EMBEDDINGS_PATH,
    hit_score_path=HIT_SCORE_PATH,
    content_type='movie',
)

# Cut-offs on hit_score: 80th percentile and above is a hit,
# 40th percentile and below is a flop.
hit_threshold = movie_data['hit_score'].quantile(0.8)
flop_threshold = movie_data['hit_score'].quantile(0.4)

# Label every row in one pass. The flop condition is listed first so that it
# takes precedence if a score satisfies both cut-offs; anything else is 'normal'.
movie_data['label'] = np.select(
    [
        (movie_data['hit_score'] <= flop_threshold).to_numpy(),
        (movie_data['hit_score'] >= hit_threshold).to_numpy(),
    ],
    ['flop', 'hit'],
    default='normal',
)

필수 컬럼 모두 존재
결측치 없음


In [None]:
# Topic model over ALL dramas (no hit/flop filtering in this cell).

# Initialise and fit the TopicModeler.
drama_modeler = TopicModeler(
    data=drama_data,
    type_name='drama'
)

drama_modeler.fit_transform()
drama_clusters, drama_summary = cluster_topics(
    topic_model=drama_modeler.bertopic_model,
    n_groups=5
)

# Save the results.
drama_modeler.save_results(save_point='drama_total')
# Only OUTPUT_DIR itself is created in the config cell; make sure the
# per-run subdirectory exists so to_csv does not fail on a missing folder.
Path(f"{OUTPUT_DIR}/drama_total").mkdir(parents=True, exist_ok=True)
drama_clusters.to_csv(f"{OUTPUT_DIR}/drama_total/topic_clusters.csv", index=False, encoding='utf-8-sig')

In [5]:
# Keep only the dramas labelled as hits.
drama_hit = drama_data[drama_data['label'] == 'hit'].copy()

# Initialise and fit the TopicModeler.
drama_hit_modeler = TopicModeler(
    data=drama_hit,
    type_name='drama'
)

drama_hit_modeler.fit_transform()
drama_hit_clusters, drama_hit_summary = cluster_topics(
    topic_model=drama_hit_modeler.bertopic_model,
    n_groups=5
)

# Save the results.
drama_hit_modeler.save_results(save_point='drama_hit')
# Only OUTPUT_DIR itself is created in the config cell; make sure the
# per-run subdirectory exists so to_csv does not fail on a missing folder.
Path(f"{OUTPUT_DIR}/drama_hit").mkdir(parents=True, exist_ok=True)
drama_hit_clusters.to_csv(f"{OUTPUT_DIR}/drama_hit/topic_clusters.csv", index=False, encoding='utf-8-sig')

2025-12-28 21:45:10,046 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-28 21:45:16,677 - BERTopic - Dimensionality - Completed ✓
2025-12-28 21:45:16,678 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-28 21:45:16,699 - BERTopic - Cluster - Completed ✓
2025-12-28 21:45:16,702 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-28 21:45:16,763 - BERTopic - Representation - Completed ✓


  ✓ files/bertopic_result/drama_hit/drama_hit_topics.parquet
  ✓ files/bertopic_result/drama_hit/drama_hit_topic_info.csv
  ✓ files/bertopic_result/drama_hit/bertopic_model/
  ✓ files/bertopic_result/drama_hit/topics_barchart.html
  ✓ files/bertopic_result/drama_hit/topics_intertopic.html


100%|██████████| 8/8 [00:00<00:00, 497.82it/s]


  ✓ files/bertopic_result/drama_hit/topics_hierarchy.html
  ✓ files/bertopic_result/drama_hit/topics_heatmap.html
  ✓ files/bertopic_result/drama_hit/topics_documents.html


In [None]:
# Keep only the dramas labelled as flops.
drama_flop = drama_data[drama_data['label'] == 'flop'].copy()

# Initialise and fit the TopicModeler.
drama_flop_modeler = TopicModeler(
    data=drama_flop,
    type_name='drama'
)

drama_flop_modeler.fit_transform()
drama_flop_clusters, drama_flop_summary = cluster_topics(
    topic_model=drama_flop_modeler.bertopic_model,
    n_groups=5
)

# Save the results.
drama_flop_modeler.save_results(save_point='drama_flop')
# Only OUTPUT_DIR itself is created in the config cell; make sure the
# per-run subdirectory exists so to_csv does not fail on a missing folder.
Path(f"{OUTPUT_DIR}/drama_flop").mkdir(parents=True, exist_ok=True)
drama_flop_clusters.to_csv(f"{OUTPUT_DIR}/drama_flop/topic_clusters.csv", index=False, encoding='utf-8-sig')

### 영화

In [None]:

# Topic model over ALL movies (no hit/flop filtering in this cell).
# Initialise and fit the TopicModeler.
movie_modeler = TopicModeler(
    data=movie_data,
    type_name='movie'
)

movie_modeler.fit_transform()

# Store the whole-corpus clusters under their own names, mirroring
# drama_clusters / drama_summary, instead of the hit-only variable names.
movie_clusters, movie_summary = cluster_topics(
    topic_model=movie_modeler.bertopic_model,
    n_groups=5
)

# Save the results next to the "movie_total" save point rather than in the
# movie_hit folder, where the hit-only run writes its own topic_clusters.csv.
movie_modeler.save_results(save_point="movie_total")
Path(f"{OUTPUT_DIR}/movie_total").mkdir(parents=True, exist_ok=True)
movie_clusters.to_csv(f"{OUTPUT_DIR}/movie_total/topic_clusters.csv", index=False, encoding='utf-8-sig')

In [None]:
# Keep only the movies labelled as hits.
movie_hit = movie_data[movie_data['label'] == 'hit'].copy()

# Initialise and fit the TopicModeler.
movie_hit_modeler = TopicModeler(
    data=movie_hit,
    type_name='movie'
)

movie_hit_modeler.fit_transform()

movie_hit_clusters, movie_hit_summary = cluster_topics(
    topic_model=movie_hit_modeler.bertopic_model,
    n_groups=5
)

# Save the results.
movie_hit_modeler.save_results(save_point="movie_hit")
# Only OUTPUT_DIR itself is created in the config cell; make sure the
# per-run subdirectory exists so to_csv does not fail on a missing folder.
Path(f"{OUTPUT_DIR}/movie_hit").mkdir(parents=True, exist_ok=True)
movie_hit_clusters.to_csv(f"{OUTPUT_DIR}/movie_hit/topic_clusters.csv", index=False, encoding='utf-8-sig')

In [None]:
# Keep only the movies labelled as flops.
movie_flop = movie_data[movie_data['label'] == 'flop'].copy()

# Initialise and fit the TopicModeler.
movie_flop_modeler = TopicModeler(
    data=movie_flop,
    type_name='movie'
)

movie_flop_modeler.fit_transform()
movie_flop_clusters, movie_flop_summary = cluster_topics(
    topic_model=movie_flop_modeler.bertopic_model,
    n_groups=5
)

# Save the results.
movie_flop_modeler.save_results(save_point='movie_flop')
# Only OUTPUT_DIR itself is created in the config cell; make sure the
# per-run subdirectory exists so to_csv does not fail on a missing folder.
Path(f"{OUTPUT_DIR}/movie_flop").mkdir(parents=True, exist_ok=True)
movie_flop_clusters.to_csv(f"{OUTPUT_DIR}/movie_flop/topic_clusters.csv", index=False, encoding='utf-8-sig')