In [16]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [17]:
import platform
import matplotlib as mpl
# Korean font setup: choose a Hangul-capable font family for the current OS.
hangul_font_by_os = {
    'Windows': 'Malgun Gothic',
    'Darwin': 'AppleGothic',  # macOS
}
# Any other platform (e.g. Linux) falls back to NanumGothic.
plt.rcParams['font.family'] = hangul_font_by_os.get(platform.system(), 'NanumGothic')

# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign.
mpl.rcParams['axes.unicode_minus'] = False




In [18]:


# Pandas display options: lift every truncation limit by setting it to None.
unlimited_display_options = (
    'display.max_columns',   # show all columns
    'display.max_colwidth',  # show full cell contents (no truncation)
    'display.width',         # no limit on output width
    'display.max_rows',      # show all rows
)
for display_option in unlimited_display_options:
    pd.set_option(display_option, None)

# =============================================================================
# 1. Load data
# =============================================================================
print("=" * 60)
print("1단계: 데이터 로드")
print("=" * 60)

# Drama table read from drama_with_topics.parquet. The printed label names the
# actual variable so the log cannot be mistaken for the movie table.
drama_with_topics = pd.read_parquet("drama_with_topics.parquet")
print(f"drama_with_topics shape: {drama_with_topics.shape}")
print(f"drama_with_topics columns: {drama_with_topics.columns.tolist()}")

# Movie table read from movie_with_topics.parquet.
movie_with_topics = pd.read_parquet("movie_with_topics.parquet")
print(f"movie_with_topics shape: {movie_with_topics.shape}")
print(f"movie_with_topics columns: {movie_with_topics.columns.tolist()}")


1단계: 데이터 로드
movie_final shape: (3317, 35)
movie_final columns: ['id', 'title', 'first_air_date', 'in_production', 'last_air_date', 'last_episode_to_air_vote_average', 'last_episode_to_air_vote_count', 'number_of_episodes', 'number_of_seasons', 'original_language', 'original_name', 'overview', 'popularity', 'poster_path', 'status', 'type_detail', 'imdb_id', 'imdb_rating', 'imdb_rating_count', 'tmdb_rating', 'tmdb_num_votes', 'episode_run_time_average', 'first_year', 'last_year', 'run_years', 'imdb_rating_count_log', 'number_of_episodes_log', 'number_of_seasons_log', 'run_years_log', 'episode_run_time_average_log', 'popularity_log', 'tmdb_num_votes_log', 'genres_combined', 'text', 'topic']
movie_final shape: (26971, 20)
movie_final columns: ['id', 'imdb_id', 'title', 'original_language', 'overview', 'release_date', 'runtime', 'genres', 'keywords', 'poster_path', 'tmdb_rating', 'tmdb_num_votes', 'imdb_rating', 'imdb_num_votes', 'popularity', 'genres_combined', 'combined_text', 'embedding_

In [19]:
# Read the hit-score table from the working directory.
hit_score = pd.read_parquet(path="hit_score.parquet")

In [21]:
# Display the column index of hit_score to see which fields are available.
hit_score.columns

Index(['imdb_id', 'rating', 'num_votes_log', 'sentiment_score', 'hit_score'], dtype='object')

In [22]:
# Read the drama table that carries embeddings. A forward slash is used as the
# path separator: it works on Windows, macOS and Linux, and avoids the invalid
# "\d" escape sequence that a backslash would introduce in a normal string.
drama_with_embeddings_final = pd.read_parquet("텍스트마이닝결과/drama_with_embeddings_final.parquet")

In [24]:
# Read the movie table that carries embeddings. A forward slash is used as the
# path separator: it works on Windows, macOS and Linux, and avoids the invalid
# "\m" escape sequence that a backslash would introduce in a normal string.
movie_with_embeddings_final = pd.read_parquet("텍스트마이닝결과/movie_with_embeddings_final.parquet")

In [25]:
# Display the column index of the movie embeddings table.
movie_with_embeddings_final.columns

Index(['id', 'imdb_id', 'title', 'original_language', 'overview',
       'release_date', 'runtime', 'genres', 'keywords', 'poster_path',
       'tmdb_rating', 'tmdb_num_votes', 'imdb_rating', 'imdb_num_votes',
       'popularity', 'genres_combined', 'combined_text', 'embedding',
       'embedding_pca_100d'],
      dtype='object')

In [23]:
# Display the column index of the drama embeddings table.
drama_with_embeddings_final.columns

Index(['id', 'title', 'first_air_date', 'in_production', 'last_air_date',
       'last_episode_to_air_vote_average', 'last_episode_to_air_vote_count',
       'number_of_episodes', 'number_of_seasons', 'original_language',
       'original_name', 'overview', 'popularity', 'poster_path', 'status',
       'type_detail', 'imdb_id', 'imdb_rating', 'imdb_rating_count',
       'tmdb_rating', 'tmdb_num_votes', 'episode_run_time_average',
       'first_year', 'last_year', 'run_years', 'imdb_rating_count_log',
       'number_of_episodes_log', 'number_of_seasons_log', 'run_years_log',
       'episode_run_time_average_log', 'popularity_log', 'tmdb_num_votes_log',
       'genres_combined', 'combined_text', 'embedding'],
      dtype='object')