## step 1 드라마 줄거리 임베딩

In [None]:
# =========================================================
# RESET RUN (DRAMA ONLY) - Plot Topic (overview) / GPU SAFE
# - 드라마 데이터셋만 따로: 임베딩/토픽 모델 분리
# - max_df < min_df 에러 원천 차단 (토픽 내부 문서수까지 고려)
# - outlier(-1) 과도하면 파라미터만 조금씩 완화하는 방식으로 튜닝
# =========================================================

import os, re
import numpy as np
import pandas as pd

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# -------------------------
# 0) Paths / settings
# -------------------------
# Input parquet, referenced by its bare file name (relative to the working directory).
# The earlier f"{}00_drama_main.parquet" form had an empty replacement field,
# which Python rejects with a SyntaxError, so a plain string literal is used.
PATH_DRAMA = "00_drama_main.parquet"
OUT_DRAMA  = "drama_with_plot_topic.parquet"   # filtered rows + plot_topic column
OUT_INFO   = "plot_topic_info_drama.csv"       # topic summary table
OUT_KW     = "plot_keywords_drama.parquet"     # keyword list per topic

SEED = 42               # random_state for UMAP
MIN_OVERVIEW_LEN = 30   # minimum length (characters) of a cleaned overview to keep the row
TOP_N_WORDS = 30        # passed to BERTopic as top_n_words

# -------------------------
# 1) 전처리
# -------------------------
def clean_text(s: str) -> str:
    """Normalize a raw overview into lowercase ASCII alphanumeric words.

    ``None`` and float NaN become the empty string. Any other value is
    stringified and lowercased; every character other than ``a-z``, ``0-9``
    or whitespace is replaced by a space; whitespace runs are then collapsed
    to a single space and the ends are trimmed.
    """
    is_missing = s is None or (isinstance(s, float) and np.isnan(s))
    if is_missing:
        return ""
    lowered = str(s).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

# -------------------------
# 2) Load + filter
# -------------------------
drama = pd.read_parquet(PATH_DRAMA).copy()

# Fail fast when a required input column is absent.
need_cols = ["imdb_id", "overview"]
missing = [c for c in need_cols if c not in drama.columns]
if missing:
    raise ValueError(f"드라마 데이터에 필요한 컬럼이 없습니다: {missing}")

# Keep only rows whose cleaned overview has at least MIN_OVERVIEW_LEN characters.
drama["overview_clean"] = drama["overview"].apply(clean_text)
drama = drama[drama["overview_clean"].str.len() >= MIN_OVERVIEW_LEN].copy()
docs = drama["overview_clean"].tolist()

print("✅ drama docs(after filter):", len(docs))
if len(docs) < 200:
    # Warning only: the run continues even with a small corpus.
    print("⚠️ 드라마 문서 수가 너무 적으면 토픽 품질이 낮아질 수 있어요.")

# -------------------------
# 3) GPU embedding model
# -------------------------
embedding_model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    device="cuda"  # run on the GPU
)

# -------------------------
# 4) Drama-specific parameters (stable + keep topics from being merged together)
# -------------------------
# Key point: the vectorizer is configured so that it cannot fail on tiny inputs.
# - min_df=1, max_df=1.0: no document-frequency pruning, so CountVectorizer
#   does not raise even when a topic consists of a single document.
# - Very common words are instead handled by stop_words="english" and by the
#   token_pattern (alphabetic tokens of three or more letters).
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    token_pattern=r"\b[a-zA-Z]{3,}\b",
    min_df=1,
    max_df=1.0
)

# The drama corpus is small, so settings are loosened to split clusters more finely.
umap_model = UMAP(
    n_neighbors=15,     # smaller = more local structure (finer topics)
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=SEED
)

hdbscan_model = HDBSCAN(
    min_cluster_size=20,  # kept small: pushes toward a larger number of topics
    min_samples=5,
    prediction_data=True
)

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=TOP_N_WORDS,
    min_topic_size=20,        # minimum documents per topic; NOTE(review): presumably unused when a custom hdbscan_model is passed — confirm against the BERTopic docs
    nr_topics=None,
    calculate_probabilities=False,
    verbose=True
)

# -------------------------
# 5) Fit
# -------------------------
# Only the per-document topic ids are kept; the second return value is discarded.
topics, _ = topic_model.fit_transform(docs)
drama["plot_topic"] = topics

# -------------------------
# 6) Save (topic summary / keywords / data)
# -------------------------
info = topic_model.get_topic_info()
info.to_csv(OUT_INFO, index=False, encoding="utf-8-sig")

# One row per topic id, holding that topic's non-empty keywords.
kw_rows = []
for tid, tuples in topic_model.get_topics().items():
    if tuples is None:
        # Same guard as the movie cell: skip a topic that has no
        # representation instead of failing on iteration.
        continue
    words = [w for w, _ in tuples if w]
    kw_rows.append({"plot_topic": tid, "keywords": words})
pd.DataFrame(kw_rows).to_parquet(OUT_KW, index=False)

drama.to_parquet(OUT_DRAMA, index=False)

# -------------------------
# 7) Quick validation (the three numbers worth checking)
# -------------------------
outlier_ratio = (drama["plot_topic"] == -1).mean()   # share of rows labelled -1
n_topics_incl = drama["plot_topic"].nunique()        # distinct labels, -1 included
n_topics_excl = drama.loc[drama["plot_topic"] != -1, "plot_topic"].nunique()

print("\n================ CHECK ================")
print("outlier(-1) ratio:", float(outlier_ratio))
print("n_topics (incl -1):", int(n_topics_incl))
print("n_topics (excl -1):", int(n_topics_excl))
print("top topic sizes:\n", pd.Series(topics).value_counts().head(10))
print("saved:", OUT_DRAMA, OUT_INFO, OUT_KW)
print("✅ DONE (DRAMA)")


## step 2 영화 줄거리 임베딩

In [None]:
# =========================================================
# MOVIE RUN (GPU) - Plot Topic (overview) [분리 임베딩]
# - input : 00_movie_main.parquet  (or 네 영화 parquet 경로)
# - output:
#   1) movie_with_plot_topic.parquet
#   2) plot_topic_info_movie.csv
#   3) plot_keywords_movie.parquet
# =========================================================

import re
import numpy as np
import pandas as pd

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# -------------------------
# 0) Paths / settings
# -------------------------
MOVIE_PATH = "00_movie_main.parquet"   # input parquet
OUT_MOVIE  = "movie_with_plot_topic.parquet"   # filtered rows + plot_topic column
OUT_INFO   = "plot_topic_info_movie.csv"       # topic summary table
OUT_KW     = "plot_keywords_movie.parquet"     # keyword list per topic

SEED = 42               # random_state for UMAP
MIN_OVERVIEW_LEN = 30   # minimum length (characters) of a cleaned overview to keep the row
TOP_N_WORDS = 30        # passed to BERTopic as top_n_words

# -------------------------
# 1) GPU embedding model
# -------------------------
embedding_model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    device="cuda"
)

# -------------------------
# 2) 텍스트 클린
# -------------------------
def clean_text(s: str) -> str:
    """Return ``s`` reduced to lowercase ``a-z``/``0-9`` words separated by single spaces.

    Missing values (``None`` or float NaN) yield ``""``. Characters outside
    ``a-z``, ``0-9`` and whitespace are replaced by spaces before whitespace
    is collapsed and the result is trimmed.
    """
    if s is None:
        return ""
    if isinstance(s, float) and np.isnan(s):
        return ""
    text = re.sub(r"[^a-z0-9\s]", " ", str(s).lower())
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# -------------------------
# 3) Load + filter
# -------------------------
movie = pd.read_parquet(MOVIE_PATH).copy()

# Fail fast when a required input column is absent.
need_cols = ["imdb_id", "overview"]
missing = [c for c in need_cols if c not in movie.columns]
if missing:
    raise ValueError(f"movie에 필요한 컬럼이 없습니다: {missing}")

# Keep only rows whose cleaned overview has at least MIN_OVERVIEW_LEN characters.
movie["overview_clean"] = movie["overview"].apply(clean_text)
movie = movie[movie["overview_clean"].str.len() >= MIN_OVERVIEW_LEN].copy()

docs = movie["overview_clean"].tolist()
n_docs = len(docs)   # reused below to scale MIN_DF
print("movie docs(after filter):", n_docs)

# -------------------------
# 4) Movie parameters (more conservative than the drama run)
# -------------------------
# vectorizer: min_df scaled to the document count (too strict and topics lose their vocabulary)
# NOTE(review): BERTopic appears to fit the vectorizer on one merged document per
# topic, so min_df/max_df would count topics rather than raw documents — confirm
# that MIN_DF cannot exceed MAX_DF * (number of topics), which CountVectorizer rejects.
MIN_DF = max(5, int(n_docs * 0.002))   # 0.2% of documents with a floor of 5 (about 54 for 27k docs)
MAX_DF = 0.90

vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    token_pattern=r"\b[a-zA-Z]{3,}\b",
    min_df=MIN_DF,
    max_df=MAX_DF
)

umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=SEED
)

hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=10,
    prediction_data=True
)

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=TOP_N_WORDS,
    min_topic_size=50,   # NOTE(review): presumably unused when a custom hdbscan_model is passed — confirm against the BERTopic docs
    nr_topics=None,
    calculate_probabilities=False,
    verbose=True
)

# The cluster/topic values printed below are literals duplicated from the
# constructors above; keep them in sync by hand when tuning.
print("\n[MOVIE] vectorizer params:", {"min_df": MIN_DF, "max_df": MAX_DF})
print("[MOVIE] cluster/topic params:", {"min_cluster_size": 50, "min_samples": 10, "min_topic_size": 50, "n_neighbors": 20, "n_components": 5})

# -------------------------
# 5) Fit
# -------------------------
# Only the per-document topic ids are kept; the second return value is discarded.
topics, _ = topic_model.fit_transform(docs)
movie["plot_topic"] = topics

# -------------------------
# 6) Save (separate outputs for the movie run)
# -------------------------
movie.to_parquet(OUT_MOVIE, index=False)
topic_model.get_topic_info().to_csv(OUT_INFO, index=False, encoding="utf-8-sig")

# One row per topic id, holding that topic's non-empty keywords.
kw_rows = []
for tid, tuples in topic_model.get_topics().items():
    if tuples is None:
        # Skip a topic that has no representation.
        continue
    words = [w for w, _ in tuples if w]
    kw_rows.append({"plot_topic": tid, "keywords": words})
pd.DataFrame(kw_rows).to_parquet(OUT_KW, index=False)

# -------------------------
# 7) Quick check
# -------------------------
outlier_ratio = (movie["plot_topic"] == -1).mean()   # share of rows labelled -1
n_topics_incl = movie["plot_topic"].nunique()        # distinct labels, -1 included
n_topics_excl = movie.loc[movie["plot_topic"] != -1, "plot_topic"].nunique()

print("\n============== CHECK ==============")
print("outlier(-1) ratio:", float(outlier_ratio))
print("n_topics (incl -1):", int(n_topics_incl))
print("n_topics (excl -1):", int(n_topics_excl))
print("top topic sizes:\n", movie["plot_topic"].value_counts().head(10))
print("saved:", OUT_MOVIE, OUT_INFO, OUT_KW)
print("✅ DONE (MOVIE)")


## step 3 드라마 리뷰 임베딩

In [None]:
# =========================================
# Drama Review BERTopic (50k) + Plot Stopwords 적용
# - plot_keywords_drama.parquet 에서 "모든" 줄거리 키워드 모아서 stopwords로 사용
# - 8GB(4070 laptop) 기준: batch 작게 + 오류시 CPU fallback
# - output:
#   1) review_topic_info_drama_50k.csv
#   2) review_topic_map_drama_50k.parquet
# =========================================

import os, gc, random
import numpy as np
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# -------------------------
# 0) Settings
# -------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

REVIEW_PATH  = "drama_review_final.parquet"        # drama review file
PLOT_KW_PATH = "plot_keywords_drama.parquet"       # drama plot-keyword file
TEXT_COL = "review_text"
IMDB_COL = "imdb_id"   # defined here but not referenced elsewhere in this cell

SAMPLE_N = 50_000              # maximum number of reviews sampled
LEN_MIN, LEN_MAX = 50, 2000    # inclusive review-length bounds, in characters

OUT_INFO = "review_topic_info_drama_50k.csv"
OUT_MAP  = "review_topic_map_drama_50k.parquet"

# GPU stability settings (intended to mitigate the occasional "CUDA unknown error")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# -------------------------
# 1) Build plot stop words (the key step)
# -------------------------
def _iter_plot_keywords(keyword_rows):
    """Yield every non-empty plot keyword, stripped and lowercased."""
    for row in keyword_rows:
        if row is None:
            continue
        # list(...) lets list and ndarray cells be walked the same way.
        for raw in list(row):
            if not raw:
                continue
            token = str(raw).strip().lower()
            if token:
                yield token


pk = pd.read_parquet(PLOT_KW_PATH)
plot_words = set(_iter_plot_keywords(pk["keywords"].tolist()))

# Generic narrative/format words taken out of the plot-derived set (optional).
# Drama reviews mention seasons/episodes a lot, so this list is a bit broader.
# NOTE: every word here except "stories" is also in review_generic_sw, so it
# still ends up in combined_sw; the removal mainly affects the plot_words count.
ban_words = {
    "film", "movie", "movies",
    "story", "stories", "plot",
    "series", "show", "shows",
    "season", "seasons", "episode", "episodes",
    "character", "characters",
    # words that are especially common in drama reviews (drop if not needed)
    "drama", "dramas", "tv", "television",
}
plot_words = plot_words.difference(ban_words)

base_sw = set(ENGLISH_STOP_WORDS)

# Evaluative / colloquial words removed only for reviews.
review_generic_sw = {
    "like", "just", "good", "really", "time", "way",
    "watch", "watched", "watching", "people",
    "dont", "didnt", "doesnt", "isnt", "wasnt", "werent",
    "cant", "couldnt", "wouldnt",
    "im", "ive", "youre", "theyre", "thats", "theres",
    "hes", "shes", "weve", "id",
    # format / subject words (extended for dramas)
    "film", "films", "movie", "movies", "show", "shows", "series",
    "season", "seasons", "episode", "episodes",
    "drama", "dramas", "tv", "television",
    "story", "plot", "character", "characters",
}

# NOTE(review): the plot vectorizer uses ngram_range=(1, 2), so plot_words may
# contain bigrams; presumably CountVectorizer matches stop words per token, in
# which case those entries have no effect — confirm.
combined_sw = sorted(base_sw.union(plot_words, review_generic_sw))

print(f"[Stopwords] ENGLISH={len(base_sw)}, plot_words={len(plot_words)}, total={len(combined_sw)}")
print("plot_words sample:", sorted(plot_words)[:30])

# -------------------------
# 2) Load reviews + filter + sample
# -------------------------
df = pd.read_parquet(REVIEW_PATH)

# Keep rows with non-null text, coerced to str
df = df[df[TEXT_COL].notna()].copy()
df[TEXT_COL] = df[TEXT_COL].astype(str)

# Length cut: keep reviews of LEN_MIN..LEN_MAX characters (inclusive)
df["__len"] = df[TEXT_COL].str.len()
df = df[(df["__len"] >= LEN_MIN) & (df["__len"] <= LEN_MAX)].copy()

print("after filter:", len(df))

# Seeded random sample of at most SAMPLE_N rows
df_s = df.sample(n=min(SAMPLE_N, len(df)), random_state=SEED).copy()
docs = df_s[TEXT_COL].tolist()

print("sample docs:", len(docs))

# Free the full frame; only the sample is used from here on
del df
gc.collect()

# -------------------------
# 3) BERTopic components
# -------------------------
# NOTE(review): BERTopic appears to fit the vectorizer on one merged document per
# topic, so min_df/max_df would count topics rather than reviews — confirm that
# min_df=20 cannot exceed 0.95 * (number of topics).
vectorizer_model = CountVectorizer(
    stop_words=combined_sw,
    ngram_range=(1, 2),
    token_pattern=r"\b[a-zA-Z]{3,}\b",
    min_df=20,       # author's guidance: 10-30 for ~50k docs
    max_df=0.95,
)

umap_model = UMAP(
    n_neighbors=15,
    n_components=5,     # chosen for stability
    min_dist=0.0,
    metric="cosine",
    random_state=SEED
)

hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=10,
    prediction_data=True
)

# -------------------------
# 4) 임베딩 + fit_transform (GPU 우선, 실패하면 CPU fallback)
# -------------------------
def run_bertopic(docs, device="cuda", enc_batch=16):
    """Embed ``docs`` with Qwen3-Embedding-0.6B and fit BERTopic on the result.

    Args:
        docs: list of review texts.
        device: device string handed to ``SentenceTransformer`` ("cuda" or "cpu").
        enc_batch: batch size used by ``encode``.

    Returns:
        ``(topic_model, topics)``: the fitted BERTopic instance and the
        per-document topic ids returned by ``fit_transform``.

    The UMAP, HDBSCAN and vectorizer objects are the module-level
    ``umap_model``, ``hdbscan_model`` and ``vectorizer_model``.
    """
    encoder = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device=device)

    encode_options = dict(
        batch_size=enc_batch,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    # Encoding runs under inference mode.
    with torch.inference_mode():
        doc_vectors = encoder.encode(docs, **encode_options)

    # embedding_model stays None because the vectors are supplied directly.
    fitted = BERTopic(
        embedding_model=None,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=False,
        verbose=True
    )

    assignments, _ = fitted.fit_transform(docs, doc_vectors)
    return fitted, assignments

# First attempt: GPU
try:
    torch.cuda.empty_cache()
    model, topics = run_bertopic(docs, device="cuda", enc_batch=16)
except Exception as e:
    # Any exception (not only CUDA errors) triggers a complete re-run on the
    # CPU, i.e. the documents are embedded again from scratch.
    print("\n⚠️ GPU 실패 -> CPU로 fallback")
    print("GPU error:", repr(e))
    torch.cuda.empty_cache()
    gc.collect()
    model, topics = run_bertopic(docs, device="cpu", enc_batch=32)

# -------------------------
# 5) Save results
# -------------------------
info = model.get_topic_info()
info.to_csv(OUT_INFO, index=False, encoding="utf-8-sig")

# Attach the topic id to each sampled review and drop the helper length column.
df_s["review_topic"] = topics
df_s.drop(columns=["__len"], inplace=True, errors="ignore")
df_s.to_parquet(OUT_MAP, index=False)

outlier_ratio = (df_s["review_topic"] == -1).mean()   # share of reviews labelled -1
print("\n===== DONE =====")
print("outlier(-1) ratio:", float(outlier_ratio))
print("n_topics(excl -1):", int((info["Topic"] != -1).sum()))
print("saved:", OUT_INFO, OUT_MAP)

# -------------------------
# 6) Quick validation (topic distribution, top 15)
# -------------------------
import matplotlib.pyplot as plt

# Take the 15 largest topics (label -1 excluded), then order them by topic id for the x-axis.
tmp = df_s[df_s["review_topic"] != -1]["review_topic"].value_counts().head(15).sort_index()
plt.figure(figsize=(10,4))
plt.bar(tmp.index.astype(str), tmp.values)
plt.title("Drama Review Topic Distribution (Top 15, excl -1)")
plt.xlabel("Topic")
plt.ylabel("Review Count")
plt.show()


## step 4 영화 리뷰 임베딩

In [None]:
# =========================================
# Movie Review BERTopic (50k) + Plot Stopwords 적용
# - plot_keywords_movie.parquet 에서 "모든" 줄거리 키워드 모아서 stopwords로 사용
# - Tuned for an 8GB GPU (4070 laptop): small encode batch + CPU fallback on error (no fp16 is actually configured in the code below)
# - output:
#   1) review_topic_info_movie_50k.csv
#   2) review_topic_map_movie_50k.parquet
# =========================================

import os, gc, random
import numpy as np
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# -------------------------
# 0) Settings
# -------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

REVIEW_PATH = "movie_review_final.parquet"        # adjust to the movie review file name
PLOT_KW_PATH = "plot_keywords_movie.parquet"  # plot-keyword file produced earlier
TEXT_COL = "review_text"
IMDB_COL = "imdb_id"   # defined here but not referenced elsewhere in this cell

SAMPLE_N = 50_000              # maximum number of reviews sampled
LEN_MIN, LEN_MAX = 50, 2000    # inclusive review-length bounds, in characters

OUT_INFO = "review_topic_info_movie_50k.csv"
OUT_MAP  = "review_topic_map_movie_50k.parquet"

# GPU stability settings (intended to mitigate the occasional "CUDA unknown error")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# -------------------------
# 1) Build plot stop words (the key step)
# -------------------------
def _iter_plot_keywords(keyword_rows):
    """Yield every non-empty plot keyword, stripped and lowercased."""
    for row in keyword_rows:
        if row is None:
            continue
        # list(...) lets list and ndarray cells be walked the same way.
        for raw in list(row):
            if not raw:
                continue
            token = str(raw).strip().lower()
            if token:
                yield token


pk = pd.read_parquet(PLOT_KW_PATH)
plot_words = set(_iter_plot_keywords(pk["keywords"].tolist()))

# Generic narrative/format words taken out of the plot-derived set (optional).
# NOTE: every word here except "stories" is also in review_generic_sw, so it
# still ends up in combined_sw; the removal mainly affects the plot_words count.
ban_words = {
    "film", "movie", "movies",
    "story", "stories",
    "series", "season", "seasons", "episode", "episodes",
    "character", "characters",
    "plot", "show", "shows",
}
plot_words = plot_words.difference(ban_words)

base_sw = set(ENGLISH_STOP_WORDS)

# Evaluative / colloquial words removed only for reviews (extend as needed).
review_generic_sw = {
    "like", "just", "good", "really", "time", "way",
    "watch", "watched", "watching", "people",
    "dont", "didnt", "doesnt", "isnt", "wasnt", "werent",
    "cant", "couldnt", "wouldnt",
    "im", "ive", "youre", "theyre", "thats", "theres",
    "hes", "shes", "weve", "id",
    "film", "films", "movie", "movies", "show", "shows", "series",
    "season", "seasons", "episode", "episodes",
    "story", "plot", "character", "characters",
}

# NOTE(review): the plot vectorizer uses ngram_range=(1, 2), so plot_words may
# contain bigrams; presumably CountVectorizer matches stop words per token, in
# which case those entries have no effect — confirm.
combined_sw = sorted(base_sw.union(plot_words, review_generic_sw))

print(f"[Stopwords] ENGLISH={len(base_sw)}, plot_words={len(plot_words)}, total={len(combined_sw)}")
print("plot_words sample:", sorted(plot_words)[:30])

# -------------------------
# 2) Load reviews + filter + sample
# -------------------------
df = pd.read_parquet(REVIEW_PATH)

# Keep rows with non-null text, coerced to str
df = df[df[TEXT_COL].notna()].copy()
df[TEXT_COL] = df[TEXT_COL].astype(str)

# Length cut: keep reviews of LEN_MIN..LEN_MAX characters (inclusive)
df["__len"] = df[TEXT_COL].str.len()
df = df[(df["__len"] >= LEN_MIN) & (df["__len"] <= LEN_MAX)].copy()

print("after filter:", len(df))

# Seeded random sample of at most SAMPLE_N rows
df_s = df.sample(n=min(SAMPLE_N, len(df)), random_state=SEED).copy()
docs = df_s[TEXT_COL].tolist()

print("sample docs:", len(docs))

# Free the full frame; only the sample is used from here on
del df
gc.collect()

# -------------------------
# 3) BERTopic components
# -------------------------
# NOTE(review): BERTopic appears to fit the vectorizer on one merged document per
# topic, so min_df/max_df would count topics rather than reviews — confirm that
# min_df=20 cannot exceed 0.95 * (number of topics).
vectorizer_model = CountVectorizer(
    stop_words=combined_sw,
    ngram_range=(1, 2),
    token_pattern=r"\b[a-zA-Z]{3,}\b",
    min_df=20,       # author's guidance: 10-30 for ~50k docs
    max_df=0.95,
)

umap_model = UMAP(
    n_neighbors=15,
    n_components=5,     # chosen for stability
    min_dist=0.0,
    metric="cosine",
    random_state=SEED
)

hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=10,
    prediction_data=True
)

# -------------------------
# 4) 임베딩 + fit_transform (GPU 우선, 실패하면 CPU fallback)
# -------------------------
def run_bertopic(docs, device="cuda", enc_batch=16):
    """Compute Qwen3-Embedding-0.6B vectors for ``docs`` and fit BERTopic on them.

    Args:
        docs: list of review texts.
        device: device string handed to ``SentenceTransformer`` ("cuda" or "cpu").
        enc_batch: batch size used by ``encode``.

    Returns:
        ``(topic_model, topics)``: the fitted BERTopic instance and the
        per-document topic ids returned by ``fit_transform``.

    The UMAP, HDBSCAN and vectorizer objects are the module-level
    ``umap_model``, ``hdbscan_model`` and ``vectorizer_model``. No reduced
    precision is requested here; the model loads with its defaults.
    """
    encoder = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device=device)

    # Small batches keep peak memory down during encoding.
    encode_options = {
        "batch_size": enc_batch,
        "show_progress_bar": True,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    with torch.inference_mode():
        doc_vectors = encoder.encode(docs, **encode_options)

    # embedding_model stays None because the vectors are supplied directly.
    fitted = BERTopic(
        embedding_model=None,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=False,
        verbose=True
    )

    result = fitted.fit_transform(docs, doc_vectors)
    assignments = result[0]
    return fitted, assignments

# First attempt: GPU
try:
    torch.cuda.empty_cache()
    model, topics = run_bertopic(docs, device="cuda", enc_batch=16)
except Exception as e:
    # Any exception (not only CUDA errors) triggers a complete re-run on the
    # CPU, i.e. the documents are embedded again from scratch.
    print("\n⚠️ GPU 실패 -> CPU로 fallback")
    print("GPU error:", repr(e))
    torch.cuda.empty_cache()
    gc.collect()
    model, topics = run_bertopic(docs, device="cpu", enc_batch=32)

# -------------------------
# 5) Save results
# -------------------------
info = model.get_topic_info()
info.to_csv(OUT_INFO, index=False, encoding="utf-8-sig")

# Attach the topic id to each sampled review and drop the helper length column.
df_s["review_topic"] = topics
df_s.drop(columns=["__len"], inplace=True, errors="ignore")
df_s.to_parquet(OUT_MAP, index=False)

outlier_ratio = (df_s["review_topic"] == -1).mean()   # share of reviews labelled -1
print("\n===== DONE =====")
print("outlier(-1) ratio:", float(outlier_ratio))
print("n_topics(excl -1):", int((info["Topic"] != -1).sum()))
print("saved:", OUT_INFO, OUT_MAP)

# -------------------------
# 6) Quick validation (topic distribution, top 15)
# -------------------------
import matplotlib.pyplot as plt

# Take the 15 largest topics (label -1 excluded), then order them by topic id for the x-axis.
tmp = df_s[df_s["review_topic"] != -1]["review_topic"].value_counts().head(15).sort_index()
plt.figure(figsize=(10,4))
plt.bar(tmp.index.astype(str), tmp.values)
plt.title("Review Topic Distribution (Top 15, excl -1)")
plt.xlabel("Topic")
plt.ylabel("Review Count")
plt.show()


## Step 5 영화+드라마 클러스터링

In [None]:
# =========================================
# Topic Clustering (INFO only)
# - movie_info / drama_info 각각 따로 클러스터링
# - map 데이터는 건드리지 않음
# - output: *_info_clustered.csv + *_cluster_summary.csv
# =========================================

import ast
import numpy as np
import pandas as pd

from sklearn.cluster import AgglomerativeClustering
from collections import Counter

# ---- 1) 임베딩 모델 (가벼움 + 성능 좋음) ----
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer


# =========================
# 0) 공통 유틸
# =========================
def parse_representation(x):
    """Convert a ``Representation`` cell into a ``list[str]``.

    Accepted inputs:
      * a list, tuple or NumPy array of keywords -> each item stringified;
      * a missing scalar (``None`` / NaN) -> ``[]``;
      * the text of a list or tuple literal (``"['a', 'b']"``), which is how
        the column looks after a round trip through CSV -> parsed items;
      * any other text -> split on commas and whitespace.

    Sequence inputs are handled before the missing-value test because
    ``pd.isna`` returns an array for them, which cannot be used in ``if``.
    """
    if isinstance(x, (list, tuple, np.ndarray)):
        return [str(w) for w in x]
    if pd.isna(x):
        return []
    s = str(x).strip()
    # Lists are commonly stored as their string form in CSV files.
    try:
        v = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: fall through to the plain-text split.
        v = None
    if isinstance(v, (list, tuple)):
        return [str(w) for w in v]
    # Plain text: treat commas as separators and split on whitespace.
    return [w for w in s.replace(",", " ").split() if w]


def make_topic_text(row, topk=20):
    """Build the text that represents one topic for embedding.

    The first ``topk`` parsed ``Representation`` keywords are joined with
    spaces and followed by `` || `` and the topic ``Name`` (underscores turned
    into spaces). When there are no keywords, the name alone is returned.
    """
    keywords = parse_representation(row.get("Representation", []))[:topk]
    label = str(row.get("Name", "")).replace("_", " ")
    if not keywords:
        # No keywords available: fall back to the name only.
        return label
    return " ".join(keywords) + " || " + label


def cluster_keywords(rep_lists, topn=12):
    """Return the ``topn`` most frequent keywords across a cluster's topics.

    ``rep_lists`` is an iterable of keyword lists (one per topic). Keywords
    are stringified, stripped and lowercased before counting; empty results
    are ignored. Ties keep first-seen order.
    """
    normalized = (
        str(word).strip().lower()
        for reps in rep_lists
        for word in reps
    )
    tally = Counter(token for token in normalized if token)
    return [token for token, _count in tally.most_common(topn)]


def run_topic_clustering(info_df, k=10, seed=42, text_topk=20, kw_topn=12):
    """Group topics into clusters by the similarity of their keyword text.

    Each topic row is turned into a short text (keywords + name), embedded
    with ``all-MiniLM-L6-v2`` and grouped by average-linkage agglomerative
    clustering.

    Args:
        info_df: topic-info DataFrame (movie or drama). The topic id is read
            from ``review_topic``; a ``Topic`` column is renamed to it when
            ``review_topic`` is absent.
        k: requested number of clusters (8-12 suggested). Capped at the
            number of topics, because more clusters than rows cannot be formed.
        seed: accepted for backward compatibility; not used by this
            implementation.
        text_topk: number of leading keywords used in each topic text.
        kw_topn: number of keywords reported per cluster.

    Returns:
        ``(clustered_df, summary_df)``: the input rows plus ``cluster_id``,
        and a per-cluster table with ``cluster_id``, ``n_topics``,
        ``cluster_keywords`` and ``example_topics``, largest cluster first.

    Raises:
        ValueError: if ``info_df`` has fewer than two rows.
    """
    df = info_df.copy()

    # (1) Normalize the topic-id column name.
    if "review_topic" not in df.columns and "Topic" in df.columns:
        df = df.rename(columns={"Topic": "review_topic"})

    # Validate sizes before loading the embedding model.
    n_topics = len(df)
    if n_topics < 2:
        raise ValueError(f"need at least 2 topics to cluster, got {n_topics}")
    if k is not None and k > n_topics:
        print(f"⚠️ k={k} > n_topics={n_topics}; using k={n_topics}")
        k = n_topics

    # (2) Build the per-topic text.
    df["__topic_text"] = df.apply(lambda r: make_topic_text(r, topk=text_topk), axis=1)

    # (3) Embed (vectors are L2-normalized).
    model = SentenceTransformer("all-MiniLM-L6-v2")
    emb = model.encode(df["__topic_text"].tolist(), show_progress_bar=True, normalize_embeddings=True)

    # (4) Hierarchical clustering. No metric is passed, so the estimator's
    # default distance is applied to the normalized vectors.
    clt = AgglomerativeClustering(n_clusters=k, linkage="average")
    df["cluster_id"] = clt.fit_predict(emb)

    # (5) Per-cluster summary, based on the parsed keyword lists.
    if "Representation" in df.columns:
        df["__rep_list"] = df["Representation"].apply(parse_representation)
    else:
        df["__rep_list"] = [[] for _ in range(n_topics)]

    example_cols = ["review_topic", "Name"] if "Name" in df.columns else ["review_topic"]
    rows = []
    for cid, g in df.groupby("cluster_id"):
        rep_kw = cluster_keywords(g["__rep_list"].tolist(), topn=kw_topn)
        # Example topics: the largest by Count first, when that column exists.
        g2 = g.sort_values("Count", ascending=False) if "Count" in g.columns else g
        rows.append({
            "cluster_id": int(cid),
            "n_topics": int(len(g)),
            "cluster_keywords": ", ".join(rep_kw),
            "example_topics": g2.head(3)[example_cols].to_dict("records"),
        })

    summary_df = pd.DataFrame(rows).sort_values("n_topics", ascending=False).reset_index(drop=True)

    # (6) Drop the temporary helper columns.
    clustered_df = df.drop(columns=[c for c in ["__topic_text", "__rep_list"] if c in df.columns])

    return clustered_df, summary_df


# =========================
# 1) Run: movie and drama separately
# =========================
movie_info = pd.read_csv("review_topic_info_movie_50k.csv")
drama_info = pd.read_csv("review_topic_info_drama_50k.csv")

# Suggested starting point: k between 8 and 12
movie_clustered, movie_summary = run_topic_clustering(movie_info, k=10)
drama_clustered, drama_summary = run_topic_clustering(drama_info, k=10)

# =========================
# 2) Save
# =========================
movie_clustered.to_csv("review_topic_info_movie_50k_clustered.csv", index=False)
movie_summary.to_csv("review_topic_cluster_summary_movie.csv", index=False)

drama_clustered.to_csv("review_topic_info_drama_50k_clustered.csv", index=False)
drama_summary.to_csv("review_topic_cluster_summary_drama.csv", index=False)

print("✅ done")
print("movie clusters:", movie_summary.shape, "drama clusters:", drama_summary.shape)


## Step 6 한글 클러스터링 네이밍

In [None]:
import pandas as pd

# =========================
# 1) Korean cluster-name mappings (final)
# =========================
# Keys are cluster_id values; values are the Korean display names.
# NOTE(review): the ids presumably come from one specific clustering run —
# re-check these mappings if the clustering step is re-run.
movie_cluster_ko = {
    1: "전반적 완성도·호불호 평가",
    8: "디즈니·애니메이션 감정 반응",
    0: "전쟁·역사 사실성 평가",
    3: "프랜차이즈·액션 콘텐츠 평가",
    7: "음악·퍼포먼스 완성도 평가",
    2: "시즌·홀리데이 감성 반응",
    6: "원작·각색 비교 평가",
    4: "가족·인물 감정 반응",
    5: "사회·인종 이슈 인식 반응",
    9: "종교·신앙 메시지 해석",
}


drama_cluster_ko = {
    0: "전반적 감상·전개 평가",
    2: "세계관·원작 충실도 평가",
    3: "사실성·논쟁 이슈 반응",
    1: "젠더·정체성 표현 반응",
    5: "청소년·교육적 메시지 인식",
    4: "종교·신앙 메시지 해석",
    6: "레이싱·모터스포츠 시청 몰입 반응",
    7: "실험적 연출·예술성 평가",
    8: "요리·경연 리얼리티 반응",
    9: "가족·코미디 정서 반응",
}


# =========================
# 2) Load files
# =========================
movie_info = pd.read_csv("review_topic_info_movie_50k_clustered.csv")
drama_info = pd.read_csv("review_topic_info_drama_50k_clustered.csv")

movie_summary = pd.read_csv("review_topic_cluster_summary_movie.csv")
drama_summary = pd.read_csv("review_topic_cluster_summary_drama.csv")

# =========================
# 3) Add the Korean name column
# =========================
# Series.map leaves NaN for any cluster_id missing from the mapping.
movie_info["cluster_name_ko"] = movie_info["cluster_id"].map(movie_cluster_ko)
drama_info["cluster_name_ko"] = drama_info["cluster_id"].map(drama_cluster_ko)

movie_summary["cluster_name_ko"] = movie_summary["cluster_id"].map(movie_cluster_ko)
drama_summary["cluster_name_ko"] = drama_summary["cluster_id"].map(drama_cluster_ko)

# =========================
# 4) 누락 체크(필수)
# =========================
def _check_missing(df, label):
    """Report whether every row of ``df`` has a Korean cluster name.

    Prints a warning listing the distinct ``cluster_id`` values whose
    ``cluster_name_ko`` is missing, or an OK line when none are missing.
    ``label`` identifies the frame in the printed message.
    """
    unnamed_rows = df["cluster_name_ko"].isna()
    miss = df.loc[unnamed_rows, "cluster_id"].unique().tolist()
    if not miss:
        print(f"✅ {label} 매핑 OK")
        return
    print(f"⚠️ {label} 매핑 누락 cluster_id:", miss)

# Report unmapped cluster ids for each of the four frames.
_check_missing(movie_info, "movie_info")
_check_missing(drama_info, "drama_info")
_check_missing(movie_summary, "movie_summary")
_check_missing(drama_summary, "drama_summary")

# =========================
# 5) Save (versions with the Korean names)
# =========================
movie_info.to_csv("review_topic_info_movie_50k_clustered_ko.csv", index=False)
drama_info.to_csv("review_topic_info_drama_50k_clustered_ko.csv", index=False)

movie_summary.to_csv("review_topic_cluster_summary_movie_ko.csv", index=False)
drama_summary.to_csv("review_topic_cluster_summary_drama_ko.csv", index=False)

print("✅ 저장 완료: *_clustered_ko.csv, *_summary_ko.csv")
