In [1]:
from pathlib import Path
import re

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use("Agg")  # 노트북에서도 파일 저장 안정화
import matplotlib.pyplot as plt

# ====== 프로젝트 루트 지정 ======
# 노트북을 프로젝트 루트에서 열었다면 "." 그대로 사용
PROJECT_ROOT = Path(".").resolve()

DB_DIR = PROJECT_ROOT / "database"
PLOTS_DIR = PROJECT_ROOT / "review_analysis" / "plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT =", PROJECT_ROOT)
print("DB_DIR       =", DB_DIR)
print("PLOTS_DIR    =", PLOTS_DIR)


PROJECT_ROOT = C:\Users\hsmoo\Desktop\　\학회_YBIGTA\Team_Project\2222\YBIGTA_newbie_team_project\review_analysis\preprocessing
DB_DIR       = C:\Users\hsmoo\Desktop\　\학회_YBIGTA\Team_Project\2222\YBIGTA_newbie_team_project\review_analysis\preprocessing\database
PLOTS_DIR    = C:\Users\hsmoo\Desktop\　\학회_YBIGTA\Team_Project\2222\YBIGTA_newbie_team_project\review_analysis\preprocessing\review_analysis\plots


In [4]:
from pathlib import Path

def find_project_root(start: Path) -> Path:
    """
    현재 노트북 위치(start)에서 위로 올라가면서
    'database'와 'review_analysis'가 동시에 있는 폴더를 프로젝트 루트로 판단.
    """
    start = start.resolve()
    for p in [start] + list(start.parents):
        if (p / "database").is_dir() and (p / "review_analysis").is_dir():
            return p
    raise FileNotFoundError("프로젝트 루트를 찾지 못했습니다. (database/ 와 review_analysis/ 둘 다 있는 폴더가 없음)")

# 노트북의 현재 작업 디렉토리 기준
CWD = Path(".").resolve()

PROJECT_ROOT = find_project_root(CWD)
DB_DIR = PROJECT_ROOT / "database"
PLOTS_DIR = PROJECT_ROOT / "review_analysis" / "plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print("CWD         =", CWD)
print("PROJECT_ROOT=", PROJECT_ROOT)
print("DB_DIR      =", DB_DIR)
print("PLOTS_DIR   =", PLOTS_DIR)
print("DB files    =", [p.name for p in DB_DIR.glob("preprocessed_reviews_*.csv")])


CWD         = C:\Users\hsmoo\Desktop\　\학회_YBIGTA\Team_Project\2222\YBIGTA_newbie_team_project\review_analysis\preprocessing
PROJECT_ROOT= C:\Users\hsmoo\Desktop\　\학회_YBIGTA\Team_Project\2222\YBIGTA_newbie_team_project
DB_DIR      = C:\Users\hsmoo\Desktop\　\학회_YBIGTA\Team_Project\2222\YBIGTA_newbie_team_project\database
PLOTS_DIR   = C:\Users\hsmoo\Desktop\　\학회_YBIGTA\Team_Project\2222\YBIGTA_newbie_team_project\review_analysis\plots
DB files    = ['preprocessed_reviews_aladin.csv', 'preprocessed_reviews_kyobo.csv', 'preprocessed_reviews_yes24.csv']


In [5]:
def read_csv_robust(path: Path) -> pd.DataFrame:
    for enc in ("utf-8-sig", "utf-8"):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    return pd.read_csv(path)

def load_preprocessed(site: str, path: Path) -> pd.DataFrame:
    df = read_csv_robust(path)

    # 필수 컬럼 확인(전처리 결과 파일 기준)
    required = {"rating", "date"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"[{site}] missing columns: {sorted(missing)}")

    # 타입 정리
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df = df.dropna(subset=["rating", "date"]).copy()

    # content_len이 없을 수 있으니 보정
    if "content_len" not in df.columns:
        base_col = "content_clean" if "content_clean" in df.columns else ("content" if "content" in df.columns else None)
        df["content_len"] = df[base_col].astype(str).str.len() if base_col else 0

    # 사이트 라벨 컬럼
    df["site"] = site
    return df

paths = {
    "aladin": DB_DIR / "preprocessed_reviews_aladin.csv",
    "kyobo":  DB_DIR / "preprocessed_reviews_kyobo.csv",
    "yes24":  DB_DIR / "preprocessed_reviews_yes24.csv",
}

dfs = {site: load_preprocessed(site, p) for site, p in paths.items()}

for site, df in dfs.items():
    print(site, df.shape, df["date"].min().date(), "~", df["date"].max().date())


aladin (545, 211) 2014-05-12 ~ 2026-01-09
kyobo (584, 12) 2025-01-17 ~ 2026-01-17
yes24 (544, 1012) 2014-05-23 ~ 2026-01-17


In [6]:
plt.figure()

for site, df in dfs.items():
    monthly_cnt = df.set_index("date").resample("ME").size()  # 월말 기준
    monthly_cnt.sort_index().plot(label=site)

plt.title("Monthly Review Count by Site")
plt.xlabel("month")
plt.ylabel("count")
plt.legend()
plt.tight_layout()

out = PLOTS_DIR / "compare_monthly_review_count.png"
plt.savefig(out, dpi=200)
plt.close()

out


WindowsPath('C:/Users/hsmoo/Desktop/\u3000/학회_YBIGTA/Team_Project/2222/YBIGTA_newbie_team_project/review_analysis/plots/compare_monthly_review_count.png')

In [7]:
plt.figure()

for site, df in dfs.items():
    monthly_mean = df.set_index("date")["rating"].resample("ME").mean()
    monthly_mean.sort_index().plot(label=site)

plt.title("Monthly Mean Rating by Site")
plt.xlabel("month")
plt.ylabel("mean_rating")
plt.legend()
plt.tight_layout()

out = PLOTS_DIR / "compare_monthly_mean_rating.png"
plt.savefig(out, dpi=200)
plt.close()

out


WindowsPath('C:/Users/hsmoo/Desktop/\u3000/학회_YBIGTA/Team_Project/2222/YBIGTA_newbie_team_project/review_analysis/plots/compare_monthly_mean_rating.png')

In [8]:
def tfidf_columns(df: pd.DataFrame) -> list[str]:
    return [c for c in df.columns if c.startswith("tfidf__")]

def top_tfidf_terms(df: pd.DataFrame, top_n: int = 20) -> pd.Series:
    cols = tfidf_columns(df)
    if not cols:
        raise ValueError("No TF-IDF columns found (tfidf__*)")
    means = df[cols].mean(axis=0).sort_values(ascending=False)
    means.index = means.index.str.replace("^tfidf__", "", regex=True)
    return means.head(top_n)

TOP_N = 20

# 사이트별 top 키워드 bar chart (사이트마다 1장씩 저장)
for site, df in dfs.items():
    top_terms = top_tfidf_terms(df, top_n=TOP_N)

    plt.figure()
    plt.barh(top_terms.index[::-1], top_terms.values[::-1])
    plt.title(f"Top {TOP_N} TF-IDF Terms ({site})")
    plt.xlabel("mean tf-idf")
    plt.tight_layout()

    out = PLOTS_DIR / f"compare_top_tfidf_{site}.png"
    plt.savefig(out, dpi=200)
    plt.close()

print("saved:", [f"compare_top_tfidf_{s}.png" for s in dfs.keys()])


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)
  plt.savefig(out, dpi=200)

ValueError: No TF-IDF columns found (tfidf__*)