In [36]:
import pandas as pd
from pathlib import Path
from IPython.display import display
import os
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# Locations are expressed relative to src (the directory the notebook runs from).
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR.joinpath("data")
VIDEO_META_PATH = DATA_DIR.joinpath("channel_videos_with_stats.csv")

# Load the per-video metadata/statistics table.
df_videos_full = pd.read_csv(VIDEO_META_PATH)

# Cell output: (rows, columns) of the loaded table.
df_videos_full.shape


(10371, 12)

In [3]:
metrics = ["view_count", "like_count", "comment_count"]

# Print centre, spread and upper-tail statistics for every engagement metric.
for col in metrics:
    column_values = df_videos_full[col]
    print(f"\n=== {col} ===")
    labelled_stats = [
        ("mean:", column_values.mean()),
        ("std :", column_values.std()),
        ("median:", column_values.median()),
        ("90%:", column_values.quantile(0.9)),
        ("95%:", column_values.quantile(0.95)),
    ]
    for stat_label, stat_value in labelled_stats:
        print(stat_label, stat_value)


=== view_count ===
mean: 506047.0724134606
std : 1172463.5224575703
median: 234229.0
90%: 975756.0
95%: 1676500.5

=== like_count ===
mean: 22542.369009738693
std : 38433.97926105028
median: 12002.0
90%: 44755.0
95%: 71140.5

=== comment_count ===
mean: 574.3854980233343
std : 1013.8170662675028
median: 360.0
90%: 1064.0
95%: 1623.5


In [4]:
# 90th-percentile cut-off of each engagement metric over all videos.
p90_by_metric = {
    metric_name: df_videos_full[metric_name].quantile(0.9)
    for metric_name in ("view_count", "like_count", "comment_count")
}
view_threshold = p90_by_metric["view_count"]
like_threshold = p90_by_metric["like_count"]
comment_threshold = p90_by_metric["comment_count"]

print("=== Thresholds ===")
for threshold_label, threshold_value in (
    ("view_th:", view_threshold),
    ("like_th:", like_threshold),
    ("comment_th:", comment_threshold),
):
    print(threshold_label, threshold_value)

=== Thresholds ===
view_th: 975756.0
like_th: 44755.0
comment_th: 1064.0


In [5]:
# A video is selected only when it reaches all three thresholds at once.
meets_view_cut = df_videos_full["view_count"].ge(view_threshold)
meets_like_cut = df_videos_full["like_count"].ge(like_threshold)
meets_comment_cut = df_videos_full["comment_count"].ge(comment_threshold)

df_target = df_videos_full[meets_view_cut & meets_like_cut & meets_comment_cut]

print("Total selected videos:", len(df_target))

Total selected videos: 406


In [15]:
# Count the selected videos per group and list the largest group first.
videos_per_group = df_target.groupby("group_label").size()
group_counts = videos_per_group.sort_values(ascending=False)

print(group_counts)

group_label
IVE            253
RIIZE           61
ISEGYE_IDOL     49
PLAVE           43
dtype: int64


In [7]:
# Rank every video by views, then keep the 30 most-viewed rows of each group.
# Note: this rebinds `df_target` to a per-group top-30 selection.
ranked_by_views = df_videos_full.sort_values("view_count", ascending=False)
top_rows_per_group = ranked_by_views.groupby("group_label").head(30)
df_target = top_rows_per_group.reset_index(drop=True)

# Check the number of rows kept per group.
df_target.groupby("group_label").size()

group_label
ISEGYE_IDOL    30
IVE            30
PLAVE          30
RIIZE          30
dtype: int64

In [None]:
# Rows that are at or above the view, like and comment thresholds simultaneously.
passes_all_thresholds = (
    df_videos_full["view_count"].ge(view_threshold)
    & df_videos_full["like_count"].ge(like_threshold)
    & df_videos_full["comment_count"].ge(comment_threshold)
)
target_videos = df_videos_full.loc[passes_all_thresholds]

In [26]:
# Number of rows in the thresholded selection.
target_videos.shape[0]

NameError: name 'target_videos' is not defined

In [None]:
# How many of the selected videos belong to each channel.
target_videos.loc[:, "channel_label"].value_counts()

In [None]:
# Alternative cut-offs: one standard deviation above the mean of each metric.
# Running this cell rebinds view_threshold / like_threshold / comment_threshold.
view_counts = df_videos_full["view_count"]
like_counts = df_videos_full["like_count"]
comment_counts = df_videos_full["comment_count"]

view_threshold = view_counts.mean() + view_counts.std()
like_threshold = like_counts.mean() + like_counts.std()
comment_threshold = comment_counts.mean() + comment_counts.std()


In [6]:
# Select the TOP_N most-viewed LONG videos within each group_label.
TOP_N = 30

long_videos = df_videos_full.query("video_type == 'LONG'")
long_videos_by_views = long_videos.sort_values("view_count", ascending=False)
top_videos = (
    long_videos_by_views
    .groupby("group_label")
    .head(TOP_N)
    .reset_index(drop=True)
)

# Columns shown in the preview table below.
preview_columns = [
    "channel_label",
    "group_label",
    "member_label",
    "video_title",
    "view_count",
    "video_type",
]
top_videos[preview_columns]


Unnamed: 0,channel_label,group_label,member_label,video_title,view_count,video_type
0,IVE,IVE,,IVE 아이브 'LOVE DIVE' DANCE PRACTICE,38858713,LONG
1,IVE,IVE,,LOVE DIVE,27785691,LONG
2,IVE,IVE,,GOLDEN Covered by IVE ANYUJIN,21672211,LONG
3,IVE,IVE,,IVE 아이브 'ELEVEN' DANCE PRACTICE (Fix ver.),20196652,LONG
4,IVE,IVE,,IVE 아이브 'All Night (Feat. Saweetie)' Official ...,20008768,LONG
...,...,...,...,...,...,...
115,RIIZE,RIIZE,,RIIZE 라이즈 'Show Me Love' MV | ODYSSEY - The 1s...,1363534,LONG
116,RIIZE,RIIZE,,RIIZE 라이즈 'Midnight Mirage' MV | ODYSSEY - The...,1350575,LONG
117,RIIZE,RIIZE,,'Kenshi Yonezu - Eine Kleine' by EUNSEOK,1333921,LONG
118,RIIZE,RIIZE,,‘Leroy Sanchez - Don't Let Me Down’ by WONBIN,1320296,LONG


In [31]:
# Lift every pandas display limit for the rest of the session (global setting).
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

# Optional full listing of the rows that carry a member label:
# display(top_videos[top_videos["member_label"].notna()])

# Number of top videos attributed to each individual member (rows without a label are ignored).
top_videos["member_label"].dropna().value_counts()


member_label
INE           12
LILPA          5
JURURU         3
VIICHAN        3
GOSEGU         2
JINGBURGER     1
Name: count, dtype: int64

In [None]:
from comments import fetch_all_comments

BASE_DIR = Path("..")        # relative to src
DATA_DIR = BASE_DIR / "data"
TOP30_VIDEO_PATH = DATA_DIR / "youtube_top30_all_comments.csv"

# Header of the output CSV. Batches are appended with header=False, so every
# appended row is matched to these names purely by position.
COMMENT_COLUMNS = [
    "video_id",
    "comment_id",
    "comment_text",
    "like_count",
    "published_at",
    "channel_label",
    "video_title",
]

# In-memory copy of the comments fetched during this run only; videos skipped
# because they are already in the CSV are NOT part of this list.
all_comments = []

# Create the file with just the header row if it does not exist yet.
if not TOP30_VIDEO_PATH.exists():
    pd.DataFrame(columns=COMMENT_COLUMNS).to_csv(
        TOP30_VIDEO_PATH, index=False, encoding="utf-8-sig"
    )

# Resume support: collect the video_ids that already have rows in the CSV.
df_existing = pd.read_csv(
    TOP30_VIDEO_PATH, usecols=["video_id"], encoding="utf-8-sig"
)
existing_video_ids = set(df_existing["video_id"].unique())
print(f"Already collected videos: {len(existing_video_ids)}")

# Fetch comments video by video and persist each batch immediately, so an
# interrupted run loses at most the video that was in flight.
for i, row in top_videos.iterrows():
    video_id = row["video_id"]
    title = row["video_title"]
    channel = row["channel_label"]

    # Skip videos whose comments are already stored.
    if video_id in existing_video_ids:
        print(f"\n[SKIP] {channel} | {title} (already collected)")
        continue

    # `i` doubles as a 0-based position because top_videos was built with
    # reset_index(drop=True).
    print(f"\n[{i+1}/{len(top_videos)}] {channel} | {title}")

    comments = fetch_all_comments(video_id)
    print(f"→ collected {len(comments)} comments")

    if not comments:
        # Nothing to append. The video leaves no row in the CSV, so a later
        # run will try to fetch it again.
        continue

    # Attach channel/video information to every comment record.
    for c in comments:
        c["channel_label"] = channel
        c["video_title"] = title

    df_batch = pd.DataFrame(comments)
    # The key names returned by fetch_all_comments are not visible from this
    # notebook. When they match the header exactly, reorder the columns so the
    # positional append cannot shift values under the wrong header; otherwise
    # keep the record order unchanged.
    if set(df_batch.columns) == set(COMMENT_COLUMNS):
        df_batch = df_batch[COMMENT_COLUMNS]

    df_batch.to_csv(
        TOP30_VIDEO_PATH,
        mode="a",          # append to the existing file
        header=False,      # the header row was written when the file was created
        index=False,
        encoding="utf-8-sig"
    )

    print("→ saved to CSV")

    # Optional in-memory accumulation of this run's comments.
    all_comments.extend(comments)



[1/120] IVE | IVE 아이브 'LOVE DIVE' DANCE PRACTICE
[comments] video=Bo2aD_I7-1U | page 1 | total=100
[comments] video=Bo2aD_I7-1U | page 2 | total=200
[comments] video=Bo2aD_I7-1U | page 3 | total=300
[comments] video=Bo2aD_I7-1U | page 4 | total=400
[comments] video=Bo2aD_I7-1U | page 5 | total=500
[comments] video=Bo2aD_I7-1U | page 6 | total=600
[comments] video=Bo2aD_I7-1U | page 7 | total=700
[comments] video=Bo2aD_I7-1U | page 8 | total=800
[comments] video=Bo2aD_I7-1U | page 9 | total=900
[comments] video=Bo2aD_I7-1U | page 10 | total=1000
[comments] video=Bo2aD_I7-1U | page 11 | total=1100
[comments] video=Bo2aD_I7-1U | page 12 | total=1200
[comments] video=Bo2aD_I7-1U | page 13 | total=1300
[comments] video=Bo2aD_I7-1U | page 14 | total=1400
[comments] video=Bo2aD_I7-1U | page 15 | total=1500
[comments] video=Bo2aD_I7-1U | page 16 | total=1600
[comments] video=Bo2aD_I7-1U | page 17 | total=1700
[comments] video=Bo2aD_I7-1U | page 18 | total=1800
[comments] video=Bo2aD_I7-1U | p

In [33]:
# Number of comment records accumulated in memory by the collection loop
# (videos skipped as "already collected" contribute nothing to this list).
len(all_comments)

526908

In [35]:
# Same file the collection loop appends to, batch by batch.
COMMENTS_CSV_PATH = Path("../data/youtube_top30_all_comments.csv")

# Comments fetched during this kernel session only.
df_session = pd.DataFrame(all_comments)

# On a resumed run the collection loop skips videos that are already stored,
# so the file can hold more rows than `all_comments`. Overwriting it from
# memory would then discard the earlier batches, so compare sizes first.
df_on_disk = None
if COMMENTS_CSV_PATH.exists():
    df_on_disk = pd.read_csv(COMMENTS_CSV_PATH, encoding="utf-8-sig")

keep_file = df_on_disk is not None and len(df_on_disk) > len(df_session)
df_comments = df_on_disk if keep_file else df_session

print("rows:", len(df_comments))
# A bare `df_comments.head()` in the middle of a cell renders nothing.
display(df_comments.head())

if keep_file:
    print(
        f"CSV left untouched: file has {len(df_on_disk)} rows, "
        f"this session collected only {len(df_session)}"
    )
else:
    # Save the comment metadata.
    df_comments.to_csv(
        COMMENTS_CSV_PATH,
        index=False,
        encoding="utf-8-sig"
    )
    print("✅ CSV 저장 완료")

rows: 526908
✅ CSV 저장 완료
