In [2]:
import pandas as pd

In [3]:
comment1 = pd.read_csv("comments1.csv")
comment2 = pd.read_csv("comments2.csv")
comment3 = pd.read_csv("comments3.csv")
comment4 = pd.read_csv("comments4.csv")
comment5 = pd.read_csv("comments5.csv")

print(comment1.head())
print(comment2.head())
print(comment3.head())
print(comment4.head())
print(comment5.head())

              kind  commentId  channelId  videoId  authorId  \
0  youtube#comment    1781382      14492    74288   2032536   
1  youtube#comment     289571      14727    79618   3043229   
2  youtube#comment     569077       3314    51826    917006   
3  youtube#comment    2957962       5008    58298   1853470   
4  youtube#comment     673093      21411     1265   2584166   

                                        textOriginal  parentCommentId  \
0  PLEASE LESBIAN FLAG I BEG YOU \n\nYou would ro...              NaN   
1   Apply mashed potato juice and mixed it with curd        3198066.0   
2                         69 missed calls from mars👽              NaN   
3                                               Baaa              NaN   
4    you look like raven from phenomena raven no cap              NaN   

   likeCount                publishedAt                  updatedAt  
0          0  2023-08-15 21:48:52+00:00  2023-08-15 21:48:52+00:00  
1          0  2023-10-02 13:08:22+00:00  202

In [4]:
# --- Imports
import os
import re
import math
import numpy as np
import pandas as pd

# -----------------------------
# Config
# -----------------------------
INPUT_FILES = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv",
]
COMBINED_PARQUET = "comments_combined.parquet"     # compact & fast
CLEANED_PARQUET  = "comments_cleaned.parquet"
SAMPLE_CSV       = "comments_sample_100k.csv"

# Columns to keep (adjust if you need more)
USECOLS = [
    "kind", "commentId", "channelId", "videoId", "authorId",
    "textOriginal", "parentCommentId", "likeCount", "publishedAt", "updatedAt"
]

CHUNKSIZE = 200_000         # tune for your memory
RANDOM_STATE = 42

# -----------------------------
# Helpers
# -----------------------------
url_re = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
whitespace_re = re.compile(r"\s+")
punct_run_re = re.compile(r"([!?.]){2,}")

def clean_text_keep_emojis(s: str) -> str:
    """
    Light cleaning that preserves emojis and non-ASCII characters.
    - lowercases
    - removes URLs
    - normalizes whitespace
    - collapses repeated terminal punctuation
    - keeps emojis, accents, etc.
    """
    if not isinstance(s, str):
        return ""
    x = s
    x = url_re.sub(" ", x)                 # drop URLs
    x = punct_run_re.sub(r"\1", x)         # "!!!" -> "!"
    x = x.replace("\r", " ").replace("\n", " ")
    x = whitespace_re.sub(" ", x).strip()
    x = x.lower()
    return x

# Quick detectors that won't remove emojis
only_emoji_or_punct_re = re.compile(r"^[\W_]+$", re.UNICODE)  # no letters/digits

def is_emoji_or_punct_only(s: str) -> bool:
    if not s:
        return False
    return bool(only_emoji_or_punct_re.match(s))

# -----------------------------
# Topic rules (transparent & auditable)
# Order matters: first match wins.
# -----------------------------
TOPIC_RULES = [
    ("spam_or_promo",      re.compile(r"\b(subscribe|promo|discount|deal|sale|follow me|check my|giveaway)\b|(\.com|http)", re.IGNORECASE)),
    ("request_or_question",re.compile(r"\b(how|what|where|when|why|can you|could you|pls|please|link)\b|\?$", re.IGNORECASE)),
    ("compliment",         re.compile(r"\b(beautiful|pretty|cute|gorgeous|amazing|love|slay|killed it|stunning|awesome|nice|well done|good job)\b", re.IGNORECASE)),
    ("critique",           re.compile(r"\b(terrible|bad|hate|worst|awful|trash|shame|mid)\b", re.IGNORECASE)),
    ("religion",           re.compile(r"\b(mashallah|insha|allah|amen|god bless|hallelujah)\b", re.IGNORECASE)),
    ("nationality_language",re.compile(r"\b(brazil|ind(ia|ian)|american|arab(ic)?|spanish|french|german|russian|chinese|malay|english)\b|🇧🇷|🇮🇳|🇺🇸|🇵🇰|🇧🇩|🇬🇧|🇨🇳|🇲🇾", re.IGNORECASE)),
    ("beauty_and_hair",    re.compile(r"\b(hair|wig|makeup|skincare|mask|serum|blush|foundation|balayage|lipstick|eyeliner|hydration)\b", re.IGNORECASE)),
    ("humor_meme",         re.compile(r"\b(lol|lmao|lmfao|haha|hehe)\b|🤣|😂", re.IGNORECASE)),
]

def assign_topic(row_text: str, parent_comment_id) -> str:
    # emoji-only / punct-only
    if is_emoji_or_punct_only(row_text):
        return "emoji_only"

    # reply thread detector
    if pd.notna(parent_comment_id):
        # still allow other rules to override if clearly matched
        for label, pat in TOPIC_RULES:
            if pat.search(row_text):
                return label
        return "reply"

    # rule-based topics
    for label, pat in TOPIC_RULES:
        if pat.search(row_text):
            return label

    return "other"

# -----------------------------
# 1) Combine CSVs (chunked, memory-safe) → Parquet
# -----------------------------
def combine_csvs_to_parquet(input_files, out_parquet, usecols=None, chunksize=CHUNKSIZE):
    if os.path.exists(out_parquet):
        os.remove(out_parquet)

    writer = None
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
        for f in input_files:
            for chunk in pd.read_csv(f, usecols=usecols, chunksize=chunksize, dtype_backend="pyarrow"):
                # Ensure consistent dtypes by letting Arrow infer
                table = pa.Table.from_pandas(chunk, preserve_index=False)
                if writer is None:
                    writer = pq.ParquetWriter(out_parquet, table.schema, compression="snappy")
                writer.write_table(table)
    finally:
        if writer is not None:
            writer.close()

# -----------------------------
# 2) Clean + Topic label → Parquet
# -----------------------------
def clean_and_label_to_parquet(in_parquet, out_parquet, chunksize=CHUNKSIZE):
    if os.path.exists(out_parquet):
        os.remove(out_parquet)

    import pyarrow as pa
    import pyarrow.parquet as pq

    reader = pq.ParquetFile(in_parquet)
    writer = None

    # Iterate over row groups to stay memory-safe
    for rg in range(reader.num_row_groups):
        table = reader.read_row_group(rg)
        df = table.to_pandas(types_mapper=pd.ArrowDtype).copy()

        # Light clean
        df["text_clean"] = df["textOriginal"].apply(clean_text_keep_emojis)

        # Topic
        df["topic"] = [
            assign_topic(txt, pid) for txt, pid in zip(df["text_clean"], df.get("parentCommentId"))
        ]

        # Keep only what you need going forward (optional)
        keep = ["commentId", "channelId", "videoId", "authorId",
                "textOriginal", "text_clean", "parentCommentId",
                "likeCount", "publishedAt", "updatedAt", "topic"]
        keep = [c for c in keep if c in df.columns]
        df = df[keep]

        t = pa.Table.from_pandas(df, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(out_parquet, t.schema, compression="snappy")
        writer.write_table(t)

    if writer is not None:
        writer.close()

# -----------------------------
# 3) Proportional stratified sample to EXACT N rows
# -----------------------------
def proportional_stratified_sample(parquet_path, out_csv, n=100_000, random_state=RANDOM_STATE):
    """
    Reads by row groups to avoid loading everything.
    Accumulates per-topic rows, then samples proportionally to topic prevalence.
    """
    import pyarrow.parquet as pq

    # First pass: topic counts
    reader = pq.ParquetFile(parquet_path)
    topic_counts = {}
    total = 0
    for rg in range(reader.num_row_groups):
        df = reader.read_row_group(rg, columns=["topic"]).to_pandas()
        vc = df["topic"].value_counts()
        for k, v in vc.items():
            topic_counts[k] = topic_counts.get(k, 0) + int(v)
        total += len(df)

    # Compute target per-topic sizes (rounded, corrected to exact n)
    topics = sorted(topic_counts.keys())
    proportions = {t: topic_counts[t] / total for t in topics}
    target_raw = {t: proportions[t] * n for t in topics}
    target = {t: int(math.floor(target_raw[t])) for t in topics}
    # Fix rounding to match n exactly
    deficit = n - sum(target.values())
    if deficit > 0:
        # Assign the leftover rows to topics with largest fractional parts
        remainders = sorted(((t, target_raw[t] - target[t]) for t in topics),
                            key=lambda x: x[1], reverse=True)
        for i in range(deficit):
            target[remainders[i % len(remainders)][0]] += 1

    # Second pass: sample within each topic
    # We'll gather per-topic buffers across row groups until we have enough to sample.
    buffers = {t: [] for t in topics}

    # We only need minimal columns in the final sample; keep what you like
    keep_cols = ["commentId", "videoId", "authorId", "textOriginal", "text_clean",
                 "parentCommentId", "likeCount", "publishedAt", "updatedAt", "topic"]

    for rg in range(reader.num_row_groups):
        df = reader.read_row_group(rg).to_pandas()
        if "text_clean" not in df.columns:
            # ensure cleaning/labeling was done
            raise RuntimeError("Expected 'text_clean' in cleaned parquet. Run clean_and_label_to_parquet first.")
        for t in topics:
            needed = max(0, target[t] - sum(len(x) for x in buffers[t]))
            if needed == 0:
                continue
            # take all rows of this topic in the chunk (not sampled yet)
            sub = df.loc[df["topic"] == t, keep_cols]
            if len(sub):
                buffers[t].append(sub)

    # Concatenate topic-wise and sample exact target size
    sampled_parts = []
    rng = np.random.RandomState(random_state)
    for t in topics:
        if len(buffers[t]) == 0:
            continue
        pool = pd.concat(buffers[t], ignore_index=True)
        k = min(target[t], len(pool))
        if k == 0:
            continue
        sampled = pool.sample(n=k, random_state=rng)
        sampled_parts.append(sampled)

    sample_df = pd.concat(sampled_parts, ignore_index=True)
    # If for any reason we're short (e.g., some rare topic), top-up from the largest topic
    if len(sample_df) < n:
        short = n - len(sample_df)
        # find largest available topic to top-up
        topic_sizes = sample_df["topic"].value_counts()
        biggest_topic = topic_sizes.index[0]
        # read again and pull more from that topic not already in sample
        # (to keep it simple, we'll just accept being slightly short if truly impossible)
        print(f"Warning: sample short by {short} rows; could not fully meet per-topic targets.")

    sample_df.to_csv(out_csv, index=False)
    return sample_df

# -----------------------------
# Run the pipeline
# -----------------------------
if __name__ == "__main__":
    # 1) Combine CSVs → Parquet
    print("Combining CSVs → Parquet...")
    combine_csvs_to_parquet(INPUT_FILES, COMBINED_PARQUET, usecols=USECOLS, chunksize=CHUNKSIZE)

    # 2) Clean + Topic labeling → Parquet
    print("Cleaning text & assigning topics → Parquet...")
    clean_and_label_to_parquet(COMBINED_PARQUET, CLEANED_PARQUET, chunksize=CHUNKSIZE)

    # 3) Stratified sample to 100k
    print("Stratified sampling to 100,000 rows → CSV...")
    sample_df = proportional_stratified_sample(CLEANED_PARQUET, SAMPLE_CSV, n=100_000, random_state=RANDOM_STATE)

    print("Done.")
    print(sample_df["topic"].value_counts())


Combining CSVs → Parquet...
Cleaning text & assigning topics → Parquet...
Stratified sampling to 100,000 rows → CSV...
Done.
topic
other                   43274
compliment              16160
request_or_question     13146
emoji_only               7718
reply                    6595
beauty_and_hair          5779
humor_meme               4018
nationality_language     1980
critique                  796
religion                  276
spam_or_promo             258
Name: count, dtype: int64


Combining CSVs → Parquet...
Cleaning text & assigning topics → Parquet...
Stratified sampling to 100,000 rows → CSV...
Done.
topic
other                   43274
compliment              16160
request_or_question     13146
emoji_only               7718
reply                    6595
beauty_and_hair          5779
humor_meme               4018
nationality_language     1980
critique                  796
religion                  276
spam_or_promo             258
Name: count, dtype: int64

In [6]:
# FILE: nmf_topic_stratified_sampling.py
# Run with: python nmf_topic_stratified_sampling.py
import os
import re
import math
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# -----------------------
# Config - tune these
# -----------------------
INPUT_FILES = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv",
]
CLEANED_CHUNKS_DIR = "topic_chunks"          # will store chunked parquet files with topic labels
os.makedirs(CLEANED_CHUNKS_DIR, exist_ok=True)

SAMPLE_SIZE = 200_000        # size to train the NMF on (200k is a good balance)
N_TOPICS = 30               # number of topics for NMF
MAX_FEATURES = 20000         # vocabulary size for TF-IDF (memory control)
CHUNKSIZE = 200_000         # rows read per chunk from CSV (tune to your memory)
RANDOM_STATE = 42
FINAL_SAMPLE_SIZE = 100_000  # desired stratified sample size

# -----------------------
# Helpers
# -----------------------
url_re = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
ws_re = re.compile(r"\s+")

def clean_text_keep_emojis(s):
    """Light cleaning but keep emojis and non-ascii chars."""
    if not isinstance(s, str):
        return ""
    x = url_re.sub(" ", s)
    x = x.replace("\r", " ").replace("\n", " ")
    x = ws_re.sub(" ", x).strip()
    return x.lower()

# tokenizer that preserves emojis and punctuation tokens as separate tokens
def simple_tokenizer(text):
    # capture word tokens OR single non-space non-word characters (emojis/punct)
    # This will keep emojis as tokens
    return re.findall(r"\w+|[^\s\w]", text, flags=re.UNICODE)

# -----------------------
# Step 0: helper to stream CSVs as DataFrame chunks
# -----------------------
def csv_chunks(files, chunksize=100_000, usecols=None):
    """Yields tuples (df_chunk, source_filename, global_chunk_idx)."""
    idx = 0
    for fname in files:
        for chunk in pd.read_csv(fname, usecols=usecols, chunksize=chunksize, dtype=str):
            yield chunk, fname, idx
            idx += 1

# -----------------------
# Step 1: Build a training sample (SAMPLE_SIZE) by streaming
# -----------------------
def build_training_sample(files, sample_size=SAMPLE_SIZE, chunksize=CHUNKSIZE):
    rng = np.random.RandomState(RANDOM_STATE)
    sample_texts = []
    sample_ids = []
    total_seen = 0
    for chunk, fname, _ in csv_chunks(files, chunksize=chunksize, usecols=["commentId", "textOriginal"]):
        # clean text column
        chunk["text_clean"] = chunk["textOriginal"].apply(clean_text_keep_emojis)
        # reservoir-like sampling: append and then sample if we exceed
        sample_texts.extend(chunk["text_clean"].tolist())
        sample_ids.extend(chunk["commentId"].tolist())
        total_seen += len(chunk)
        # if collected many more than sample_size, downsample to sample_size
        if len(sample_texts) > sample_size * 1.5:
            chosen_idx = rng.choice(len(sample_texts), size=sample_size, replace=False)
            sample_texts = [sample_texts[i] for i in chosen_idx]
            sample_ids = [sample_ids[i] for i in chosen_idx]
        if len(sample_texts) >= sample_size:
            break

    # final trimming (random if more collected)
    if len(sample_texts) > sample_size:
        chosen_idx = rng.choice(len(sample_texts), size=sample_size, replace=False)
        sample_texts = [sample_texts[i] for i in chosen_idx]
        sample_ids = [sample_ids[i] for i in chosen_idx]

    sample_df = pd.DataFrame({"commentId": sample_ids, "text_clean": sample_texts})
    print(f"Built training sample: {len(sample_df)} rows (from ~{total_seen} seen).")
    return sample_df

# -----------------------
# Step 2: Fit TF-IDF vectorizer and NMF on the sample
# -----------------------
def fit_vectorizer_and_nmf(sample_df, n_topics=N_TOPICS, max_features=MAX_FEATURES):
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        tokenizer=simple_tokenizer,
        lowercase=True,
        stop_words="english"
    )
    print("Fitting TF-IDF vectorizer on sample...")
    dtm_sample = vectorizer.fit_transform(sample_df["text_clean"])
    print(f"DTM sample shape: {dtm_sample.shape}")

    print(f"Fitting NMF ({n_topics} topics)...")
    nmf = NMF(
        n_components=n_topics,
        random_state=RANDOM_STATE,
        init="nndsvda",
        max_iter=200,
        tol=1e-4
    )
    W = nmf.fit_transform(dtm_sample)  # W: (n_samples, n_topics)
    H = nmf.components_                 # H: (n_topics, n_features)
    print("NMF fit complete.")
    return vectorizer, nmf

# -----------------------
# Step 3: Apply model to all comments chunk-by-chunk and write per-chunk parquet with topic
# -----------------------
def apply_model_and_write_chunks(files, vectorizer, nmf, chunksize=CHUNKSIZE, out_dir=CLEANED_CHUNKS_DIR):
    chunk_idx = 0
    topic_counts = {}
    for chunk, fname, _ in tqdm(csv_chunks(files, chunksize=chunksize, usecols=None), desc="Processing chunks"):
        # ensure textOriginal exists
        if "textOriginal" not in chunk.columns:
            # try possible column names or skip
            if "text" in chunk.columns:
                chunk["textOriginal"] = chunk["text"]
            else:
                chunk["textOriginal"] = ""
        chunk["text_clean"] = chunk["textOriginal"].apply(clean_text_keep_emojis)

        # vectorize and transform to topic space
        dtm = vectorizer.transform(chunk["text_clean"])
        # nmf.transform solves for W (non-negative least squares) per document
        topic_dist = nmf.transform(dtm)          # shape (n_docs_in_chunk, n_topics)
        dominant_topic = np.argmax(topic_dist, axis=1)
        chunk["topic"] = dominant_topic.astype(int)

        # update running counts
        unique, counts = np.unique(dominant_topic, return_counts=True)
        for u, c in zip(unique, counts):
            topic_counts[int(u)] = topic_counts.get(int(u), 0) + int(c)

        # write chunk to parquet
        out_path = os.path.join(out_dir, f"chunk_{chunk_idx:05d}.parquet")
        # keep only necessary columns to save space
        keep_cols = [c for c in ["commentId", "channelId", "videoId", "authorId",
                                 "textOriginal", "text_clean", "parentCommentId",
                                 "likeCount", "publishedAt", "updatedAt", "topic"] if c in chunk.columns]
        chunk[keep_cols].to_parquet(out_path, index=False)
        chunk_idx += 1

    print("All chunks processed and saved.")
    return topic_counts, chunk_idx

# -----------------------
# Step 4: Proportional stratified sampling to exactly FINAL_SAMPLE_SIZE
# -----------------------
def stratified_sample_from_chunks(chunks_dir, topic_counts, total_docs, final_n=FINAL_SAMPLE_SIZE, random_state=RANDOM_STATE):
    # compute target allocation per topic
    topics = sorted(topic_counts.keys())
    proportions = {t: topic_counts[t] / total_docs for t in topics}
    target_raw = {t: proportions[t] * final_n for t in topics}
    target = {t: int(math.floor(target_raw[t])) for t in topics}
    deficit = final_n - sum(target.values())
    if deficit > 0:
        remainders = sorted(((t, target_raw[t] - target[t]) for t in topics), key=lambda x: x[1], reverse=True)
        for i in range(deficit):
            target[remainders[i % len(remainders)][0]] += 1

    print("Sampling targets per topic computed.")

    # Now iterate chunks and draw per-topic samples until targets filled
    rng = np.random.RandomState(random_state)
    buffers = {t: [] for t in topics}  # will hold dataframes
    filled = {t: 0 for t in topics}

    chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.parquet")))
    for pf in tqdm(chunk_files, desc="Sampling from chunks"):
        df = pd.read_parquet(pf)
        for t in topics:
            need = target[t] - filled[t]
            if need <= 0:
                continue
            sub = df.loc[df["topic"] == t]
            if len(sub) == 0:
                continue
            # if more rows than needed, sample
            take = sub.sample(n=min(need, len(sub)), random_state=rng)
            buffers[t].append(take)
            filled[t] += len(take)

        # quick exit if all filled
        if sum(filled.values()) >= final_n:
            break

    # concat results
    parts = []
    for t in topics:
        if buffers[t]:
            parts.append(pd.concat(buffers[t], ignore_index=True))

    sample_df = pd.concat(parts, ignore_index=True)
    # safety check: if slightly more (due to rounding), trim
    if len(sample_df) > final_n:
        sample_df = sample_df.sample(n=final_n, random_state=rng).reset_index(drop=True)

    print(f"Final sampled rows: {len(sample_df)}")
    return sample_df

# -----------------------
# Main pipeline
# -----------------------
def main():
    # 1) Build sample for training
    sample_df = build_training_sample(INPUT_FILES, sample_size=SAMPLE_SIZE, chunksize=CHUNKSIZE)

    # 2) Fit vectorizer + NMF on the sample
    vectorizer, nmf = fit_vectorizer_and_nmf(sample_df, n_topics=N_TOPICS, max_features=MAX_FEATURES)

    # 3) Apply the model to all comments in chunks and store chunked parquet files with 'topic' column
    topic_counts, n_chunks = apply_model_and_write_chunks(INPUT_FILES, vectorizer, nmf, chunksize=CHUNKSIZE, out_dir=CLEANED_CHUNKS_DIR)
    total_docs = sum(topic_counts.values())
    print(f"Total docs processed: {total_docs}, across {n_chunks} chunks.")
    print("Topic counts (sample):", topic_counts)

    # 4) Stratified sample exactly FINAL_SAMPLE_SIZE
    sample_df_final = stratified_sample_from_chunks(CLEANED_CHUNKS_DIR, topic_counts, total_docs, final_n=FINAL_SAMPLE_SIZE, random_state=RANDOM_STATE)
    sample_df_final.to_csv("comments_sample_100k_stratified.csv", index=False)
    print("Saved: comments_sample_100k_stratified.csv")

    # 5) Optional: save the NMF model / vectorizer using joblib for reuse
    try:
        import joblib
        joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
        joblib.dump(nmf, "nmf_model.joblib")
        print("Saved vectorizer and NMF model to disk (joblib).")
    except Exception as e:
        print("joblib save failed:", e)

if __name__ == "__main__":
    main()


Built training sample: 200000 rows (from ~200000 seen).
Fitting TF-IDF vectorizer on sample...




DTM sample shape: (200000, 20000)
Fitting NMF (30 topics)...
NMF fit complete.


Processing chunks: 24it [05:21, 13.41s/it]


All chunks processed and saved.
Total docs processed: 4725012, across 24 chunks.
Topic counts (sample): {0: 405422, 1: 275926, 2: 210693, 3: 190966, 4: 74528, 5: 144227, 6: 105778, 7: 68563, 8: 241746, 9: 65956, 10: 211182, 11: 103846, 12: 50468, 13: 108922, 14: 93366, 15: 82219, 16: 99540, 17: 52157, 18: 277952, 19: 44697, 20: 101607, 21: 159245, 22: 292234, 23: 415064, 24: 67276, 25: 126540, 26: 342162, 27: 131093, 28: 37153, 29: 144484}
Sampling targets per topic computed.


Sampling from chunks:   0%|                                                                     | 0/24 [00:02<?, ?it/s]


Final sampled rows: 100000
Saved: comments_sample_100k_stratified.csv
Saved vectorizer and NMF model to disk (joblib).


In [8]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading transformers-4.56.1-py3-none-any.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.6 MB 5.9 MB/s eta 0:00:02
   ------- -------------------------------- 2.1/11.6 MB 5.7 MB/s eta 0:00:02
   ------------- -------------------------- 3.9/11.6 MB 6.9 MB/s eta 0:00:02
   -------------------- ------------------- 6.0/11.6 MB 7.5 MB/s eta 0:00:01
   -------------------------- ------------- 7.6/11.6 MB 7.8 MB/s eta 0:00:01
   

In [9]:
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.model_selection import train_test_split
import re

# -----------------------------
# 1. Load data (assuming CSV)
# -----------------------------
df = pd.read_csv("comments.csv")   # replace with your merged dataset
print("Original dataset:", len(df))

# -----------------------------
# 2. Randomly choose 100k
# -----------------------------
df_sample = df.sample(n=100000, random_state=42)
print("Random sample size:", len(df_sample))

# -----------------------------
# 3. Zero-Shot Classification
# -----------------------------
# Load Hugging Face zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels to try — feel free to expand this list
candidate_labels = ["gaming", "music", "politics", "technology", "sports", 
                    "movies", "tv", "education", "food", "travel", "health"]

def classify_comment(text):
    try:
        result = classifier(text, candidate_labels)
        return result["labels"][0]   # take top predicted category
    except:
        return "other"

# Apply to 100k sample (⚠️ slow without GPU, batch it if needed)
df_sample["predicted_category"] = df_sample["text"].astype(str).apply(classify_comment)

# -----------------------------
# 4. Extract categories & keyword match
# -----------------------------
categories = df_sample["predicted_category"].unique().tolist()
print("Discovered categories:", categories)

# Build keyword dictionary from categories
keyword_dict = {cat: [cat] for cat in categories}   # simplest mapping

def keyword_match(text):
    text = str(text).lower()
    for cat, keywords in keyword_dict.items():
        for kw in keywords:
            if re.search(rf"\b{kw}\b", text):
                return cat
    return "other"

# Apply keyword matching to all 5M
df["category"] = df["text"].astype(str).apply(keyword_match)

# -----------------------------
# 5. Stratified Sampling
# -----------------------------
df_stratified, _ = train_test_split(
    df, 
    train_size=100000, 
    stratify=df["category"], 
    random_state=42
)

print("Final stratified sample size:", len(df_stratified))

# -----------------------------
# 6. Save results
# -----------------------------
df_stratified.to_csv("CORRECTONE!!!stratified_sample_100k.csv", index=False)
df.to_parquet("comments_with_categories.parquet", engine="pyarrow", index=False)

print("✅ Done! Categories assigned & stratified sample created.")


KeyboardInterrupt: 

In [2]:
import pandas as pd
import glob
from transformers import pipeline
import random

# ---------------------------
# 1. Combine CSV files into one BIG BOY
# ---------------------------
# Replace 'your_folder/*.csv' with your actual folder path
files = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv"
]

df_list = [pd.read_csv(f) for f in files]
BIG_BOY = pd.concat(df_list, ignore_index=True)
print("BIG BOY shape:", BIG_BOY.shape)

# ---------------------------
# 2. Initialize Zero-Shot Classifier
# ---------------------------
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define some candidate labels (categories you want to detect)
candidate_labels = ["politics", "gaming", "sports", "technology", "music", 
                    "education", "health", "entertainment", "finance", "other"]

# ---------------------------
# 3. Sample 20k three times and classify
# ---------------------------
def run_zero_shot(sample_df, run_name):
    results = []
    for comment in sample_df["comment_text"].tolist():
        try:
            res = classifier(comment, candidate_labels)
            results.append({
                "text": comment,
                "label": res["labels"][0],  # top predicted category
                "scores": res["scores"][0]
            })
        except:
            results.append({
                "text": comment,
                "label": "error",
                "scores": None
            })
    return pd.DataFrame(results)

# First 20k sample
sample1 = BIG_BOY.sample(20000, random_state=42)
classified1 = run_zero_shot(sample1, "run1")

# Second 20k sample
sample2 = BIG_BOY.sample(20000, random_state=123)
classified2 = run_zero_shot(sample2, "run2")

# Third 20k sample
sample3 = BIG_BOY.sample(20000, random_state=999)
classified3 = run_zero_shot(sample3, "run3")

# ---------------------------
# 4. Save results
# ---------------------------
classified1.to_csv("classified_run1.csv", index=False)
classified2.to_csv("classified_run2.csv", index=False)
classified3.to_csv("classified_run3.csv", index=False)

print("Done! Classified three random 20k samples.")


KeyboardInterrupt: 

In [3]:
NMF_sample = pd.read_csv("comments_sample_100k_stratified.csv")
NMF_sample.shape

(100000, 11)

In [4]:
NMF_sample.columns

Index(['commentId', 'channelId', 'videoId', 'authorId', 'textOriginal',
       'text_clean', 'parentCommentId', 'likeCount', 'publishedAt',
       'updatedAt', 'topic'],
      dtype='object')