In [8]:
import pandas as pd

# List of CSV files
csv_files = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv"
]

# Read and combine into one dataframe
df_list = [pd.read_csv(file) for file in csv_files]
big_boy = pd.concat(df_list, ignore_index=True)

# Save the combined dataset
big_boy.to_csv("BIG_BOY.csv", index=False)

print("✅ Combined dataset saved as BIG_BOY.csv")
print("Shape of BIG BOY:", big_boy.shape)


✅ Combined dataset saved as BIG_BOY.csv
Shape of BIG BOY: (4725012, 10)


In [10]:
# Randomly choose 20k rows
sample_100k = big_boy.sample(n=100000, random_state=42)  # random_state makes it reproducible

print("✅ Random sample of 100,000 created")
print("Shape of sample:", sample_100k.shape)


✅ Random sample of 100,000 created
Shape of sample: (100000, 10)


In [17]:
sample_100k.head

<bound method NDFrame.head of                     kind  commentId  channelId  videoId  authorId  \
3956772  youtube#comment    1999899      33069    36229   2377260   
3284632  youtube#comment    1235288      37371     5214   2495217   
1185839  youtube#comment    1120879      18073    76480   2689034   
238094   youtube#comment    3806774      46757    78060   3531698   
1107951  youtube#comment    1874941      33445    91263    968927   
...                  ...        ...        ...      ...       ...   
811650   youtube#comment    2788176      41246    87323   2446840   
32900    youtube#comment     668979        805    18615   3561486   
2355010  youtube#comment    4642725      33445    91263   3547007   
4197186  youtube#comment     365684      11679      743   1946253   
1421695  youtube#comment    2350664      16876    18101   3254686   

                                              textOriginal  parentCommentId  \
3956772  💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙...    

In [13]:
import nltk

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rayap\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords

# Load your 100k sample
df = sample_100k  # adjust filename

# English stopwords (download if needed: nltk.download("stopwords"))
stop_words = set(stopwords.words("english"))

def clean_text(text):
    # Lowercase
    text = str(text).lower()
    
    # Remove emojis and non-ASCII characters
    text = text.encode("ascii", "ignore").decode()
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Remove digits
    text = re.sub(r"\d+", "", text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    # Remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Apply cleaning to the "comment" column (adjust column name if needed)
df["clean_comment"] = df["textOriginal"].apply(clean_text)

# Save cleaned version
df.to_csv("sample_100k_clean.csv", index=False)

print("✅ Preprocessing complete. Cleaned text saved in 'clean_comment' column.")


✅ Preprocessing complete. Cleaned text saved in 'clean_comment' column.


In [24]:
CLEANsample100k = pd.read_csv("sample_100k_clean.csv")
CLEANsample100k['clean_comment'].head(10)

0                                                  NaN
1    reasons heavily attractive people dont get ask...
2                                                  NaN
3                            hmm whats next hair falls
4                                          pretty gurl
5                                         best ad ever
6                                            beautiful
7    doesnt seem like micheal whats admit fault wom...
8                          larga de ser ridcula mulher
9                              rupey kat overacting ke
Name: clean_comment, dtype: object

In [1]:
# FILE: nmf_topic_stratified_sampling.py
# Run with: python nmf_topic_stratified_sampling.py
import os
import re
import math
import glob
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# -----------------------
# Config - tune these
# -----------------------
INPUT_FILES = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv",
]
CLEANED_CHUNKS_DIR = "topic_chunks"          
os.makedirs(CLEANED_CHUNKS_DIR, exist_ok=True)

SAMPLE_SIZE = 200_000        
N_TOPICS = 30               
MAX_FEATURES = 20000         
CHUNKSIZE = 200_000         
RANDOM_STATE = 42
FINAL_SAMPLE_SIZE = 100_000  

# -----------------------
# Helpers
# -----------------------
url_re = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
ws_re = re.compile(r"\s+")

def clean_text_keep_emojis(s):
    if not isinstance(s, str):
        return ""
    x = url_re.sub(" ", s)
    x = x.replace("\r", " ").replace("\n", " ")
    x = ws_re.sub(" ", x).strip()
    return x.lower()

def simple_tokenizer(text):
    return re.findall(r"\w+|[^\s\w]", text, flags=re.UNICODE)

def csv_chunks(files, chunksize=100_000, usecols=None):
    idx = 0
    for fname in files:
        for chunk in pd.read_csv(fname, usecols=usecols, chunksize=chunksize, dtype=str):
            yield chunk, fname, idx
            idx += 1

def build_training_sample(files, sample_size=SAMPLE_SIZE, chunksize=CHUNKSIZE):
    rng = np.random.RandomState(RANDOM_STATE)
    sample_texts = []
    sample_ids = []
    total_seen = 0
    for chunk, fname, _ in csv_chunks(files, chunksize=chunksize, usecols=["commentId", "textOriginal"]):
        chunk["text_clean"] = chunk["textOriginal"].apply(clean_text_keep_emojis)
        sample_texts.extend(chunk["text_clean"].tolist())
        sample_ids.extend(chunk["commentId"].tolist())
        total_seen += len(chunk)
        if len(sample_texts) > sample_size * 1.5:
            chosen_idx = rng.choice(len(sample_texts), size=sample_size, replace=False)
            sample_texts = [sample_texts[i] for i in chosen_idx]
            sample_ids = [sample_ids[i] for i in chosen_idx]
        if len(sample_texts) >= sample_size:
            break
    if len(sample_texts) > sample_size:
        chosen_idx = rng.choice(len(sample_texts), size=sample_size, replace=False)
        sample_texts = [sample_texts[i] for i in chosen_idx]
        sample_ids = [sample_ids[i] for i in chosen_idx]
    sample_df = pd.DataFrame({"commentId": sample_ids, "text_clean": sample_texts})
    print(f"Built training sample: {len(sample_df)} rows (from ~{total_seen} seen).")
    return sample_df

def fit_vectorizer_and_nmf(sample_df, n_topics=N_TOPICS, max_features=MAX_FEATURES, n_top_words=15):
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        tokenizer=simple_tokenizer,
        lowercase=True,
        stop_words="english"
    )
    print("Fitting TF-IDF vectorizer on sample...")
    dtm_sample = vectorizer.fit_transform(sample_df["text_clean"])
    print(f"DTM sample shape: {dtm_sample.shape}")

    print(f"Fitting NMF ({n_topics} topics)...")
    nmf = NMF(
        n_components=n_topics,
        random_state=RANDOM_STATE,
        init="nndsvda",
        max_iter=200,
        tol=1e-4
    )
    W = nmf.fit_transform(dtm_sample)  
    H = nmf.components_                
    print("NMF fit complete.")

    # Print top words for each topic
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {}
    print("\n--- Top words per topic ---")
    for topic_idx, topic in enumerate(H):
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_indices]
        topic_keywords[topic_idx] = top_words
        print(f"Topic {topic_idx}: {' | '.join(top_words)}")
    print("---------------------------\n")

    return vectorizer, nmf, topic_keywords

def apply_model_and_write_chunks(files, vectorizer, nmf, chunksize=CHUNKSIZE, out_dir=CLEANED_CHUNKS_DIR):
    chunk_idx = 0
    topic_counts = {}
    for chunk, fname, _ in tqdm(csv_chunks(files, chunksize=chunksize, usecols=None), desc="Processing chunks"):
        if "textOriginal" not in chunk.columns:
            if "text" in chunk.columns:
                chunk["textOriginal"] = chunk["text"]
            else:
                chunk["textOriginal"] = ""
        chunk["text_clean"] = chunk["textOriginal"].apply(clean_text_keep_emojis)
        dtm = vectorizer.transform(chunk["text_clean"])
        topic_dist = nmf.transform(dtm)          
        dominant_topic = np.argmax(topic_dist, axis=1)
        chunk["topic"] = dominant_topic.astype(int)

        unique, counts = np.unique(dominant_topic, return_counts=True)
        for u, c in zip(unique, counts):
            topic_counts[int(u)] = topic_counts.get(int(u), 0) + int(c)

        out_path = os.path.join(out_dir, f"chunk_{chunk_idx:05d}.parquet")
        keep_cols = [c for c in ["commentId", "channelId", "videoId", "authorId",
                                 "textOriginal", "text_clean", "parentCommentId",
                                 "likeCount", "publishedAt", "updatedAt", "topic"] if c in chunk.columns]
        chunk[keep_cols].to_parquet(out_path, index=False)
        chunk_idx += 1

    print("All chunks processed and saved.")
    return topic_counts, chunk_idx

def stratified_sample_from_chunks(chunks_dir, topic_counts, total_docs, topic_keywords,
                                  final_n=FINAL_SAMPLE_SIZE, random_state=RANDOM_STATE):
    topics = sorted(topic_counts.keys())
    proportions = {t: topic_counts[t] / total_docs for t in topics}
    target_raw = {t: proportions[t] * final_n for t in topics}
    target = {t: int(math.floor(target_raw[t])) for t in topics}
    deficit = final_n - sum(target.values())
    if deficit > 0:
        remainders = sorted(((t, target_raw[t] - target[t]) for t in topics), key=lambda x: x[1], reverse=True)
        for i in range(deficit):
            target[remainders[i % len(remainders)][0]] += 1

    print("Sampling targets per topic computed.")

    rng = np.random.RandomState(random_state)
    buffers = {t: [] for t in topics}
    filled = {t: 0 for t in topics}

    chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.parquet")))
    for pf in tqdm(chunk_files, desc="Sampling from chunks"):
        df = pd.read_parquet(pf)
        for t in topics:
            need = target[t] - filled[t]
            if need <= 0:
                continue
            sub = df.loc[df["topic"] == t]
            if len(sub) == 0:
                continue
            take = sub.sample(n=min(need, len(sub)), random_state=rng)
            buffers[t].append(take)
            filled[t] += len(take)
        if sum(filled.values()) >= final_n:
            break

    parts = []
    for t in topics:
        if buffers[t]:
            df_topic = pd.concat(buffers[t], ignore_index=True)
            df_topic["category"] = ", ".join(topic_keywords[t][:5])  # category = top 5 keywords
            parts.append(df_topic)

    sample_df = pd.concat(parts, ignore_index=True)
    if len(sample_df) > final_n:
        sample_df = sample_df.sample(n=final_n, random_state=rng).reset_index(drop=True)

    print(f"Final sampled rows: {len(sample_df)}")
    return sample_df

def main():
    sample_df = build_training_sample(INPUT_FILES, sample_size=SAMPLE_SIZE, chunksize=CHUNKSIZE)
    vectorizer, nmf, topic_keywords = fit_vectorizer_and_nmf(sample_df, n_topics=N_TOPICS, max_features=MAX_FEATURES)

    topic_counts, n_chunks = apply_model_and_write_chunks(INPUT_FILES, vectorizer, nmf, chunksize=CHUNKSIZE, out_dir=CLEANED_CHUNKS_DIR)
    total_docs = sum(topic_counts.values())
    print(f"Total docs processed: {total_docs}, across {n_chunks} chunks.")
    print("Topic counts (sample):", topic_counts)

    sample_df_final = stratified_sample_from_chunks(CLEANED_CHUNKS_DIR, topic_counts, total_docs, topic_keywords,
                                                    final_n=FINAL_SAMPLE_SIZE, random_state=RANDOM_STATE)
    sample_df_final.to_csv("comments_sample_100k_stratified.csv", index=False)
    print("Saved: comments_sample_100k_stratified.csv")

    try:
        import joblib
        joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
        joblib.dump(nmf, "nmf_model.joblib")
        print("Saved vectorizer and NMF model to disk (joblib).")
    except Exception as e:
        print("joblib save failed:", e)

if __name__ == "__main__":
    main()


Built training sample: 200000 rows (from ~200000 seen).
Fitting TF-IDF vectorizer on sample...




DTM sample shape: (200000, 20000)
Fitting NMF (30 topics)...
NMF fit complete.

--- Top words per topic ---
Topic 0: ❤ | muslim | gorgeous | hindu | ginger | amazing | bald | best | belle | stunning | 😢 | mashallah | ️ | perfect | africa
Topic 1: . | just | make | skin | people | women | video | face | " | really | … | time | beauty | want | great
Topic 2: ! | thank | gorgeous | amazing | stunning | omg | great | looks | absolutely | thanks | yes | good | did | oh | :
Topic 3: 😂 | 😅 | 🤣 | 😢 | lol | hai | said | bro | ho | funny | face | brother | bhi | ki | h
Topic 4: beautiful | way | looking | just | u | looks | make | 🤩 | ur | bald | skin | absolutely | woman | eyes | wig
Topic 5: ' | s | t | don | m | " | just | doesn | know | need | didn | people | think | way | ve
Topic 6: 😊 | thank | thanks | 😅 | happy | 😢 | bald | muslim | hindu | hi | good | ginger | ☺ | welcome | video
Topic 7: 😍 | 😘 | gorgeous | 🤩 | 💖 | looking | amazing | 👌 | ✨ | stunning | 🔥 | share | video | 💕 | 🤗
Topic 8

Processing chunks: 24it [05:01, 12.58s/it]


All chunks processed and saved.
Total docs processed: 4725012, across 24 chunks.
Topic counts (sample): {0: 405422, 1: 275926, 2: 210693, 3: 190966, 4: 74528, 5: 144227, 6: 105778, 7: 68563, 8: 241746, 9: 65956, 10: 211182, 11: 103846, 12: 50468, 13: 108922, 14: 93366, 15: 82219, 16: 99540, 17: 52157, 18: 277952, 19: 44697, 20: 101607, 21: 159245, 22: 292234, 23: 415064, 24: 67276, 25: 126540, 26: 342162, 27: 131093, 28: 37153, 29: 144484}
Sampling targets per topic computed.


Sampling from chunks:   0%|                                                                     | 0/24 [00:01<?, ?it/s]


Final sampled rows: 100000
Saved: comments_sample_100k_stratified.csv
Saved vectorizer and NMF model to disk (joblib).


In [2]:
NMF_sample = pd.read_csv("comments_sample_100k_stratified.csv")
NMF_sample.shape

(100000, 12)

In [3]:
NMF_sample.columns

Index(['commentId', 'channelId', 'videoId', 'authorId', 'textOriginal',
       'text_clean', 'parentCommentId', 'likeCount', 'publishedAt',
       'updatedAt', 'topic', 'category'],
      dtype='object')

In [7]:
NMF_sample['category'].unique()

array(['❤, muslim, gorgeous, hindu, ginger',
       '., just, make, skin, people',
       '!, thank, gorgeous, amazing, stunning', '😂, 😅, 🤣, 😢, lol',
       'beautiful, way, looking, just, u', "', s, t, don, m",
       '😊, thank, thanks, 😅, happy', '😍, 😘, gorgeous, 🤩, 💖',
       '?, did, u, song, use', '💜, 💙, bts, army, purple',
       '️, ♥, ☺, \u200d, thank', 'love, u, videos, pakistan, omg',
       'nice, video, 👌, looking, di', '@, \u200b, -, yes, thank',
       'look, night, good, u, day', 'india, best, 🇮, 🇳, pakistan',
       'pretty, ur, omg, u, way', 'wow, amazing, 🤩, 👌, gorgeous',
       '’, s, t, don, m', 'cute, omg, boy, looking, voice',
       '😮, 😢, 😅, omg, 😱', 'hair, curly, long, 😭, straight',
       'makeup, need, good, 💄, korean', ',, just, ", -, make',
       '🎉, 😢, 😅, happy, birthday', '🥰, thank, 😘, 🙏, share',
       'like, looks, just, u, looking', '👍, good, 👌, 🏻, looks',
       'indian, best, 🇳, 🇮, make', 'girl, happy, u, angry, boy'],
      dtype=object)