## COMBINING DATASET 

In [6]:
import pandas as pd

# List of CSV files
csv_files = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv"
]

# Read and combine into one dataframe
df_list = [pd.read_csv(file) for file in csv_files]
big_boy = pd.concat(df_list, ignore_index=True)

# Save the combined dataset
big_boy.to_csv("BIG_BOY.csv", index=False)

print("✅ Combined dataset saved as BIG_BOY.csv")
print("Shape of BIG BOY:", big_boy.shape)


✅ Combined dataset saved as BIG_BOY.csv
Shape of BIG BOY: (4725012, 10)


In [7]:
# Randomly choose 20k rows
sample_100k = big_boy.sample(n=100000, random_state=42)  # random_state makes it reproducible

print("✅ Random sample of 100,000 created")
print("Shape of sample:", sample_100k.shape)


✅ Random sample of 100,000 created
Shape of sample: (100000, 10)


In [8]:
sample_100k.head

<bound method NDFrame.head of                     kind  commentId  channelId  videoId  authorId  \
3956772  youtube#comment    1999899      33069    36229   2377260   
3284632  youtube#comment    1235288      37371     5214   2495217   
1185839  youtube#comment    1120879      18073    76480   2689034   
238094   youtube#comment    3806774      46757    78060   3531698   
1107951  youtube#comment    1874941      33445    91263    968927   
...                  ...        ...        ...      ...       ...   
811650   youtube#comment    2788176      41246    87323   2446840   
32900    youtube#comment     668979        805    18615   3561486   
2355010  youtube#comment    4642725      33445    91263   3547007   
4197186  youtube#comment     365684      11679      743   1946253   
1421695  youtube#comment    2350664      16876    18101   3254686   

                                              textOriginal  parentCommentId  \
3956772  💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙💙...    

pipline of whats about to go down

- combine dataset to bigboy ✅
- randomly select 100k rows for a sample1 and preprocess (remove emoji, anything not in english, any non english words) ✅
- run NMF on the sample1 to get categories
- go back to bigboy and create categories column and do keyword-matching to sort the data
- make a sample of 100k using stratified sampling
- pass it onto dalia

## TEXT CLEANING

In [9]:
! pip install emoji



In [10]:
! pip install langdetect



In [11]:
import pandas as pd
import re
import string
import emoji
from langdetect import detect, DetectorFactory

# Fix seed for langdetect reproducibility
DetectorFactory.seed = 0

# Load CSV
df = sample_100k

# Function: remove emojis
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# Function: clean text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = remove_emojis(text)
    text = text.lower()
    # Remove punctuation & symbols
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Function: keep only English comments
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False

# Apply cleaning
df["clean_text"] = df["textOriginal"].apply(clean_text)

# Filter English only
df = df[df["clean_text"].apply(is_english)]

# Reset index
df = df.reset_index(drop=True)

print(df[["textOriginal", "clean_text"]].head())


                                        textOriginal  \
0  Reasons why heavily attractive people (you💗) d...   
1             "hmm what's next" \n* hair falls out *   
2                          You are so pretty gurl ☺️   
3                                    So beautiful ❤❤   
4  It doesn't seem like Micheal what's to admit t...   

                                          clean_text  
0  reasons why heavily attractive people you don’...  
1                     hmm what s next hair falls out  
2                             you are so pretty gurl  
3                                       so beautiful  
4  it doesn t seem like micheal what s to admit t...  


## NMF MODEL TO GET TOPICS

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Assuming df already has the 'clean_text' column from preprocessing
texts = df["clean_text"].dropna().tolist()

# Step 1: TF-IDF vectorization
vectorizer = TfidfVectorizer(
    max_df=0.95,       # ignore very common words
    min_df=2,          # ignore very rare words
    stop_words="english"  # remove English stopwords
)
tfidf = vectorizer.fit_transform(texts)

# Step 2: NMF Model
n_topics = 10
nmf_model = NMF(n_components=n_topics, random_state=42)
nmf_model.fit(tfidf)

# Step 3: Display topics
def display_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic {topic_idx+1}: {', '.join(top_features)}")

# Print topics
tfidf_feature_names = vectorizer.get_feature_names_out()
display_topics(nmf_model, tfidf_feature_names, n_top_words=10)


Topic 1: beautiful, way, wow, looking, night, absolutely, bald, naturally, matter, india
Topic 2: thank, share, dear, video, aww, watching, lets, lovvvvvve, 100million, hit
Topic 3: hair, curly, long, straight, cut, black, natural, style, short, wavy
Topic 4: makeup, wear, better, need, indian, best, gorgeous, nice, india, wearing
Topic 5: love, india, videos, omg, natural, video, ginger, song, ur, best
Topic 6: pretty, omg, ur, really, way, girl, wow, looks, wig, looking
Topic 7: look, good, night, gorgeous, amazing, better, great, bald, does, day
Topic 8: thanks, video, share, dear, sharing, watching, lot, wow, great, tips
Topic 9: like, don, just, looks, girl, make, way, people, know, good
Topic 10: cute, wow, looking, omg, looks, really, voice, boy, girl, soo


## CREATING CATEGORIES COLUMN IN BIGBOY

In [13]:
#that we got from NMF model
categories = {
    "appearance_compliments": ["beautiful", "wow", "looking", "night", "absolutely", "bald", "naturally"],
    "gratitude_milestones": ["thank", "share", "dear", "aww", "100million", "hit", "watching", "lets"],
    "hair": ["hair", "curly", "long", "straight", "cut", "black", "style", "short", "wavy"],
    "makeup_beauty": ["makeup", "wear", "indian", "best", "gorgeous", "nice", "wearing"],
    "love_india_music": ["love", "india", "videos", "omg", "natural", "song", "ginger"],
    "looks_wigs": ["pretty", "omg", "ur", "wig", "looks", "really", "wow"],
    "general_compliments": ["look", "good", "night", "gorgeous", "amazing", "better", "great", "day"],
    "thanks_tips": ["thanks", "share", "dear", "sharing", "tips", "watching"],
    "opinions_casual": ["like", "don", "just", "make", "way", "people", "know"],
    "cuteness_voice": ["cute", "wow", "voice", "boy", "girl", "soo"]
}


In [14]:
import pandas as pd
import re

# Example: loading big dataset (replace with your real file)
df = big_boy  # contains column 'textOriginal'

# Function: match comment to a category
def categorize_comment(text, categories):
    text = str(text).lower()
    for cat, keywords in categories.items():
        # simple keyword match
        if any(re.search(rf"\b{kw}\b", text) for kw in keywords):
            return cat
    return "other"  # if no keyword matches

# Apply to dataset
df["category"] = df["textOriginal"].apply(lambda x: categorize_comment(x, categories))

# Check distribution
print(df["category"].value_counts())


category
other                     2505390
appearance_compliments     459707
makeup_beauty              361002
hair                       270684
love_india_music           260189
opinions_casual            256926
general_compliments        188768
looks_wigs                 180037
gratitude_milestones       115690
cuteness_voice             101559
thanks_tips                 25060
Name: count, dtype: int64


In [15]:
! pip install flashtext



In [16]:
from flashtext import KeywordProcessor

# Build processor
keyword_processor = KeywordProcessor()
for cat, keywords in categories.items():
    for kw in keywords:
        keyword_processor.add_keyword(kw, cat)

# Match function
def fast_categorize(text):
    matches = keyword_processor.extract_keywords(str(text).lower())
    return matches[0] if matches else "other"

# Apply fast
df["category"] = df["textOriginal"].apply(fast_categorize)


## STRATIFIED RANDOM SAMPLING TO GET SAMPLE OF 100K

In [17]:
import pandas as pd

# Load your big dataset (already has "category" column)

# Desired sample size
sample_size = 100_000  

# Compute sampling fraction per category
fractions = sample_size / len(big_boy)

# Stratified sample
sample4model = big_boy.groupby("category", group_keys=False).apply(
    lambda x: x.sample(frac=fractions, random_state=42)
)

# If rounding causes slight mismatch, fix final size
sample4model  = sample4model .sample(n=sample_size, random_state=42)

# Save or inspect
sample4model .to_csv("sample_100k_stratified.csv", index=False)
print(sample4model ["category"].value_counts(normalize=True))


  sample4model = big_boy.groupby("category", group_keys=False).apply(


category
other                     0.53010
opinions_casual           0.08861
general_compliments       0.06315
makeup_beauty             0.05317
love_india_music          0.05273
looks_wigs                0.05133
cuteness_voice            0.04781
appearance_compliments    0.04627
hair                      0.04026
gratitude_milestones      0.01535
thanks_tips               0.01122
Name: proportion, dtype: float64


In [18]:
sample = pd.read_csv("sample_100k_stratified.csv")
sample.shape

(100000, 11)

## GETTING VALIDATION SET OF 200

In [None]:
# From the big dataset (with 'category')
val_size = 200
fractions = val_size / len(df)

# Stratified sample
df_val = df.groupby("category", group_keys=False).apply(
    lambda x: x.sample(frac=fractions, random_state=123)
)

# Fix exact size (200 rows)
df_val = df_val.sample(n=val_size, random_state=123).reset_index(drop=True)

# Save to CSV for manual labeling
df_val.to_csv("validation_sample_200.csv", index=False)

print(df_val["category"].value_counts())


## WOOHOOO READY FOR MODEL