## COMBINING DATASET 

In [None]:
! pip install polars

In [7]:
import polars as pl
import pandas as pd

# List of CSV files
csv_files = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv"
]

# Read and combine into one dataframe
df_list = [pl.read_csv(file) for file in csv_files]
big_boy = pl.concat(df_list, how="vertical")

# Save the combined dataset
big_boy.write_csv("BIG_BOY.csv")

print("✅ Combined dataset saved as BIG_BOY.csv")
print("Shape of BIG BOY:", big_boy.shape)


✅ Combined dataset saved as BIG_BOY.csv
Shape of BIG BOY: (4725012, 10)


In [9]:
# Randomly choose 20k rows
sample_100k = big_boy.sample(n=100000)  # random_state makes it reproducible

print("✅ Random sample of 100,000 created")
print("Shape of sample:", sample_100k.shape)


✅ Random sample of 100,000 created
Shape of sample: (100000, 10)


In [10]:
sample_100k.head

<bound method DataFrame.head of shape: (100_000, 10)
┌─────────────────┬───────────┬───────────┬─────────┬───┬────────────────┬───────────┬────────────────┬────────────────┐
│ kind            ┆ commentId ┆ channelId ┆ videoId ┆ … ┆ parentCommentI ┆ likeCount ┆ publishedAt    ┆ updatedAt      │
│ ---             ┆ ---       ┆ ---       ┆ ---     ┆   ┆ d              ┆ ---       ┆ ---            ┆ ---            │
│ str             ┆ i64       ┆ i64       ┆ i64     ┆   ┆ ---            ┆ i64       ┆ str            ┆ str            │
│                 ┆           ┆           ┆         ┆   ┆ i64            ┆           ┆                ┆                │
╞═════════════════╪═══════════╪═══════════╪═════════╪═══╪════════════════╪═══════════╪════════════════╪════════════════╡
│ youtube#comment ┆ 4190194   ┆ 27537     ┆ 28697   ┆ … ┆ null           ┆ 0         ┆ 2023-12-08     ┆ 2023-12-08     │
│                 ┆           ┆           ┆         ┆   ┆                ┆           ┆ 02:11:52+00:0

pipline of whats about to go down

- combine dataset to bigboy ✅
- randomly select 100k rows for a sample1 and preprocess (remove emoji, anything not in english, any non english words) ✅
- run NMF on the sample1 to get categories
- go back to bigboy and create categories column and do keyword-matching to sort the data
- make a sample of 100k using stratified sampling
- pass it onto dalia

## TEXT CLEANING

In [None]:
! pip install emoji

In [None]:
! pip install langdetect

In [None]:
import polars as pl
import re
import string
import emoji
from langdetect import detect, DetectorFactory

# Fix seed for langdetect reproducibility
DetectorFactory.seed = 0

# Use your existing sample_100k Polars DataFrame
df = sample_100k

# Function: remove emojis
def remove_emojis(text: str) -> str:
    return emoji.replace_emoji(text, replace='')

# Function: clean text
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = remove_emojis(text)
    text = text.lower()
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

# Function: keep only English comments
def is_english(text: str) -> bool:
    try:
        return detect(text) == "en"
    except:
        return False

# Apply cleaning with map_elements
df = df.with_columns(
    df["textOriginal"].map_elements(clean_text, return_dtype=pl.Utf8).alias("clean_text")
)

# Filter English only
df = df.filter(
    df["clean_text"].map_elements(is_english, return_dtype=pl.Boolean)
)

print(df.select(["textOriginal", "clean_text"]).head())


## NMF MODEL TO GET TOPICS

In [None]:
import polars as pl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Assuming df already has the 'clean_text' column from preprocessing
texts = df["clean_text"].drop_nulls().tolist()

# Step 1: TF-IDF vectorization
vectorizer = TfidfVectorizer(
    max_df=0.95,       # ignore very common words
    min_df=2,          # ignore very rare words
    stop_words="english"  # remove English stopwords
)
tfidf = vectorizer.fit_transform(texts)

# Step 2: NMF Model
n_topics = 10
nmf_model = NMF(n_components=n_topics, random_state=42)
nmf_model.fit(tfidf)

# Step 3: Display topics
def display_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic {topic_idx+1}: {', '.join(top_features)}")

# Print topics
tfidf_feature_names = vectorizer.get_feature_names_out()
display_topics(nmf_model, tfidf_feature_names, n_top_words=10)


## CREATING CATEGORIES COLUMN IN BIGBOY

In [None]:
#that we got from NMF model
categories = {
    "appearance_compliments": ["beautiful", "wow", "looking", "night", "absolutely", "bald", "naturally"],
    "gratitude_milestones": ["thank", "share", "dear", "aww", "100million", "hit", "watching", "lets"],
    "hair": ["hair", "curly", "long", "straight", "cut", "black", "style", "short", "wavy"],
    "makeup_beauty": ["makeup", "wear", "indian", "best", "gorgeous", "nice", "wearing"],
    "love_india_music": ["love", "india", "videos", "omg", "natural", "song", "ginger"],
    "looks_wigs": ["pretty", "omg", "ur", "wig", "looks", "really", "wow"],
    "general_compliments": ["look", "good", "night", "gorgeous", "amazing", "better", "great", "day"],
    "thanks_tips": ["thanks", "share", "dear", "sharing", "tips", "watching"],
    "opinions_casual": ["like", "don", "just", "make", "way", "people", "know"],
    "cuteness_voice": ["cute", "wow", "voice", "boy", "girl", "soo"]
}


In [None]:
import polars as pl
import re

# Example: loading big dataset (replace with your real file)
df = big_boy  # contains column 'textOriginal'

# Function: match comment to a category
def categorize_comment(text, categories):
    text = str(text).lower()
    for cat, keywords in categories.items():
        # simple keyword match
        if any(re.search(rf"\b{kw}\b", text) for kw in keywords):
            return cat
    return "other"  # if no keyword matches

# Apply to dataset
df["category"] = df["textOriginal"].apply(lambda x: categorize_comment(x, categories))

# Check distribution
print(df["category"].value_counts())


In [None]:
! pip install flashtext

In [None]:
from flashtext import KeywordProcessor

# Build processor
keyword_processor = KeywordProcessor()
for cat, keywords in categories.items():
    for kw in keywords:
        keyword_processor.add_keyword(kw, cat)

# Match function
def fast_categorize(text):
    matches = keyword_processor.extract_keywords(str(text).lower())
    return matches[0] if matches else "other"

# Apply fast
df["category"] = df["textOriginal"].apply(fast_categorize)


## STRATIFIED RANDOM SAMPLING TO GET SAMPLE OF 100K

In [None]:
import polars as pl

# Load your big dataset (already has "category" column)

# Desired sample size
sample_size = 100_000  

# Compute sampling fraction per category
fractions = sample_size / len(big_boy)

# Stratified sample
sample4model = big_boy.groupby("category", group_keys=False).apply(
    lambda x: x.sample(frac=fractions, random_state=42)
)

# If rounding causes slight mismatch, fix final size
sample4model  = sample4model .sample(n=sample_size, random_state=42)

# Save or inspect
sample4model .to_csv("sample_100k_stratified.csv", index=False)
print(sample4model ["category"].value_counts(normalize=True))


In [None]:
sample = pl.read_csv("sample_100k_stratified.csv")
sample.shape

## GETTING VALIDATION SET OF 200

In [None]:
# From the big dataset (with 'category')
val_size = 200
fractions = val_size / len(df)

# Stratified sample
df_val = df.groupby("category", group_keys=False).apply(
    lambda x: x.sample(frac=fractions, random_state=123)
)

# Fix exact size (200 rows)
df_val = df_val.sample(n=val_size, random_state=123).reset_index(drop=True)

# Save to CSV for manual labeling
df_val.to_csv("validation_sample_200.csv", index=False)

print(df_val["category"].value_counts())


## WOOHOOO READY FOR MODEL