In [1]:
import re
import string
import nltk
import regex as re 
import pandas as pd 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

In [2]:
file_path = r"C:\Users\Sujal Karmakar\Desktop\Desktop\Data Analyst\Python\python_data_analytics_project\Theme Finder Using Caption (NLP)\Data\leanbeast_analysis_ready.csv"

df = pd.read_csv(file_path)

#### 1) Basic Cleaning (lower case, remover numbers, emoji, etc)

In [3]:
df_caption = df["caption_text"]

In [4]:
# Function to clean each caption: lowercase, remove punctuation, then remove emojis
def clean_caption(text):
    if pd.isna(text):
        return ""
    text = text.lower()  # Lowercase

    text = re.sub(r"\p{P}+", "", text)

    # Remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # Remove emojis
    emoji_pattern = re.compile(
        "["
        "\U0001f600-\U0001f64f"  # emoticons
        "\U0001f300-\U0001f5ff"  # symbols & pictographs
        "\U0001f680-\U0001f6ff"  # transport & map symbols
        "\U0001f1e0-\U0001f1ff"  # flags (iOS)
        "\U00002700-\U000027bf"  # Dingbats
        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
        "\U00002600-\U000026ff"  # Misc symbols
        "\U00002b00-\U00002bff"  # Misc symbols and arrows
        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
        "\ufe0f"  # Variation Selector-16 (emoji style)
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub(r"", text)
    return text


# Apply to the caption column
df_caption_cleaned = df_caption.apply(clean_caption)

In [5]:
df_caption_cleaned.sample(15)

947                                        iamsrk ka diet
309                                    mere ghar ka khana
1032    vacation look in   7 days 7 different outfits ...
299     comment snacks and i will send you 5 healthy s...
349                                  best fatloss machine
250     taking an ice bath cold water immersion offers...
1031    gym membership if you agree do comment like an...
1024                                          hair hairs 
1440    manish bhai lost 13 kgs in 3 months and is now...
1073    well done darshilsangani my client from canada...
351                                                raisin
1013                           hydration is imp stay safe
407     comment berry  to get original blueberries buy...
1182    if you are skinny fat do this  abs fitness gym...
1077                     fake supplements exposed zeenews
Name: caption_text, dtype: object

#### 2) Tokenization (removing stop words)

In [6]:
# import nltk

# # Define a custom download directory
# custom_nltk_path = r'C:\Users\Sujal Karmakar\.conda\envs\DS\nltk_data'

# # Download resources to the custom path
## nltk.download('punkt', download_dir=custom_nltk_path)
# nltk.download('punkt_tab', download_dir=custom_nltk_path)
# nltk.download('stopwords', download_dir=custom_nltk_path)

# # Manually add the path to NLTK's search locations
# nltk.data.path.append(custom_nltk_path)

# i had to explicitly download these by specifying files and also punkt was not working for me nltk was demanding punkt_tab so i did that 


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Set of English stopwords for filtering
stop_words = set(stopwords.words("english"))


def preprocess_caption(text):
    # Handle non-string inputs safely
    if not isinstance(text, str):
        return []

    # Step 1: Sentence tokenize the text
    sentences = sent_tokenize(text)

    # Step 2: Word tokenize each sentence
    tokens = [word_tokenize(sentence) for sentence in sentences]

    # Step 3: Flatten the list of token lists into a single list
    flat_tokens = [token for sublist in tokens for token in sublist]

    # Step 4: Remove stopwords (case-insensitive)
    filtered_tokens = [w for w in flat_tokens if w.lower() not in stop_words]

    return filtered_tokens


df_caption_cleaned_tokens = df_caption_cleaned.apply(preprocess_caption)

# Show first few results
print(df_caption_cleaned_tokens.head())

0    [well, done, jaibajaj0786, lost, 39kgs, withou...
1    [transformation, alert, keen, interested, paid...
2    [shredded, mode, manavkansagra1111, guidance, ...
3    [comment, atta, send, link, healthy, atta, mes...
4    [dinner, date, couple, love, couplegoals, wedd...
Name: caption_text, dtype: object


In [8]:
df_caption_combined = pd.DataFrame({
    "cleaned_caption": df_caption_cleaned,
    "tokens": df_caption_cleaned_tokens
})
df_caption_cleaned

0       well done jaibajaj0786 lost 39kgs without coun...
1       transformation alert   if he keen do it  why n...
2       shredded mode on manavkansagra1111 under my gu...
3       comment atta and i will send you the link of h...
4       dinner date   couple love couplegoals wedding ...
                              ...                        
1546    lockdown transformation lost 15 kgs in 70 days...
1547    shoot mode  modelifefitness gym workout motiva...
1548    manage your life  office and work  paid online...
1549    throwback to one my client who reduced 38 kgs ...
1550    taj mahal is beautiful today gymlife training ...
Name: caption_text, Length: 1551, dtype: object

#### 3) Creating dictionaries and corpus for LDA (Linear discriminant analysis)

In [9]:
import gensim
from gensim import corpora

# Convert the Series of lists into a list of lists
texts = df_caption_cleaned_tokens.tolist()

# Create Dictionary: maps each word to a unique id
dictionary = corpora.Dictionary(texts)

# Filter extremes to remove very rare and very common words (optional but recommended)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # keeps words in at least 5 docs but less than 50% of all docs

# Create Corpus: list of bags of words for each document
corpus = [dictionary.doc2bow(text) for text in texts]

# Print some info to confirm
print(f"Number of unique tokens: {len(dictionary)}")
print(f"Number of documents: {len(corpus)}")
print(f"Sample doc (bow format): {corpus[0]}")


Number of unique tokens: 677
Number of documents: 1551
Sample doc (bow format): [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
